async def check_id(self, current_user_id, aphia_id) -> str:
    """
        Check the given aphia_id, adjust the DB if needed.
    """
    # Security check
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    lineage = await WoRMSFinder.aphia_classif_by_id(aphia_id)
    # Nested struct, e.g.:
    # {'AphiaID': 1, 'rank': 'Superdomain', 'scientificname': 'Biota', 'child':
    #  {'AphiaID': 3, 'rank': 'Kingdom', 'scientificname': 'Plantae', 'child':
    #   {'AphiaID': 368663, 'rank': 'Subkingdom', 'scientificname': 'Viridiplantae', 'child':
    #    {'AphiaID': 536191, 'rank': 'Infrakingdom', 'scientificname': 'Streptophyta', 'child':
    #     ...
    #    }}}}
    prev_level = None
    while lineage is not None:
        aphia_id_for_level = lineage["AphiaID"]
        db_entry = self.session.query(WoRMS).get(aphia_id_for_level)
        if db_entry is None:
            assert prev_level is not None
            prev_level.all_fetched = False
            self.session.commit()
            return "%d was not found, so parent %d was marked as incomplete" % \
                   (aphia_id_for_level, prev_level.aphia_id)
        lineage = lineage["child"]
        prev_level = db_entry
    return "All OK"
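# A minimal sketch (not part of the service) illustrating how the nested WoRMS
# lineage documented above can be walked root-first; 'iter_lineage' is a
# hypothetical helper, only the 'AphiaID'/'child' keys come from check_id():
def iter_lineage(lineage):
    """ Yield each level of a nested WoRMS classification, root first. """
    while lineage is not None:
        yield lineage
        lineage = lineage.get("child")

# e.g. [a_level["AphiaID"] for a_level in iter_lineage(lineage)]
#      -> [1, 3, 368663, 536191, ...] for the example above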
def delete(self, current_user_id: UserIDT, coll_id: CollectionIDT) -> int:
    # TODO, for now only admins
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    CollectionBO.delete(self.session, coll_id)
    self.session.commit()
    return 0
def query(self, current_user_id: UserIDT, coll_id: CollectionIDT) -> Optional[CollectionBO]:
    # TODO, for now only admins
    _user = RightsBO.user_has_role(self.session, current_user_id, Role.APP_ADMINISTRATOR)
    ret = CollectionBO.get_one(self.session, coll_id)
    return ret
async def db_refresh(self, current_user_id: int):
    """
        Refresh the local taxonomy DB.
    """
    # Security check
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    await self._do_refresh()
def search(self, current_user_id: UserIDT, title: str) -> List[CollectionBO]:
    # TODO, for now only admins
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    qry = self.ro_session.query(Collection).filter(Collection.title.ilike(title))
    ret = [CollectionBO(a_rec).enrich() for a_rec in qry.all()]
    return ret
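# Note: Collection.title.ilike(title) is a case-insensitive SQL LIKE, so the
# caller controls the wildcards. A hypothetical call (service instance and
# user ID made up for illustration):
#     coll_service.search(current_user_id=1, title="%plankton%")
# matches any collection whose title contains "plankton", in any case.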
def create(self, current_user_id: UserIDT, req: CreateCollectionReq) -> Union[CollectionIDT, str]:
    """
        Create a collection.
    """
    # TODO, for now only admins
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    coll_id = CollectionBO.create(self.session, req.title, req.project_ids)
    return coll_id
def do_digests(self, current_user_id: UserIDT, prj_id: Optional[ProjectIDT], max_digests: int) -> str:
    """
        Pick some images without checksum and compute it.
    """
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    qry: Query = self.ro_session.query(Image.file_name)
    if prj_id is not None:
        # Find missing images in a project
        qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(Project)
        qry = qry.filter(Project.projid == prj_id)
    else:
        # Find missing images globally
        pass
    qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
    qry = qry.filter(ImageFile.path.is_(None))
    qry = qry.limit(max_digests)
    cnt = 0
    with CodeTimer("Files without md5, query '%s':" % str(qry), logger):
        files_without_md5 = [file_name for file_name, in qry.all()]
    for an_img_file_name in files_without_md5:
        cnt += 1
        img_file = ImageFile(path=an_img_file_name)
        self.session.add(img_file)
        self._md5_on_record(img_file)
    self.session.commit()
    # If the budget (max_digests) was not used up on missing md5s, spend the rest
    # on image file lines still in unknown state
    left_for_unknown = max_digests - cnt
    if left_for_unknown > 0:
        # Also do unknown image file lines
        miss_qry: Query = self.session.query(ImageFile)
        miss_qry = miss_qry.filter(and_(ImageFile.state == ImageFileStateEnum.UNKNOWN.value,
                                        ImageFile.digest_type == '?'))
        if prj_id is not None:
            # Find unknown images in a project
            miss_qry = miss_qry.outerjoin(Image, Image.file_name == ImageFile.path)
            miss_qry = miss_qry.join(ObjectHeader).join(Acquisition).join(Sample).join(Project)
            miss_qry = miss_qry.filter(Project.projid == prj_id)
        # On purpose, no "order by" clause. Results are random, but sorting takes a while on lots of images.
        miss_qry = miss_qry.limit(left_for_unknown)
        with CodeTimer("Files with unknown state, query '%s':" % str(miss_qry), logger):
            missing_ones = [an_img_file for an_img_file in miss_qry.all()]
        for a_missing in missing_ones:
            cnt += 1
            self._md5_on_record(a_missing)
        self.session.commit()
    return "Digest for %d images done." % cnt
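# _md5_on_record is called above but not shown in this section. A minimal
# sketch of what it plausibly does, inferred from the inline variant of
# do_digests further below (the vault layout, the enum values and the
# digest_type '5' for MD5 all come from that code; the method body itself
# is an assumption):
def _md5_on_record(self, img_file: ImageFile):
    """ Compute and store the MD5 digest for one ImageFile record. """
    img_file_path = self.vault.sub_path(img_file.path)
    try:
        img_file.digest = self.compute_md5(img_file_path)
        img_file.digest_type = '5'  # '5' denotes MD5
        img_file.state = ImageFileStateEnum.OK.value
    except FileNotFoundError:
        img_file.state = ImageFileStateEnum.MISSING.value
    except Exception as e:
        logger.exception(e)
        img_file.state = ImageFileStateEnum.ERROR.value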
def run(self, current_user_id: int) -> EMODnetExportRsp:
    """
        Initial run, basically just create the job.
    """
    # TODO, for now only admins
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    # OK, go background straight away
    self.create_job(self.JOB_TYPE, current_user_id)
    ret = EMODnetExportRsp(job_id=self.job_id)
    return ret
def do_run(self, current_user_id: int) -> EMODnetExportRsp:
    # Security check
    # TODO, for now only admins
    _user = RightsBO.user_has_role(self.session, current_user_id, Role.APP_ADMINISTRATOR)
    # Adjust the task
    self.set_task_params(current_user_id, self.DWC_ZIP_NAME)
    # Do the job
    logger.info("------------ starting --------------")
    # Update DB statistics
    self.update_db_stats()
    ret = EMODnetExportRsp()
    # Build metadata with what comes from the collection
    meta = self.build_meta()
    if meta is None:
        # If no meta could be built, there must be recorded reasons
        assert len(self.errors) > 0
        ret.errors = self.errors
        ret.warnings = self.warnings
        return ret
    # Create a container
    arch = DwC_Archive(DatasetMetadata(meta),
                       self.temp_for_task.base_dir_for(self.task_id) / self.DWC_ZIP_NAME)
    # Add data from DB
    # OK because https://edmo.seadatanet.org/v_edmo/browse_step.asp?step=003IMEV_0021
    # But TODO: hardcoded, implement https://github.com/oceanomics/ecotaxa_dev/issues/514
    self.institution_code = "IMEV"
    self.add_events(arch)
    # Individual issues only produce warnings, but no content at all is an error
    if arch.events.count() == 0 and arch.occurences.count() == 0 and arch.emofs.count() == 0:
        self.errors.append("No content produced."
                           " See previous warnings or check the presence of samples in the projects")
    else:
        # Produce the zip
        arch.build()
        self.log_stats()
    ret.errors = self.errors
    ret.warnings = self.warnings
    if len(ret.errors) == 0:
        ret.task_id = self.task_id
    return ret
def do_digests(self, current_user_id: UserIDT, prj_id: Optional[ProjectIDT], max_digests: int) -> str:
    """
        Pick some images without checksum and compute it.
    """
    _user = RightsBO.user_has_role(self.session, current_user_id, Role.APP_ADMINISTRATOR)
    qry: Query = self.session.query(Image, ImageFile)
    if prj_id is not None:
        qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(Project)
    qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
    qry = qry.filter(ImageFile.path.is_(None))
    if prj_id is not None:
        qry = qry.filter(Project.projid == prj_id)
    qry = qry.limit(max_digests)
    cnt = 0
    for an_img, img_file in qry.all():
        cnt += 1
        if img_file is None:
            # No image_file line, add it
            img_file = ImageFile(path=an_img.file_name)
            self.session.add(img_file)
        img_file_path = self.vault.sub_path(an_img.file_name)
        try:
            md5 = self.compute_md5(img_file_path)
            img_file.digest = md5
            img_file.digest_type = '5'
            img_file.state = ImageFileStateEnum.OK.value
        except FileNotFoundError:
            img_file.state = ImageFileStateEnum.MISSING.value
        except Exception as e:
            logger.exception(e)
            img_file.state = ImageFileStateEnum.ERROR.value
    self.session.commit()
    return "Digest for %d images done." % cnt
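# compute_md5 is called above but not defined in this section. A minimal,
# self-contained sketch of a chunked MD5 helper it could correspond to; only
# the name and call site come from the code, the body (and the assumption that
# the digest is stored as a hex string) is illustrative:
import hashlib

def compute_md5(self, img_file_path) -> str:
    """ Hash the file at 'img_file_path' in 64KB chunks, to avoid loading it whole. """
    md5 = hashlib.md5()
    with open(img_file_path, "rb") as strm:
        for chunk in iter(lambda: strm.read(64 * 1024), b""):
            md5.update(chunk)
    return md5.hexdigest()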
def do_cleanup_dup_same_obj(self, current_user_id: UserIDT, prj_id: ProjectIDT, max_deletes: int) -> str:
    """
        Simplest duplication pattern. Inside the same object there are several identical images.
    """
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    orig_img = aliased(Image, name="orig")
    orig_file = aliased(ImageFile, name="orig_file")
    qry: Query = self.session.query(orig_img.file_name, orig_img.imgid, Image, ImageFile)
    # Select what to delete
    qry = qry.join(ObjectHeader, ObjectHeader.objid == Image.objid).join(Acquisition).join(Sample).join(Project)
    # We consider that the original image is the oldest one, so the others have a higher ID
    qry = qry.join(orig_img,
                   and_(orig_img.objid == Image.objid,
                        orig_img.orig_file_name == Image.orig_file_name,
                        orig_img.width == Image.width,
                        orig_img.height == Image.height,
                        orig_img.imgid < Image.imgid))
    # Both must have a checksum, in the same (sane) state
    qry = qry.join(ImageFile, and_(ImageFile.path == Image.file_name,
                                   ImageFile.state == ImageFileStateEnum.OK.value))
    qry = qry.join(orig_file, and_(orig_file.path == orig_img.file_name,
                                   orig_file.state == ImageFileStateEnum.OK.value))
    # ...and the same digest value, of course
    qry = qry.filter(and_(ImageFile.digest_type == orig_file.digest_type,
                          ImageFile.digest == orig_file.digest))
    qry = qry.filter(Project.projid == prj_id)
    qry = qry.order_by(Image.objid, orig_img.imgid, Image.imgid)
    qry = qry.limit(max_deletes)
    with CodeTimer("Dups same objs inside %d, query '%s':" % (prj_id, str(qry)), logger):
        to_do = [(orig_file_name, orig_img_id, an_image, an_image_file)
                 for orig_file_name, orig_img_id, an_image, an_image_file in qry.all()]
    ko_not_same = 0
    ko_except = 0
    # Prepare & start a remover thread that will run in parallel with DB queries
    remover = VaultRemover(self.link_src, logger).do_start()
    filecmp.clear_cache()
    deleted_imgids: Set[int] = set()
    for orig_file_name, orig_img_id, an_image, an_image_file in to_do:
        # The query returns multiple rows if there are more than 2 duplicates
        if orig_img_id in deleted_imgids:
            continue
        # Even if MD5s match, be paranoid and compare files
        orig_path = self.vault.sub_path(orig_file_name)
        dup_path = self.vault.sub_path(an_image.file_name)
        assert orig_path != dup_path
        orig_exists = exists(orig_path)
        dup_exists = exists(dup_path)
        if orig_exists:
            if dup_exists:
                try:
                    same = filecmp.cmp(orig_path, dup_path, False)
                except Exception as exc:
                    logger.info("Exception while comparing orig:%s and dup:%s: %s",
                                orig_path, dup_path, str(exc))
                    ko_except += 1
                    continue
                if not same:
                    ko_not_same += 1
                    continue
            else:
                # Duplicate is gone already
                pass
        else:
            # DB record of physical file is wrong
            # TODO
            continue
        # Do the cleanup
        deleted_imgids.add(an_image.imgid)
        if dup_exists:
            remover.add_files([an_image.file_name])
        self.session.delete(an_image)
        self.session.delete(an_image_file)
    # Wait for the files to be handled
    self.session.commit()
    remover.wait_for_done()
    return ("Dup remover for %d dup images done, but %d comparison exceptions and %d failed file comparisons"
            % (len(deleted_imgids), ko_except, ko_not_same))
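# For clarity, the VaultRemover lifecycle as used in do_cleanup_dup_same_obj
# above (the call sequence comes from that code; this standalone recap is
# illustrative only):
#     remover = VaultRemover(self.link_src, logger).do_start()  # background worker
#     remover.add_files([an_image.file_name])  # queue physical deletions during DB work
#     self.session.commit()                    # commit DB deletions first
#     remover.wait_for_done()                  # then block until the files are gone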