Example no. 1
 async def check_id(self, current_user_id, aphia_id) -> str:
     """
         Check the given aphia_id, adjust the DB if needed.
     """
     # Security check
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     lineage = await WoRMSFinder.aphia_classif_by_id(aphia_id)
     # Nested struct, e.g. : {'AphiaID': 1, 'rank': 'Superdomain', 'scientificname': 'Biota', 'child':
     # {'AphiaID': 3, 'rank': 'Kingdom', 'scientificname': 'Plantae', 'child':
     # {'AphiaID': 368663, 'rank': 'Subkingdom', 'scientificname': 'Viridiplantae', 'child':
     # {'AphiaID': 536191, 'rank': 'Infrakingdom', 'scientificname': 'Streptophyta', 'child':
     # ...
     # }}}}}}}}}
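     # Walk the lineage from the root downwards, checking each level exists locally; stop at the first missing one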
     prev_level = None
     while lineage is not None:
         aphia_id_for_level = lineage["AphiaID"]
         db_entry = self.session.query(WoRMS).get(aphia_id_for_level)
         if db_entry is None:
             assert prev_level is not None
             prev_level.all_fetched = False
             self.session.commit()
             return "%d was not found, so parent %d was marked as incomplete" % \
                    (aphia_id_for_level, prev_level.aphia_id)
         lineage = lineage["child"]
         prev_level = db_entry
     return "All OK"
Example no. 2
 def delete(self, current_user_id: UserIDT, coll_id: CollectionIDT) -> int:
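     """
         Delete the collection with the given ID.
     """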
     # TODO, for now only admins
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     CollectionBO.delete(self.session, coll_id)
     self.session.commit()
     return 0
Example no. 3
 def query(self, current_user_id: UserIDT,
           coll_id: CollectionIDT) -> Optional[CollectionBO]:
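     """
         Return the collection with the given ID, or None if there is no such collection.
     """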
     # TODO, for now only admins
     _user = RightsBO.user_has_role(self.session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     ret = CollectionBO.get_one(self.session, coll_id)
     return ret
Example no. 4
 async def db_refresh(self, current_user_id: int):
     """
         Refresh the local taxonomy DB.
     """
     # Security check
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     await self._do_refresh()
Example no. 5
 def search(self, current_user_id: UserIDT,
            title: str) -> List[CollectionBO]:
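     """
         Return the collections whose title matches the given pattern (case-insensitive).
     """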
     # TODO, for now only admins
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     qry = self.ro_session.query(Collection).filter(
         Collection.title.ilike(title))
     ret = [CollectionBO(a_rec).enrich() for a_rec in qry.all()]
     return ret
Example no. 6
 def create(self, current_user_id: UserIDT,
            req: CreateCollectionReq) -> Union[CollectionIDT, str]:
     """
         Create a collection.
     """
     # TODO, for now only admins
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     coll_id = CollectionBO.create(self.session, req.title, req.project_ids)
     return coll_id
Example no. 7
 def do_digests(self, current_user_id: UserIDT,
                prj_id: Optional[ProjectIDT], max_digests: int) -> str:
     """
         Pick some images without a checksum and compute their MD5 digest.
     """
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     qry: Query = self.ro_session.query(Image.file_name)
     if prj_id is not None:
         # Find missing images in a project
         qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(
             Project)
         qry = qry.filter(Project.projid == prj_id)
     else:
         # Find missing images globally
         pass
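     # Anti-join: keep only the images which have no ImageFile row, i.e. no digest yet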
     qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
     qry = qry.filter(ImageFile.path.is_(None))
     qry = qry.limit(max_digests)
     cnt = 0
     with CodeTimer("Files without md5, query '%s':" % str(qry), logger):
         files_without_md5 = [file_name for file_name, in qry.all()]
     for an_img_file_name in files_without_md5:
         cnt += 1
         img_file = ImageFile(path=an_img_file_name)
         self.session.add(img_file)
         self._md5_on_record(img_file)
     self.session.commit()
     # We may still have budget left under max_digests after computing the missing MD5s
     left_for_unknown = max_digests - cnt
     if left_for_unknown > 0:
         # Also do unknown image file lines
         miss_qry: Query = self.session.query(ImageFile)
         miss_qry = miss_qry.filter(
             and_(ImageFile.state == ImageFileStateEnum.UNKNOWN.value,
                  ImageFile.digest_type == '?'))
         if prj_id is not None:
             # Find unknown images in a project
             miss_qry = miss_qry.outerjoin(
                 Image, Image.file_name == ImageFile.path)
             miss_qry = miss_qry.join(ObjectHeader).join(Acquisition).join(
                 Sample).join(Project)
             miss_qry = miss_qry.filter(Project.projid == prj_id)
         # On purpose, no "order by" clause: rows come back in arbitrary order, but sorting would take a while on lots of images
         miss_qry = miss_qry.limit(left_for_unknown)
         with CodeTimer(
                 "Files with unknown state, query '%s':" % str(miss_qry),
                 logger):
             missing_ones = [an_img_file for an_img_file in miss_qry.all()]
         for a_missing in missing_ones:
             cnt += 1
             self._md5_on_record(a_missing)
         self.session.commit()
     return "Digest for %d images done." % cnt
Example no. 8
 def run(self, current_user_id: int) -> EMODnetExportRsp:
     """
         Initial run, basically just create the job.
     """
     # TODO, for now only admins
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     # OK, hand over to a background job straight away
     self.create_job(self.JOB_TYPE, current_user_id)
     ret = EMODnetExportRsp(job_id=self.job_id)
     return ret
Example no. 9
 def do_run(self, current_user_id: int) -> EMODnetExportRsp:
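     """
         Do the real export work: update DB statistics, build the metadata, then fill and zip the DwC archive.
     """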
     # Security check
     # TODO, for now only admins
     _user = RightsBO.user_has_role(self.session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     # Adjust the task
     self.set_task_params(current_user_id, self.DWC_ZIP_NAME)
     # Do the job
     logger.info("------------ starting --------------")
     # Update DB statistics
     self.update_db_stats()
     ret = EMODnetExportRsp()
     # Build metadata with what comes from the collection
     meta = self.build_meta()
     if meta is None:
         # If metadata could not be built, there must be errors explaining why
         assert len(self.errors) > 0
         ret.errors = self.errors
         ret.warnings = self.warnings
         return ret
     # Create a container
     arch = DwC_Archive(
         DatasetMetadata(meta),
         self.temp_for_task.base_dir_for(self.task_id) / self.DWC_ZIP_NAME)
     # Add data from DB
     # OK because https://edmo.seadatanet.org/v_edmo/browse_step.asp?step=003IMEV_0021
     # But TODO: hardcoded, implement https://github.com/oceanomics/ecotaxa_dev/issues/514
     self.institution_code = "IMEV"
     self.add_events(arch)
     # Individual issues only produce warnings, but no content at all is an error
     if (arch.events.count() == 0 and arch.occurences.count() == 0
             and arch.emofs.count() == 0):
         self.errors.append(
             "No content produced."
             " See previous warnings or check the presence of samples in the projects"
         )
     else:
         # Produce the zip
         arch.build()
         self.log_stats()
     ret.errors = self.errors
     ret.warnings = self.warnings
     if len(ret.errors) == 0:
         ret.task_id = self.task_id
     return ret
Example no. 10
 def do_digests(self, current_user_id: UserIDT,
                prj_id: Optional[ProjectIDT], max_digests: int) -> str:
     """
         Pick some images without a checksum and compute their MD5 digest.
     """
     _user = RightsBO.user_has_role(self.session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     qry: Query = self.session.query(Image, ImageFile)
     if prj_id is not None:
         qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(
             Project)
     qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
     qry = qry.filter(ImageFile.path.is_(None))
     if prj_id is not None:
         qry = qry.filter(Project.projid == prj_id)
     qry = qry.limit(max_digests)
     cnt = 0
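     # For each image lacking a digest: create the ImageFile record if needed, then compute its MD5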
     for an_img, img_file in qry.all():
         cnt += 1
         if img_file is None:
             # No image_file line, add it
             img_file = ImageFile(path=an_img.file_name)
             self.session.add(img_file)
         img_file_path = self.vault.sub_path(an_img.file_name)
         try:
             md5 = self.compute_md5(img_file_path)
             img_file.digest = md5
             img_file.digest_type = '5'
             img_file.state = ImageFileStateEnum.OK.value
         except FileNotFoundError:
             img_file.state = ImageFileStateEnum.MISSING.value
         except Exception as e:
             logger.exception(e)
             img_file.state = ImageFileStateEnum.ERROR.value
     self.session.commit()
     return "Digest for %d images done." % cnt
Example no. 11
 def do_cleanup_dup_same_obj(self, current_user_id: UserIDT,
                             prj_id: ProjectIDT, max_deletes: int) -> str:
     """
         Simplest duplication pattern. Inside the same object there are several identical images.
     """
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     orig_img = aliased(Image, name="orig")
     orig_file = aliased(ImageFile, name="orig_file")
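     # The 'orig' aliases designate the image we keep; plain Image/ImageFile designate the duplicate to delete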
     qry: Query = self.session.query(orig_img.file_name, orig_img.imgid,
                                     Image,
                                     ImageFile)  # Select what to delete
     qry = qry.join(ObjectHeader, ObjectHeader.objid == Image.objid).join(
         Acquisition).join(Sample).join(Project)
     # We consider the original image to be the oldest one, so duplicates have a higher imgid
     qry = qry.join(
         orig_img,
         and_(orig_img.objid == Image.objid,
              orig_img.orig_file_name == Image.orig_file_name,
              orig_img.width == Image.width,
              orig_img.height == Image.height,
              orig_img.imgid < Image.imgid))
     # Both images must have a checksum, in sane (OK) state
     qry = qry.join(
         ImageFile,
         and_(ImageFile.path == Image.file_name,
              ImageFile.state == ImageFileStateEnum.OK.value))
     qry = qry.join(
         orig_file,
         and_(orig_file.path == orig_img.file_name,
              orig_file.state == ImageFileStateEnum.OK.value))
     # ... and the same digest value, of course
     qry = qry.filter(
         and_(ImageFile.digest_type == orig_file.digest_type,
              ImageFile.digest == orig_file.digest))
     qry = qry.filter(Project.projid == prj_id)
     qry = qry.order_by(Image.objid, orig_img.imgid, Image.imgid)
     qry = qry.limit(max_deletes)
     with CodeTimer(
             "Dups same objs inside %d, query '%s':" % (prj_id, str(qry)),
             logger):
         to_do = [(orig_file_name, orig_img_id, an_image, an_image_file)
                  for orig_file_name, orig_img_id, an_image, an_image_file
                  in qry.all()]
     ko_not_same = 0
     ko_except = 0
     # Prepare & start a remover thread which will run in parallel with the DB queries
     remover = VaultRemover(self.link_src, logger).do_start()
     filecmp.clear_cache()
     deleted_imgids: Set[int] = set()
     for orig_file_name, orig_img_id, an_image, an_image_file in to_do:
         # The query returns multiple rows if there are more than 2 duplicates
         if orig_img_id in deleted_imgids:
             continue
         # Even if MD5s match, be paranoid and compare files
         orig_path = self.vault.sub_path(orig_file_name)
         dup_path = self.vault.sub_path(an_image.file_name)
         assert orig_path != dup_path
         orig_exists = exists(orig_path)
         dup_exists = exists(dup_path)
         if orig_exists:
             if dup_exists:
                 try:
                     same = filecmp.cmp(orig_path, dup_path, False)
                 except Exception as exc:
                     logger.info(
                         "Exception while comparing orig:%s and dup:%s: %s",
                         orig_path, dup_path, str(exc))
                     ko_except += 1
                     continue
                 if not same:
                     ko_not_same += 1
                     continue
             else:
                 # Duplicate is gone already
                 pass
         else:
             # DB record of physical file is wrong
             # TODO
             continue
         # Do the cleanup
         deleted_imgids.add(an_image.imgid)
         if dup_exists:
             remover.add_files([an_image.file_name])
         self.session.delete(an_image)
         self.session.delete(an_image_file)
     # Commit the DB deletions, then wait until the remover has handled all files
     self.session.commit()
     remover.wait_for_done()
     return ("Duplicate removal done for %d images, but %d comparison exceptions"
             " and %d file pairs which differed despite matching digests"
             % (len(deleted_imgids), ko_except, ko_not_same))