Example #1
    def _collect_existing_and_validate(self, source_dir_or_zip, loaded_files) \
            -> Tuple[ImportHow, ImportDiagnostic, int]:
        """
            Prepare the import by checking what's inside the project and scanning the files to import.
        """
        # The mapping to TSV custom columns, either empty or from previous import operations on the same project.
        mapping = ProjectMapping().load_from_project(self.prj)
        # Source bundle construction
        bundle_temp_dir = Path(self.temp_for_jobs.data_dir_for(self.job_id))
        source_bundle = InBundle(source_dir_or_zip, bundle_temp_dir)
        # Configure the validation to come, directives.
        import_how = ImportHow(self.prj_id, self.req.update_mode, mapping,
                               self.req.skip_existing_objects, loaded_files)
        if self.req.skip_loaded_files:
            import_how.compute_skipped(source_bundle, logger)
        # A structure to collect validation results
        import_diag = ImportDiagnostic()
        if not self.req.skip_existing_objects:
            with CodeTimer(
                    "collect_existing: Existing images for %d: " % self.prj_id,
                    logger):
                import_diag.existing_objects_and_image = Image.fetch_existing_images(
                    self.session, self.prj_id)
        import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
        # Do the bulk job of validation
        nb_rows = source_bundle.validate_import(
            import_how, import_diag, self.session,
            self.report_validation_progress)
        return import_how, import_diag, nb_rows
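
For context, this validation pass is the first half of a two-phase import: the returned triple is what the write phase consumes. A minimal driver sketch, assuming the diagnostic object exposes an errors list (the attribute name is assumed, not shown above):

    # Hypothetical caller, for illustration only
    import_how, import_diag, nb_rows = self._collect_existing_and_validate(
        source_dir_or_zip, loaded_files)
    if import_diag.errors:         # 'errors' is an assumed attribute
        return import_diag.errors  # abort before any write happens
    # ...otherwise hand import_how and nb_rows over to the import phase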
Example #2
    def do_intra_step_1(self, loaded_files):
        # The mapping to custom columns, either empty or from previous import operations on the same project.
        custom_mapping = ProjectMapping().load_from_project(self.prj)
        # Source bundle construction
        source_bundle = InBundle(
            self.source_dir_or_zip,
            Path(self.temp_for_task.data_dir_for(self.task_id)))
        # Configure the validation to come, directives.
        import_how = ImportHow(self.prj_id, self.req.update_mode,
                               custom_mapping, self.req.skip_existing_objects,
                               loaded_files)
        if self.req.skip_loaded_files:
            import_how.compute_skipped(source_bundle, logger)
        # A structure to collect validation results
        import_diag = ImportDiagnostic()
        if not self.req.skip_existing_objects:
            with CodeTimer(
                    "do_intra_step_1: Existing images for %d: " % self.prj_id,
                    logger):
                import_diag.existing_objects_and_image = Image.fetch_existing_images(
                    self.session, self.prj_id)
        import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
        # Do the bulk job of validation
        nb_rows = source_bundle.validate_import(import_how, import_diag,
                                                self.session,
                                                self.report_progress)
        return import_how, import_diag, nb_rows
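
`validate_import` receives `self.report_progress` as a callback; its signature is not visible in these examples. A plausible sketch, assuming it is called with the processed row count:

    def report_progress(self, current, total=None):
        # Assumed signature: relay progress to the enclosing task record
        logger.info("Validated %s/%s rows", current, total)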
Example #3
    def do_run(self, current_user_id: int) -> ImportRealRsp:
        """
            Do the real job using injected parameters.
            :return:
        """
        # Security check
        RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE,
                            self.prj_id)
        # OK
        loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
        logger.info("Previously loaded files: %s", loaded_files)

        # Save mappings straight away
        self.save_mapping(self.custom_mapping)

        source_bundle = InBundle(
            self.req.source_path,
            Path(self.temp_for_task.data_dir_for(self.task_id)))
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_task.base_dir_for(self.task_id))
        # Configure the import to come, directives
        import_how = ImportHow(self.prj_id, self.req.update_mode,
                               self.custom_mapping,
                               self.req.skip_existing_objects, loaded_files)
        import_how.taxo_mapping = self.req.taxo_mappings
        import_how.taxo_found = self.req.found_taxa
        import_how.found_users = self.req.found_users
        if self.req.skip_loaded_files:
            import_how.compute_skipped(source_bundle, logger)
        if not self.req.skip_existing_objects:
            with CodeTimer("run: Existing images for %d: " % self.prj_id,
                           logger):
                import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                    self.session, self.prj_id)
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))

        # Do the bulk job of import
        row_count = source_bundle.do_import(import_where, import_how,
                                            self.req.rowcount,
                                            self.report_progress)

        # Update loaded files in DB, removing duplicates
        self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
        self.session.commit()

        # Recompute stats
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        logger.info("Total of %d rows loaded" % row_count)

        # Prepare response
        ret = ImportRealRsp()
        return ret
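
`none_to_empty` is applied above to `self.prj.fileloaded`, a nullable text column, so `.splitlines()` never sees None. The helper is not shown; a minimal sketch consistent with that use:

    from typing import Optional

    def none_to_empty(value: Optional[str]) -> str:
        # Turn a NULL/None DB value into an empty string
        return value if value is not None else ""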
Example #4
    def do_real(self) -> None:
        """
            Do the real job, i.e. write everywhere (DB/filesystem)
        """
        loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
        logger.info("Previously loaded files: %s", loaded_files)

        found_users, taxo_found, col_mapping_dict, \
        nb_rows, source_path = self._load_vars_from_state(self.STATE_KEYS)

        # Save mappings straight away
        col_mapping = ProjectMapping().load_from_dict(col_mapping_dict)
        col_mapping.write_to_project(self.prj)
        self.session.commit()

        # TODO: Duplicated code
        source_bundle = InBundle(
            source_path, Path(self.temp_for_jobs.data_dir_for(self.job_id)))
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_jobs.base_dir_for(self.job_id))
        # Configure the import to come, directives
        import_how = ImportHow(self.prj_id, self.req.update_mode, col_mapping,
                               self.req.skip_existing_objects, loaded_files)
        import_how.taxo_mapping = self.req.taxo_mappings
        import_how.found_taxa = taxo_found
        import_how.found_users = found_users
        if self.req.skip_loaded_files:
            import_how.compute_skipped(source_bundle, logger)
        if self.req.skip_existing_objects:
            # If we must skip existing objects then take an inventory of what's already in there
            with CodeTimer("run: Existing images for %d: " % self.prj_id,
                           logger):
                import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                    self.session, self.prj_id)
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))

        # Do the bulk job of import
        rowcount_from_validate = nb_rows
        row_count = source_bundle.do_import(import_where, import_how,
                                            rowcount_from_validate,
                                            self.report_progress)

        # Update loaded files in DB, removing duplicates
        self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
        self.session.commit()

        # Recompute stats
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        msg = "Total of %d rows loaded" % row_count
        logger.info(msg)
        self.set_job_result(errors=[], infos={"rowcount": row_count})
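
Note the round-trip on `fileloaded`: it is split with `splitlines()` on read and rebuilt with `"\n".join(set(...))` on write, which de-duplicates but does not preserve order. A quick illustration:

    loaded = "a.tsv\nb.tsv\na.tsv".splitlines()  # ['a.tsv', 'b.tsv', 'a.tsv']
    stored = "\n".join(set(loaded))              # duplicates removed, order arbitrary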
Example #5
    def _verify_possible(self, dest_prj: Project,
                         src_prj: Project) -> List[str]:
        """
            Verify that the merge would not mean a loss of information.
                The mappings of src project should be preserved and copied into dest project.
                Augmented mappings should fit in the allowed maximum size for each entity.
            :param dest_prj:
            :param src_prj:
            :return: a list of problems, empty means we can proceed.
        """
        ret = []
        dest_mappings = ProjectMapping().load_from_project(dest_prj)
        src_mappings = ProjectMapping().load_from_project(src_prj)
        a_tbl: MappedTableTypeT
        for a_tbl in MAPPED_TABLES:
            mappings_for_dest_tbl = dest_mappings.by_table[a_tbl]
            mappings_for_src_tbl = src_mappings.by_table[a_tbl]
            # Compute the new mapping and any transformations needed to get there
            aug, remaps, errs = mappings_for_dest_tbl.augmented_with(
                mappings_for_src_tbl)
            ret.extend(errs)
            if len(remaps) > 0:
                self.remap_operations[a_tbl] = remaps
            # Load future mapping
            self.dest_augmented_mappings.by_table[a_tbl].load_from(aug)

        # Also check for consistency problems around unique orig_id
        dest_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                       prj_id=self.prj_id)
        src_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                      prj_id=self.src_prj_id)

        for an_orig_id_container in [
                Sample.__tablename__, Acquisition.__tablename__
        ]:
            # Dicts keyed by orig_id value, each holding the full record
            dest_orig_ids = dest_parents[an_orig_id_container]
            src_orig_ids = src_parents[an_orig_id_container]
            common_orig_ids = set(dest_orig_ids.keys()).intersection(
                src_orig_ids.keys())
            if len(common_orig_ids) != 0:
                logger.info("Common %s orig_ids: %s", an_orig_id_container,
                            common_orig_ids)
            for common_orig_id in common_orig_ids:
                orm_diff = orm_equals(dest_orig_ids[common_orig_id],
                                      src_orig_ids[common_orig_id])
                if orm_diff:
                    msg = (
                        "Data conflict: %s record with orig_id '%s' is different in destination project: %s"
                        %
                        (an_orig_id_container, common_orig_id, str(orm_diff)))
                    # TODO: Should be an error?
                    logger.warning(msg)
        return ret
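
A typical caller treats a non-empty return value as a veto on the merge. A minimal sketch (the response class name MergeRsp is an assumption, not shown in these examples):

    problems = self._verify_possible(dest_prj, src_prj)
    if len(problems) > 0:
        # Refuse the merge and surface the problems to the caller
        return MergeRsp(errors=problems)  # MergeRsp is assumed here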
Example #6
    def make_tsv(self, bundle: InBundle, images: List[Path]):
        """
            Generate a TSV file from values and inject it into the bundle.
        """
        dest_file = Path(
            self.temp_for_task.in_base_dir_for(self.task_id,
                                               "import_meta.tsv"))
        with open(dest_file, "w", encoding='latin_1') as fp:
            fp.write(self.make_header())
            for an_image in images:
                tsv_line = self.make_line(str(an_image))
                fp.write(tsv_line)
        bundle.add_tsv(dest_file)
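
`make_header` and `make_line` are not shown; given the TSV target, a plausible shape (column names are purely illustrative) would be:

    from pathlib import Path

    def make_header(self) -> str:
        # Illustrative: tab-separated column names, newline-terminated
        return "img_file_name\tobject_id\n"

    def make_line(self, image_path: str) -> str:
        # Illustrative: one row per image, fields joined by tabs
        return "%s\t%s\n" % (image_path, Path(image_path).stem)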
Example #7
    def _do_clone(self):
        """
            Cloning operation itself. Assumes that @see self.to_clone was populated before.
        """
        # Get the mappings in the source project, in order to determine the useful columns
        custom_mapping = ProjectMapping().load_from_project(self.prj)
        obj_mapping = custom_mapping.object_mappings
        used_columns = set(obj_mapping.real_cols_to_tsv.keys())
        used_columns.add("orig_id")  # By safety
        # Create a DB writer
        writer = DBWriter(self.session)
        # Narrow the writes in ObjectFields thanks to mappings of original project
        writer.generators({"obj_field": used_columns})
        # Use import helpers
        dest_prj_id = self.dest_prj.projid
        import_how = ImportHow(prj_id=dest_prj_id,
                               update_mode="No",
                               custom_mapping=ProjectMapping(),
                               skip_object_duplicates=False,
                               loaded_files=[])
        # Get parent (enclosing) Sample, Acquisition, Process. There should be 0 in this context...
        import_how.existing_parents = InBundle.fetch_existing_parents(
            self.session, prj_id=dest_prj_id)

        self._clone_all(import_how, writer)
        # Copy mappings to destination. We could narrow them to the minimum?
        custom_mapping.write_to_project(self.dest_prj)
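
For orientation, a hedged sketch of how such a clone service might be driven; everything except `_do_clone` (constructor, selection step) is illustrative and not taken from the source:

    # Hypothetical driver, names are illustrative
    svc = ProjectClone(prj=src_project, dest_prj=dst_project)
    svc.to_clone = svc.select_objects(a_filter)  # populate before cloning
    svc._do_clone()
    svc.session.commit()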
Example #8
    def do_import(self):
        """
            Do the real job, i.e. copy files while creating records.
        """
        errors = []
        self.manage_uploaded()
        self.unzip_if_needed()
        # Use a Bundle
        source_bundle = InBundle(
            self.source_dir_or_zip,
            Path(self.temp_for_task.data_dir_for(self.task_id)))
        # Clean it, in case the ZIP contains a TSV
        source_bundle.remove_all_tsvs()
        images = source_bundle.list_image_files()
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_task.base_dir_for(self.task_id))
        # Configure the import to come, directives
        import_how = ImportHow(prj_id=self.prj_id,
                               update_mode="",
                               custom_mapping=ProjectMapping(),
                               skip_object_duplicates=False,
                               loaded_files=[])
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
        # Generate TSV
        req_values = self.req.values
        if req_values.get(SimpleImportFields.userlb, ""):
            import_how.found_users["user"] = {
                "id": req_values.get(SimpleImportFields.userlb)
            }
            req_values[SimpleImportFields.userlb] = "user"
        if req_values.get(SimpleImportFields.status, ""):
            req_values[SimpleImportFields.status] = classif_qual.get(
                req_values[SimpleImportFields.status], "")
        self.make_tsv(source_bundle, images)
        # Import
        nb_image_files = len(images)
        nb_images = source_bundle.do_import(import_where, import_how,
                                            nb_image_files,
                                            self.report_progress)
        self.session.commit()

        # Recompute stats and so on
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        ret = SimpleImportRsp(errors=errors, nb_images=nb_images)
        return ret
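
The `found_users` assignment above pre-resolves the user: the generated TSV carries the literal token "user", which the import machinery then resolves through `import_how.found_users` instead of a DB lookup. Illustratively, assuming the request carried userlb == 42 and found_users started empty:

    assert import_how.found_users == {"user": {"id": 42}}
    assert req_values[SimpleImportFields.userlb] == "user"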
Example #9
    def _do_merge(self, dest_prj: Project):
        """
            Real merge operation.
        """
        # Loop over involved tables and remap free columns
        for a_mapped_tbl in MAPPED_TABLES:
            remaps = self.remap_operations.get(a_mapped_tbl)
            # Do the remappings if any
            if remaps is not None:
                logger.info("Doing re-mapping in %s: %s",
                            a_mapped_tbl.__tablename__, remaps)
                ProjectBO.remap(self.session, self.src_prj_id, a_mapped_tbl,
                                remaps)

        # Collect orig_id
        dest_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                       prj_id=self.prj_id)
        src_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                      prj_id=self.src_prj_id)

        # Compute the projections needed to keep orig_id uniqueness
        common_samples = self.get_ids_for_common_orig_id(
            Sample, dest_parents, src_parents)
        common_acquisitions = self.get_ids_for_common_orig_id(
            Acquisition, dest_parents, src_parents)

        # Align foreign keys, to Project, Sample and Acquisition
        for a_fk_to_proj_tbl in [
                Sample, Acquisition, ObjectHeader, ParticleProject
        ]:
            upd: Query = self.session.query(a_fk_to_proj_tbl)
            if a_fk_to_proj_tbl == Sample:
                # Move (i.e. change project) samples which are 'new' from merged project,
                #    so take all of them from src project...
                upd = upd.filter(
                    a_fk_to_proj_tbl.projid == self.src_prj_id)  # type: ignore
                # ...but not the ones with same orig_id, which are presumably equal.
                upd = upd.filter(
                    Sample.sampleid != all_(list(common_samples.keys())))
                # And update the column
                upd_values = {'projid': self.prj_id}
            elif a_fk_to_proj_tbl == Acquisition:
                # Acquisitions which were created, in source, under new samples, will 'follow'
                #    them during above move, thanks to the FK on acq_sample_id.
                # BUT some acquisitions were potentially created in source project, inside
                #    forked samples. They need to be attached to the dest (self) corresponding sample.
                if len(common_samples) > 0:
                    # Build a CTE with values for the update
                    smp_cte = values_cte("upd_smp", ("src_id", "dst_id"),
                                         [(k, v)
                                          for k, v in common_samples.items()])
                    smp_subqry = self.session.query(smp_cte.c.column2).filter(
                        smp_cte.c.column1 == Acquisition.acq_sample_id)
                    upd_values = {
                        'acq_sample_id':
                        func.coalesce(
                            smp_subqry.scalar_subquery(),  # type: ignore
                            Acquisition.acq_sample_id)
                    }
                    upd = upd.filter(Acquisition.acq_sample_id == any_(
                        list(common_samples.keys())))
                    # upd = upd.filter(Acquisition.acquisid != all_(list(common_acquisitions.keys())))
                if len(common_samples) == 0:
                    # Nothing to do. There were only new samples, all of them moved to self.
                    continue
            elif a_fk_to_proj_tbl == ObjectHeader:
                # Generated SQL looks like:
                # with upd_acq (src_id, dst_id) as (values (5,6), (7,8))
                # update obj_head
                #    set acquisid = coalesce((select dst_id from upd_acq where acquisid=src_id), acquisid)
                #  where acquisid in (select src_id from upd_acq)
                if len(common_acquisitions) > 0:
                    # Object must follow its acquisition
                    acq_cte = values_cte(
                        "upd_acq", ("src_id", "dst_id"),
                        [(k, v) for k, v in common_acquisitions.items()])
                    acq_subqry = self.session.query(acq_cte.c.column2).filter(
                        acq_cte.c.column1 == ObjectHeader.acquisid)
                    upd_values = {
                        'acquisid':
                        func.coalesce(
                            acq_subqry.scalar_subquery(),  # type:ignore
                            ObjectHeader.acquisid)
                    }
                    upd = upd.filter(ObjectHeader.acquisid == any_(
                        list(common_acquisitions.keys())))
                if len(common_acquisitions) == 0:
                    # Nothing to do. There were only new acquisitions, all of them moved to self.
                    continue
            else:
                # For Particle project
                upd = upd.filter(
                    ParticleProject.projid == self.src_prj_id)  # type: ignore
                upd_values = {'projid': self.prj_id}
            rowcount = upd.update(values=upd_values, synchronize_session=False)
            table_name = a_fk_to_proj_tbl.__tablename__  # type: ignore
            logger.info("Update in %s: %s rows", table_name, rowcount)

        # Acquisition & twin Process have followed their enclosing Sample

        # Remove the parents which are duplicate from orig_id point of view
        for a_fk_to_proj_tbl in [Acquisition, Sample]:
            to_del: Query = self.session.query(a_fk_to_proj_tbl)
            if a_fk_to_proj_tbl == Acquisition:
                # Remove conflicting acquisitions, they should be empty?
                to_del = to_del.filter(Acquisition.acquisid == any_(
                    list(common_acquisitions.keys())))  # type: ignore
            elif a_fk_to_proj_tbl == Sample:
                # Remove conflicting samples
                to_del = to_del.filter(Sample.sampleid == any_(
                    list(common_samples.keys())))  # type: ignore
            rowcount = to_del.delete(synchronize_session=False)
            table_name = a_fk_to_proj_tbl.__tablename__  # type: ignore
            logger.info("Delete in %s: %s rows", table_name, rowcount)

        self.dest_augmented_mappings.write_to_project(dest_prj)

        ProjectPrivilegeBO.generous_merge_into(self.session, self.prj_id,
                                               self.src_prj_id)

        # Completely erase the source project
        ProjectBO.delete(self.session, self.src_prj_id)
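
The CTE-plus-coalesce pattern used for both the Acquisition and ObjectHeader branches above can be spelled out as plain SQL. A standalone sketch with toy ids and a toy table name (the real code builds this through the `values_cte` helper shown above):

    from sqlalchemy import text

    # PostgreSQL allows a CTE in front of UPDATE; toy remapping (5->6, 7->8)
    remap_sql = text("""
        WITH upd_smp (src_id, dst_id) AS (VALUES (5, 6), (7, 8))
        UPDATE acquisitions
           SET acq_sample_id = COALESCE(
                   (SELECT dst_id FROM upd_smp WHERE src_id = acq_sample_id),
                   acq_sample_id)
         WHERE acq_sample_id IN (SELECT src_id FROM upd_smp)
    """)
    session.execute(remap_sql)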