def _collect_existing_and_validate(self, source_dir_or_zip, loaded_files) \
        -> Tuple[ImportHow, ImportDiagnostic, int]:
    """
        Prepare the import by checking what's inside the project and scanning files to input.
    """
    # The mapping to TSV custom columns, either empty or from previous import operations on same project.
    mapping = ProjectMapping().load_from_project(self.prj)
    # Source bundle construction
    bundle_temp_dir = Path(self.temp_for_jobs.data_dir_for(self.job_id))
    source_bundle = InBundle(source_dir_or_zip, bundle_temp_dir)
    # Configure the validation to come, directives.
    import_how = ImportHow(self.prj_id, self.req.update_mode, mapping,
                           self.req.skip_existing_objects, loaded_files)
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    # A structure to collect validation result
    import_diag = ImportDiagnostic()
    if not self.req.skip_existing_objects:
        with CodeTimer("collect_existing: Existing images for %d: " % self.prj_id, logger):
            import_diag.existing_objects_and_image = Image.fetch_existing_images(self.session, self.prj_id)
    import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
    # Do the bulk job of validation
    nb_rows = source_bundle.validate_import(import_how, import_diag, self.session,
                                            self.report_validation_progress)
    return import_how, import_diag, nb_rows
def do_intra_step_1(self, loaded_files):
    # The mapping to custom columns, either empty or from previous import operations on same project.
    custom_mapping = ProjectMapping().load_from_project(self.prj)
    # Source bundle construction
    source_bundle = InBundle(self.source_dir_or_zip,
                             Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Configure the validation to come, directives.
    import_how = ImportHow(self.prj_id, self.req.update_mode, custom_mapping,
                           self.req.skip_existing_objects, loaded_files)
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    # A structure to collect validation result
    import_diag = ImportDiagnostic()
    if not self.req.skip_existing_objects:
        with CodeTimer("do_intra_step_1: Existing images for %d: " % self.prj_id, logger):
            import_diag.existing_objects_and_image = Image.fetch_existing_images(self.session, self.prj_id)
    import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
    # Do the bulk job of validation
    nb_rows = source_bundle.validate_import(import_how, import_diag, self.session, self.report_progress)
    return import_how, import_diag, nb_rows
def do_run(self, current_user_id: int) -> ImportRealRsp:
    """
        Do the real job using injected parameters.
        :return:
    """
    # Security check
    RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE, self.prj_id)
    # OK
    loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
    logger.info("Previously loaded files: %s", loaded_files)
    # Save mappings straight away
    self.save_mapping(self.custom_mapping)
    source_bundle = InBundle(self.req.source_path,
                             Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Configure the import to come, destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(db_writer, self.vault,
                               self.temp_for_task.base_dir_for(self.task_id))
    # Configure the import to come, directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, self.custom_mapping,
                           self.req.skip_existing_objects, loaded_files)
    import_how.taxo_mapping = self.req.taxo_mappings
    import_how.taxo_found = self.req.found_taxa
    import_how.found_users = self.req.found_users
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    if self.req.skip_existing_objects:
        # If we must skip existing objects then do an inventory of what's in already
        with CodeTimer("run: Existing images for %d: " % self.prj_id, logger):
            import_how.objects_and_images_to_skip = Image.fetch_existing_images(self.session, self.prj_id)
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Do the bulk job of import
    row_count = source_bundle.do_import(import_where, import_how, self.req.rowcount, self.report_progress)
    # Update loaded files in DB, removing duplicates
    self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
    self.session.commit()
    # Recompute stats
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    logger.info("Total of %d rows loaded" % row_count)
    # Prepare response
    ret = ImportRealRsp()
    return ret
def do_real(self) -> None:
    """
        Do the real job, i.e. write everywhere (DB/filesystem)
    """
    loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
    logger.info("Previously loaded files: %s", loaded_files)
    found_users, taxo_found, col_mapping_dict, \
        nb_rows, source_path = self._load_vars_from_state(self.STATE_KEYS)
    # Save mappings straight away
    col_mapping = ProjectMapping().load_from_dict(col_mapping_dict)
    col_mapping.write_to_project(self.prj)
    self.session.commit()
    # TODO: Duplicated code
    source_bundle = InBundle(source_path,
                             Path(self.temp_for_jobs.data_dir_for(self.job_id)))
    # Configure the import to come, destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(db_writer, self.vault,
                               self.temp_for_jobs.base_dir_for(self.job_id))
    # Configure the import to come, directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, col_mapping,
                           self.req.skip_existing_objects, loaded_files)
    import_how.taxo_mapping = self.req.taxo_mappings
    import_how.found_taxa = taxo_found
    import_how.found_users = found_users
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    if self.req.skip_existing_objects:
        # If we must skip existing objects then do an inventory of what's in already
        with CodeTimer("run: Existing images for %d: " % self.prj_id, logger):
            import_how.objects_and_images_to_skip = Image.fetch_existing_images(self.session, self.prj_id)
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Do the bulk job of import
    rowcount_from_validate = nb_rows
    row_count = source_bundle.do_import(import_where, import_how, rowcount_from_validate, self.report_progress)
    # Update loaded files in DB, removing duplicates
    self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
    self.session.commit()
    # Recompute stats
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    msg = "Total of %d rows loaded" % row_count
    logger.info(msg)
    self.set_job_result(errors=[], infos={"rowcount": row_count})
def _verify_possible(self, dest_prj: Project, src_prj: Project) -> List[str]:
    """
        Verify that the merge would not mean a loss of information.
        The mappings of the src project should be preserved and copied into the dest project.
        Augmented mappings should fit in the allowed maximum size for each entity.
        :param dest_prj:
        :param src_prj:
        :return: a list of problems, empty means we can proceed.
    """
    ret: List[str] = []
    dest_mappings = ProjectMapping().load_from_project(dest_prj)
    src_mappings = ProjectMapping().load_from_project(src_prj)
    a_tbl: MappedTableTypeT
    for a_tbl in MAPPED_TABLES:
        mappings_for_dest_tbl = dest_mappings.by_table[a_tbl]
        mappings_for_src_tbl = src_mappings.by_table[a_tbl]
        # Compute the new mapping and the transformations, if any, needed to get there
        aug, remaps, errs = mappings_for_dest_tbl.augmented_with(mappings_for_src_tbl)
        ret.extend(errs)
        if len(remaps) > 0:
            self.remap_operations[a_tbl] = remaps
        # Load future mapping
        self.dest_augmented_mappings.by_table[a_tbl].load_from(aug)
    # Also check consistency problems on unique orig_id
    dest_parents = InBundle.fetch_existing_parents(self.ro_session, prj_id=self.prj_id)
    src_parents = InBundle.fetch_existing_parents(self.ro_session, prj_id=self.src_prj_id)
    for an_orig_id_container in [Sample.__tablename__, Acquisition.__tablename__]:
        # key = orig_id value, value = full record
        dest_orig_ids = dest_parents[an_orig_id_container]
        src_orig_ids = src_parents[an_orig_id_container]
        common_orig_ids = set(dest_orig_ids.keys()).intersection(src_orig_ids.keys())
        if len(common_orig_ids) != 0:
            logger.info("Common %s orig_ids: %s", an_orig_id_container, common_orig_ids)
        for common_orig_id in common_orig_ids:
            orm_diff = orm_equals(dest_orig_ids[common_orig_id], src_orig_ids[common_orig_id])
            if orm_diff:
                msg = ("Data conflict: %s record with orig_id '%s' is different in destination project: %s"
                       % (an_orig_id_container, common_orig_id, str(orm_diff)))
                # TODO: Should this be an error?
                logger.warning(msg)
    return ret
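# Hypothetical sketch, NOT ProjectMapping's actual code: it only illustrates the contract
# that augmented_with() above is expected to fulfil, i.e. merge the src free-column mapping
# into dest's, report the physical-column renames ("remaps") needed on the src side, and
# emit an error when the per-table limit of free columns would be exceeded. The "n01"-style
# physical column names, max_cols limit and function name are assumptions for illustration.
from typing import Dict, List, Tuple


def augment_mapping_sketch(dest: Dict[str, str], src: Dict[str, str], max_cols: int
                           ) -> Tuple[Dict[str, str], Dict[str, str], List[str]]:
    augmented = dict(dest)
    remaps: Dict[str, str] = {}
    errors: List[str] = []
    # Physical slots not yet used by dest, tried in order
    free_cols = (c for c in ("n%02d" % i for i in range(1, max_cols + 1))
                 if c not in dest.values())
    for tsv_col, src_phy in src.items():
        if tsv_col in augmented:
            # Same TSV column exists in dest: src data must land in dest's physical column
            dest_phy = augmented[tsv_col]
        else:
            dest_phy = next(free_cols, None)
            if dest_phy is None:
                errors.append("no free column left for '%s'" % tsv_col)
                continue
            augmented[tsv_col] = dest_phy
        if dest_phy != src_phy:
            remaps[src_phy] = dest_phy
    return augmented, remaps, errors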
def make_tsv(self, bundle: InBundle, images: List[Path]):
    """
        Generate a TSV file from values, inject it into the bundle.
    """
    dest_file = Path(self.temp_for_task.in_base_dir_for(self.task_id, "import_meta.tsv"))
    with open(dest_file, "w", encoding='latin_1') as fp:
        fp.write(self.make_header())
        for an_image in images:
            tsv_line = self.make_line(str(an_image))
            fp.write(tsv_line)
    bundle.add_tsv(dest_file)
def _do_clone(self):
    """
        Cloning operation itself. Assumes that @see self.to_clone was populated before.
    """
    # Get the mappings in the source project, in order to determine the useful columns
    custom_mapping = ProjectMapping().load_from_project(self.prj)
    obj_mapping = custom_mapping.object_mappings
    used_columns = set(obj_mapping.real_cols_to_tsv.keys())
    used_columns.add("orig_id")  # For safety
    # Create a DB writer
    writer = DBWriter(self.session)
    # Narrow the writes in ObjectFields thanks to the mappings of the original project
    writer.generators({"obj_field": used_columns})
    # Use import helpers
    dest_prj_id = self.dest_prj.projid
    import_how = ImportHow(prj_id=dest_prj_id, update_mode="No",
                           custom_mapping=ProjectMapping(),
                           skip_object_duplicates=False, loaded_files=[])
    # Get parent (enclosing) Sample, Acquisition, Process. There should be 0 in this context...
    import_how.existing_parents = InBundle.fetch_existing_parents(self.session, prj_id=dest_prj_id)
    self._clone_all(import_how, writer)
    # Copy mappings to destination. We could narrow them to the minimum?
    custom_mapping.write_to_project(self.dest_prj)
def do_import(self):
    """
        Do the real job, i.e. copy files while creating records.
    """
    errors = []
    self.manage_uploaded()
    self.unzip_if_needed()
    # Use a Bundle
    source_bundle = InBundle(self.source_dir_or_zip,
                             Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Clean it, in case the ZIP contains a CSV
    source_bundle.remove_all_tsvs()
    images = source_bundle.list_image_files()
    # Configure the import to come, destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(db_writer, self.vault,
                               self.temp_for_task.base_dir_for(self.task_id))
    # Configure the import to come, directives
    import_how = ImportHow(prj_id=self.prj_id, update_mode="",
                           custom_mapping=ProjectMapping(),
                           skip_object_duplicates=False, loaded_files=[])
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Generate TSV
    req_values = self.req.values
    if req_values.get(SimpleImportFields.userlb, ""):
        import_how.found_users["user"] = {"id": req_values.get(SimpleImportFields.userlb)}
        req_values[SimpleImportFields.userlb] = "user"
    if req_values.get(SimpleImportFields.status, ""):
        req_values[SimpleImportFields.status] = classif_qual.get(req_values[SimpleImportFields.status], "")
    self.make_tsv(source_bundle, images)
    # Import
    nb_image_files = len(images)
    nb_images = source_bundle.do_import(import_where, import_how, nb_image_files, self.report_progress)
    self.session.commit()
    # Recompute stats and so on
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    ret = SimpleImportRsp(errors=errors, nb_images=nb_images)
    return ret
def _do_merge(self, dest_prj: Project):
    """
        Real merge operation.
    """
    # Loop over involved tables and remap free columns
    for a_mapped_tbl in MAPPED_TABLES:
        remaps = self.remap_operations.get(a_mapped_tbl)
        # Do the remappings if any
        if remaps is not None:
            logger.info("Doing re-mapping in %s: %s", a_mapped_tbl.__tablename__, remaps)
            ProjectBO.remap(self.session, self.src_prj_id, a_mapped_tbl, remaps)
    # Collect orig_id
    dest_parents = InBundle.fetch_existing_parents(self.ro_session, prj_id=self.prj_id)
    src_parents = InBundle.fetch_existing_parents(self.ro_session, prj_id=self.src_prj_id)
    # Compute needed projections in order to keep orig_id unicity
    common_samples = self.get_ids_for_common_orig_id(Sample, dest_parents, src_parents)
    common_acquisitions = self.get_ids_for_common_orig_id(Acquisition, dest_parents, src_parents)
    # Align foreign keys, to Project, Sample and Acquisition
    for a_fk_to_proj_tbl in [Sample, Acquisition, ObjectHeader, ParticleProject]:
        upd: Query = self.session.query(a_fk_to_proj_tbl)
        if a_fk_to_proj_tbl == Sample:
            # Move (i.e. change project) samples which are 'new' from merged project,
            # so take all of them from src project...
            upd = upd.filter(a_fk_to_proj_tbl.projid == self.src_prj_id)  # type: ignore
            # ...but not the ones with same orig_id, which are presumably equal.
            upd = upd.filter(Sample.sampleid != all_(list(common_samples.keys())))
            # And update the column
            upd_values = {'projid': self.prj_id}
        elif a_fk_to_proj_tbl == Acquisition:
            # Acquisitions which were created, in source, under new samples, will 'follow'
            # them during above move, thanks to the FK on acq_sample_id.
            # BUT some acquisitions were potentially created in source project, inside
            # forked samples. They need to be attached to the dest (self) corresponding sample.
            if len(common_samples) > 0:
                # Build a CTE with values for the update
                smp_cte = values_cte("upd_smp", ("src_id", "dst_id"),
                                     [(k, v) for k, v in common_samples.items()])
                smp_subqry = self.session.query(smp_cte.c.column2).filter(
                    smp_cte.c.column1 == Acquisition.acq_sample_id)
                upd_values = {'acq_sample_id': func.coalesce(
                    smp_subqry.scalar_subquery(),  # type: ignore
                    Acquisition.acq_sample_id)}
                upd = upd.filter(Acquisition.acq_sample_id == any_(list(common_samples.keys())))
                # upd = upd.filter(Acquisition.acquisid != all_(list(common_acquisitions.keys())))
            if len(common_samples) == 0:
                # Nothing to do. There were only new samples, all of them moved to self.
                continue
        elif a_fk_to_proj_tbl == ObjectHeader:
            # Generated SQL looks like:
            # with upd_acq (src_id, dst_id) as (values (5,6), (7,8))
            # update obj_head
            #    set acquisid = coalesce((select dst_id from upd_acq where acquisid=src_id), acquisid)
            #  where acquisid in (select src_id from upd_acq)
            if len(common_acquisitions) > 0:
                # Object must follow its acquisition
                acq_cte = values_cte("upd_acq", ("src_id", "dst_id"),
                                     [(k, v) for k, v in common_acquisitions.items()])
                acq_subqry = self.session.query(acq_cte.c.column2).filter(
                    acq_cte.c.column1 == ObjectHeader.acquisid)
                upd_values = {'acquisid': func.coalesce(
                    acq_subqry.scalar_subquery(),  # type:ignore
                    ObjectHeader.acquisid)}
                upd = upd.filter(ObjectHeader.acquisid == any_(list(common_acquisitions.keys())))
            if len(common_acquisitions) == 0:
                # Nothing to do. There were only new acquisitions, all of them moved to self.
                continue
        else:
            # For Particle project
            upd = upd.filter(ParticleProject.projid == self.src_prj_id)  # type: ignore
            upd_values = {'projid': self.prj_id}
        rowcount = upd.update(values=upd_values, synchronize_session=False)
        table_name = a_fk_to_proj_tbl.__tablename__  # type: ignore
        logger.info("Update in %s: %s rows", table_name, rowcount)
    # Acquisition & twin Process have followed their enclosing Sample
    # Remove the parents which are duplicate from orig_id point of view
    for a_fk_to_proj_tbl in [Acquisition, Sample]:
        to_del: Query = self.session.query(a_fk_to_proj_tbl)
        if a_fk_to_proj_tbl == Acquisition:
            # Remove conflicting acquisitions, they should be empty?
            to_del = to_del.filter(Acquisition.acquisid == any_(list(common_acquisitions.keys())))  # type: ignore
        elif a_fk_to_proj_tbl == Sample:
            # Remove conflicting samples
            to_del = to_del.filter(Sample.sampleid == any_(list(common_samples.keys())))  # type: ignore
        rowcount = to_del.delete(synchronize_session=False)
        table_name = a_fk_to_proj_tbl.__tablename__  # type: ignore
        logger.info("Delete in %s: %s rows", table_name, rowcount)
    self.dest_augmented_mappings.write_to_project(dest_prj)
    ProjectPrivilegeBO.generous_merge_into(self.session, self.prj_id, self.src_prj_id)
    # Completely erase the source project
    ProjectBO.delete(self.session, self.src_prj_id)
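# The ObjectHeader branch in _do_merge() relies on the SQL sketched in its comment:
# a VALUES CTE mapping source ids to destination ids, with COALESCE so rows without
# a match keep their current value. The demo below is a minimal, self-contained
# illustration of that SQL pattern only, using sqlite3 and a throwaway obj_head table;
# it is not the project's schema nor the values_cte() helper used above.
def _values_cte_remap_demo() -> None:
    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE obj_head (objid INTEGER PRIMARY KEY, acquisid INTEGER)")
    conn.executemany("INSERT INTO obj_head VALUES (?, ?)",
                     [(1, 5), (2, 7), (3, 9)])  # acquisid 9 has no remap entry
    # Remap acquisid 5 -> 6 and 7 -> 8; other rows are left untouched by COALESCE
    conn.execute("""
        WITH upd_acq (src_id, dst_id) AS (VALUES (5, 6), (7, 8))
        UPDATE obj_head
           SET acquisid = COALESCE(
                   (SELECT dst_id FROM upd_acq WHERE upd_acq.src_id = obj_head.acquisid),
                   acquisid)
         WHERE acquisid IN (SELECT src_id FROM upd_acq)
    """)
    assert conn.execute("SELECT acquisid FROM obj_head ORDER BY objid").fetchall() == \
           [(6,), (8,), (9,)]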