def _do_clone(self):
    """
        Cloning operation itself. Assumes that @see self.to_clone was populated before.
    """
    # Get the mappings in source project, in order to determine the useful columns
    custom_mapping = ProjectMapping().load_from_project(self.prj)
    obj_mapping = custom_mapping.object_mappings
    used_columns = set(obj_mapping.real_cols_to_tsv.keys())
    used_columns.add("orig_id")  # By safety
    # Create a DB writer
    writer = DBWriter(self.session)
    # Narrow the writes in ObjectFields thanks to mappings of original project
    writer.generators({"obj_field": used_columns})
    # Use import helpers
    dest_prj_id = self.dest_prj.projid
    import_how = ImportHow(prj_id=dest_prj_id, update_mode="No",
                           custom_mapping=ProjectMapping(),
                           skip_object_duplicates=False, loaded_files=[])
    # Get parent (enclosing) Sample, Acquisition, Process. There should be 0 in this context...
    import_how.existing_parents = InBundle.fetch_existing_parents(self.session, prj_id=dest_prj_id)
    self._clone_all(import_how, writer)
    # Copy mappings to destination. We could narrow them to the minimum?
    custom_mapping.write_to_project(self.dest_prj)
def do_real(self) -> None:
    """
        Do the real job, i.e. write everywhere (DB/filesystem)
    """
    loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
    logger.info("Previously loaded files: %s", loaded_files)
    found_users, taxo_found, col_mapping_dict, \
        nb_rows, source_path = self._load_vars_from_state(self.STATE_KEYS)
    # Save mappings straight away
    col_mapping = ProjectMapping().load_from_dict(col_mapping_dict)
    col_mapping.write_to_project(self.prj)
    self.session.commit()
    # TODO: Duplicated code
    source_bundle = InBundle(
        source_path, Path(self.temp_for_jobs.data_dir_for(self.job_id)))
    # Configure the import to come, destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(
        db_writer, self.vault,
        self.temp_for_jobs.base_dir_for(self.job_id))
    # Configure the import to come, directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, col_mapping,
                           self.req.skip_existing_objects, loaded_files)
    import_how.taxo_mapping = self.req.taxo_mappings
    import_how.found_taxa = taxo_found
    import_how.found_users = found_users
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    if self.req.skip_existing_objects:
        # If we must skip existing objects then do an inventory of what's already in the DB
        with CodeTimer("run: Existing images for %d: " % self.prj_id, logger):
            import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Do the bulk job of import
    rowcount_from_validate = nb_rows
    row_count = source_bundle.do_import(import_where, import_how,
                                        rowcount_from_validate,
                                        self.report_progress)
    # Update loaded files in DB, removing duplicates
    self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
    self.session.commit()
    # Recompute stats
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    msg = "Total of %d rows loaded" % row_count
    logger.info(msg)
    self.set_job_result(errors=[], infos={"rowcount": row_count})
def _collect_existing_and_validate(self, source_dir_or_zip, loaded_files) \
        -> Tuple[ImportHow, ImportDiagnostic, int]:
    """
        Prepare the import by checking what's inside the project and scanning files to input.
    """
    # The mapping to TSV custom columns, either empty or from previous import operations on same project.
    mapping = ProjectMapping().load_from_project(self.prj)
    # Source bundle construction
    bundle_temp_dir = Path(self.temp_for_jobs.data_dir_for(self.job_id))
    source_bundle = InBundle(source_dir_or_zip, bundle_temp_dir)
    # Configure the validation to come, directives.
    import_how = ImportHow(self.prj_id, self.req.update_mode, mapping,
                           self.req.skip_existing_objects, loaded_files)
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    # A structure to collect validation result
    import_diag = ImportDiagnostic()
    if not self.req.skip_existing_objects:
        with CodeTimer("collect_existing: Existing images for %d: " % self.prj_id, logger):
            import_diag.existing_objects_and_image = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
    # Do the bulk job of validation
    nb_rows = source_bundle.validate_import(
        import_how, import_diag, self.session,
        self.report_validation_progress)
    return import_how, import_diag, nb_rows
def get_free_fields(
        cls,
        mapped: Any,  # TODO: Should be 'MappedEntity'
        field_list: List[str],
        field_types: List[Any],
        field_absent_vals: List[Any]) -> List[Any]:
    """
        Get free fields _values_ for the mapped entity, inside the project.
        :param field_list: The names of the free columns for which value is returned
        :param field_types: The basic types e.g. int or float. Used as converters/verifiers e.g: float()
        :param field_absent_vals: The values used as marker for 'absent'. 'nan' for float.
    """
    assert len(field_list) == len(field_types) == len(field_absent_vals)
    mapping = ProjectMapping().load_from_project(cls.PROJECT_ACCESSOR(mapped))
    mapped_cols = getattr(mapping, cls.MAPPING_IN_PROJECT).find_tsv_cols(field_list)
    if len(mapped_cols) != len(field_list):
        raise TypeError("at least one of %s free column is absent" % field_list)
    # OK we have the real columns, get the values
    errs, ret = cls._get_values(mapped, mapped_cols, field_types, field_absent_vals)
    if len(errs) > 0:
        raise TypeError(",".join(errs))
    return list(ret.values())
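# Minimal self-contained sketch (toy data, not the project's API) of what the accessor above does:
# resolve user-visible free column names to real DB columns through the project mapping, then
# fetch and convert the values. The mapping and row below are hypothetical illustrations.
tsv_to_real = {"area": "n01", "perim.": "n02"}    # hypothetical mapping, TSV name -> DB column
row = {"n01": "120", "n02": "48.5"}               # hypothetical stored values for one mapped entity

def toy_get_free_fields(field_list, field_types):
    real_cols = [tsv_to_real[f] for f in field_list]                 # find_tsv_cols equivalent
    return [a_type(row[col]) for col, a_type in zip(real_cols, field_types)]

print(toy_get_free_fields(["area", "perim."], [float, float]))       # -> [120.0, 48.5]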
def get_computed_var(
        cls,
        mapped: Any,  # TODO: Should be 'MappedEntity'
        var: ProjectVar,
        mapping: Optional[ProjectMapping] = None,
        constants: Optional[Dict] = None) -> Any:
    """
        For given mapped entity, return the result of evaluating the formula (which returns a variable).
    """
    if mapping is None:
        mapping = ProjectMapping().load_from_project(cls.PROJECT_ACCESSOR(mapped))
    # Filter what is known in mapping.
    mapped_cols = getattr(mapping, cls.MAPPING_IN_PROJECT).find_tsv_cols(var.variable_names)
    types = [float] * len(mapped_cols)
    absent = [None] * len(mapped_cols)
    # Data extraction and formula evaluation
    _errs, var_vals = cls._get_values(mapped, mapped_cols,
                                      field_types=types,
                                      field_absent_vals=absent)
    if constants is not None:
        var_vals.update(constants)
    try:
        import math
        ret = eval(var.code, {"math": math}, var_vals)
        if not var.is_valid(ret):
            raise TypeError("Not valid %s: %s" % (var.formula, str(ret)))
    except Exception as e:
        # Basically anything can happen here
        raise TypeError(str(e))
    return ret
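# Minimal sketch of the evaluation pattern used above: a formula string is eval'd with only the
# 'math' module in globals and the extracted variable values as locals. The formula and values
# below are made-up placeholders, not the project's DefaultVars.
import math

var_vals = {"area": 120.0, "pixel_size": 0.01}    # hypothetical extracted values plus constants
code = "4.0 / 3.0 * math.pi * (math.sqrt(area / math.pi) * pixel_size) ** 3"    # hypothetical formula
result = eval(code, {"math": math}, var_vals)
print(result)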
def do_intra_step_1(self, loaded_files):
    # The mapping to custom columns, either empty or from previous import API_operations on same project.
    custom_mapping = ProjectMapping().load_from_project(self.prj)
    # Source bundle construction
    source_bundle = InBundle(
        self.source_dir_or_zip,
        Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Configure the validation to come, directives.
    import_how = ImportHow(self.prj_id, self.req.update_mode, custom_mapping,
                           self.req.skip_existing_objects, loaded_files)
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    # A structure to collect validation result
    import_diag = ImportDiagnostic()
    if not self.req.skip_existing_objects:
        with CodeTimer("do_intra_step_1: Existing images for %d: " % self.prj_id, logger):
            import_diag.existing_objects_and_image = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
    # Do the bulk job of validation
    nb_rows = source_bundle.validate_import(import_how, import_diag,
                                            self.session,
                                            self.report_progress)
    return import_how, import_diag, nb_rows
def enrich(self):
    """
        Add DB fields and relations as (hopefully more) meaningful attributes.
    """
    # Decode mappings to avoid exposing internal fields
    mappings = ProjectMapping().load_from_project(self._project)
    self.obj_free_cols = mappings.object_mappings.tsv_cols_to_real
    self.sample_free_cols = mappings.sample_mappings.tsv_cols_to_real
    self.acquisition_free_cols = mappings.acquisition_mappings.tsv_cols_to_real
    self.process_free_cols = mappings.process_mappings.tsv_cols_to_real
    # Decode the text list into a numerical one
    db_list = self._project.initclassiflist
    db_list = db_list if db_list else ""
    self.init_classif_list = [int(x) for x in db_list.split(",") if x.isdigit()]
    # Dispatch members by right
    by_right = {ProjectPrivilegeBO.MANAGE: self.managers,
                ProjectPrivilegeBO.ANNOTATE: self.annotators,
                ProjectPrivilegeBO.VIEW: self.viewers}
    a_priv: ProjectPrivilege
    # noinspection PyTypeChecker
    for a_priv in self._project.privs_for_members:
        if a_priv.user is None:
            # TODO: There is a line with NULL somewhere in DB
            continue
        if not a_priv.user.active:
            continue
        # noinspection PyTypeChecker
        by_right[a_priv.privilege].append(a_priv.user)
        if 'C' == a_priv.extra:
            self.contact = a_priv.user
    return self
def get_free_fields(cls, acquis: Acquisition,
                    fields_list: List[str]) -> List[Any]:
    """
        Get free fields _values_ for the acquisition.
    """
    mapping = ProjectMapping().load_from_project(acquis.sample.project)
    real_cols = mapping.acquisition_mappings.find_tsv_cols(fields_list)
    if len(real_cols) != len(fields_list):
        raise TypeError("free column not found")
    return [getattr(acquis, real_col) for real_col in real_cols]
def _apply_on_all(self, clazz: MappedTableTypeT, project: Project,
                  updates: List[ColUpdate]) -> int:
    """
        Apply all updates on all impacted rows by their ID.
    """
    prj_mappings = ProjectMapping().load_from_project(project)
    tbl_mappings = prj_mappings.by_table[clazz].tsv_cols_to_real
    clean_updates = self._sanitize_updates(clazz, tbl_mappings, updates)
    # Possibly nothing is left after filtering
    if len(clean_updates) == 0:
        return 0
    return self._do_updates(clazz, clean_updates)
def query(self, current_user_id: Optional[int],
          acquisition_id: AcquisitionIDT) -> Optional[AcquisitionBO]:
    ret = AcquisitionBO(self.ro_session, acquisition_id)
    if not ret.exists():
        return None
    assert ret.acquis is not None
    # Security check
    if current_user_id is None:
        project = RightsBO.anonymous_wants(self.ro_session, Action.READ,
                                           ret.acquis.sample.projid)
    else:
        _user, project = RightsBO.user_wants(self.session, current_user_id,
                                             Action.READ,
                                             ret.acquis.sample.projid)
    mappings = ProjectMapping().load_from_project(project)
    ret.map_free_columns(mappings.acquisition_mappings)
    return ret
def query(self, current_user_id: Optional[int],
          object_id: ObjectIDT) -> Optional[ObjectBO]:
    ret = ObjectBO(self.ro_session, object_id)
    if not ret.exists():
        return None
    # Security check
    projid = ret.header.acquisition.sample.projid
    if current_user_id is None:
        project = RightsBO.anonymous_wants(self.session, Action.READ, projid)
    else:
        _user, project = RightsBO.user_wants(self.session, current_user_id,
                                             Action.READ, projid)
    assert project is not None
    mappings = ProjectMapping().load_from_project(project)
    ret.map_free_columns(mappings.object_mappings)
    return ret
def query(self, current_user_id: Optional[UserIDT],
          sample_id: SampleIDT) -> Optional[SampleBO]:
    ret = SampleBO(self.ro_session, sample_id)
    if not ret.exists():
        return None
    assert ret.sample is not None
    assert ret.sample.projid is not None  # TODO: Why need this?
    # Security check
    if current_user_id is None:
        project = RightsBO.anonymous_wants(self.ro_session, Action.READ,
                                           ret.sample.projid)
    else:
        _user, project = RightsBO.user_wants(self.session, current_user_id,
                                             Action.READ, ret.sample.projid)
    mappings = ProjectMapping().load_from_project(project)
    ret.map_free_columns(mappings.sample_mappings)
    return ret
def do_import(self):
    """
        Do the real job, i.e. copy files while creating records.
    """
    errors = []
    self.manage_uploaded()
    self.unzip_if_needed()
    # Use a Bundle
    source_bundle = InBundle(
        self.source_dir_or_zip,
        Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Clean it, in case the ZIP contains a TSV
    source_bundle.remove_all_tsvs()
    images = source_bundle.list_image_files()
    # Configure the import to come, destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(
        db_writer, self.vault,
        self.temp_for_task.base_dir_for(self.task_id))
    # Configure the import to come, directives
    import_how = ImportHow(prj_id=self.prj_id, update_mode="",
                           custom_mapping=ProjectMapping(),
                           skip_object_duplicates=False, loaded_files=[])
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Generate TSV
    req_values = self.req.values
    if req_values.get(SimpleImportFields.userlb, ""):
        import_how.found_users["user"] = {
            "id": req_values.get(SimpleImportFields.userlb)
        }
        req_values[SimpleImportFields.userlb] = "user"
    if req_values.get(SimpleImportFields.status, ""):
        req_values[SimpleImportFields.status] = classif_qual.get(
            req_values[SimpleImportFields.status], "")
    self.make_tsv(source_bundle, images)
    # Import
    nb_image_files = len(images)
    nb_images = source_bundle.do_import(import_where, import_how,
                                        nb_image_files,
                                        self.report_progress)
    self.session.commit()
    # Recompute stats and so on
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    ret = SimpleImportRsp(errors=errors, nb_images=nb_images)
    return ret
def get_all_num_columns_values(self, session: Session):
    """
        Get all numerical free fields values for all objects in a project.
    """
    from DB.helpers.ORM import MetaData
    metadata = MetaData(bind=session.get_bind())
    # TODO: Cache in a member
    mappings = ProjectMapping().load_from_project(self._project)
    num_fields_cols = set([col for col in mappings.object_mappings.tsv_cols_to_real.values()
                           if col[0] == 'n'])
    obj_fields_tbl = minimal_table_of(metadata, ObjectFields,
                                      num_fields_cols, exact_floats=True)
    qry: Query = session.query(Project)
    qry = qry.join(Project.all_samples).join(Sample.all_acquisitions).join(Acquisition.all_objects)
    qry = qry.join(obj_fields_tbl, ObjectHeader.objid == obj_fields_tbl.c.objfid)
    qry = qry.filter(Project.projid == self._project.projid)
    qry = qry.order_by(Acquisition.acquisid)
    qry = qry.with_entities(Acquisition.acquisid, Acquisition.orig_id, obj_fields_tbl)
    return qry.all()
def dispatch_fields_by_table(custom_mapping: ProjectMapping,
                             field_set: Set) -> Dict[str, Set]:
    """
        Build a set of target DB columns by target table.
        :param custom_mapping:
        :param field_set:
        :return:
    """
    ret: Dict[str, Set] = {
        alias: set()
        for alias in GlobalMapping.TARGET_CLASSES.keys()
    }
    for a_field in field_set:
        mapping = GlobalMapping.PREDEFINED_FIELDS.get(a_field)
        if not mapping:
            mapping = custom_mapping.search_field(a_field)
            assert mapping is not None
        target_tbl = mapping["table"]
        target_fld = mapping["field"]
        ret[target_tbl].add(target_fld)
    return ret
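# Minimal sketch, on made-up data, of the dispatching above: predefined TSV columns resolve through
# a static mapping, free columns through the project's custom mapping, and the result groups real
# DB column names per target table. The table and column names below are hypothetical.
predefined = {"object_lat": {"table": "obj_head", "field": "latitude"}}    # stand-in for PREDEFINED_FIELDS
custom = {"area": {"table": "obj_field", "field": "n01"}}                  # stand-in for the custom mapping

def toy_dispatch(field_set):
    ret = {"obj_head": set(), "obj_field": set()}
    for a_field in field_set:
        mapping = predefined.get(a_field) or custom[a_field]
        ret[mapping["table"]].add(mapping["field"])
    return ret

print(toy_dispatch({"object_lat", "area"}))    # -> {'obj_head': {'latitude'}, 'obj_field': {'n01'}}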
def create_tsv(self, src_project: Project, end_progress: int) -> Tuple[int, int]:
    """
        Create the TSV file.
    """
    req = self.req
    proj_id = src_project.projid
    self.update_progress(1, "Start TSV export")
    progress_range = end_progress - 1
    # Get a fast count of the maximum of what to do
    count_sql = "SELECT SUM(nbr) AS cnt FROM projects_taxo_stat WHERE projid = :prj"
    res = self.ro_session.execute(text(count_sql), {"prj": proj_id})
    obj_count = res.first()[0]
    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.ro_session, proj_id, self.filters)
    # Backup or not, the column namings are taken from common mapping
    # @See Mapping.py
    # TSV column order
    # field_order = ["object_id", "object_lat", "object_lon", "object_date", "object_time", "object_depth_max",
    #                "object_annotation_status", "object_annotation_person_name", "object_annotation_person_email",
    #                "object_annotation_date", "object_annotation_time", "object_annotation_category"]
    # formats = {"object_date": "TO_CHAR({0},'YYYYMMDD')",
    #            "object_time": "TO_CHAR({0},'HH24MISS')",
    #            "object_annotation_date": "TO_CHAR({0},'YYYYMMDD')",
    #            "object_annotation_time": "TO_CHAR({0},'HH24MISS')",
    #            "object_annotation_status": """
    #                CASE {0}
    #                  WHEN 'V' then 'validated'
    #                  WHEN 'P' then 'predicted'
    #                  WHEN 'D' then 'dubious'
    #                  ELSE {0}
    #                END
    #            """
    #            }
    # prefices = {ObjectHeader.__tablename__: "obh",
    #             }
    # for a_fld in field_order:
    #     mpg = GlobalMapping.PREDEFINED_FIELDS[a_fld]
    #     mpg[""]
    #     assert a_fld in GlobalMapping.PREDEFINED_FIELDS, "%s is not a mapped column" % a_fld
    date_fmt, time_fmt = "YYYYMMDD", "HH24MISS"
    if req.format_dates_times:
        date_fmt, time_fmt = "YYYY-MM-DD", "HH24:MI:SS"
    select_clause = "select "
    if req.with_images or (req.exp_type == ExportTypeEnum.backup):
        select_clause += "img.orig_file_name AS img_file_name, img.imgrank AS img_rank"
        if req.with_images:
            select_clause += ", img.file_name AS img_src_path"
        select_clause += ",\n"
    select_clause += """obh.orig_id AS object_id, obh.latitude AS object_lat, obh.longitude AS object_lon,
                        TO_CHAR(obh.objdate,'{0}') AS object_date,
                        TO_CHAR(obh.objtime,'{1}') AS object_time,
                        obh.object_link, obh.depth_min AS object_depth_min, obh.depth_max AS object_depth_max,
                        CASE obh.classif_qual
                          WHEN 'V' then 'validated'
                          WHEN 'P' then 'predicted'
                          WHEN 'D' then 'dubious'
                          ELSE obh.classif_qual
                        END AS object_annotation_status,
                        usr.name AS object_annotation_person_name, usr.email AS object_annotation_person_email,
                        TO_CHAR(obh.classif_when,'{0}') AS object_annotation_date,
                        TO_CHAR(obh.classif_when,'{1}') AS object_annotation_time,
                        txo.display_name AS object_annotation_category
                     """.format(date_fmt, time_fmt)
    if req.exp_type == ExportTypeEnum.backup:
        select_clause += ", txo.id AS object_annotation_category_id"
    else:
        select_clause += "," + TaxonomyBO.parents_sql("obh.classif_id") + " AS object_annotation_hierarchy"
    if 'C' in req.tsv_entities:
        select_clause += "\n, obh.complement_info"
    # Deal with mappings, the goal is to emit SQL which will reconstitute the TSV structure
    src_mappings = ProjectMapping().load_from_project(src_project)
    if 'O' in req.tsv_entities:
        select_clause += "\n " + src_mappings.object_mappings.as_select_list("obf")
    if 'S' in req.tsv_entities:
        select_clause += "\n, sam.orig_id AS sample_id, sam.dataportal_descriptor AS sample_dataportal_descriptor "
        select_clause += src_mappings.sample_mappings.as_select_list("sam")
    if 'P' in req.tsv_entities:
        select_clause += "\n, prc.orig_id AS process_id "
        select_clause += src_mappings.process_mappings.as_select_list("prc")
    if 'A' in req.tsv_entities:
        select_clause += "\n, acq.orig_id AS acq_id, acq.instrument AS acq_instrument "
        select_clause += src_mappings.acquisition_mappings.as_select_list("acq")
    if req.exp_type == ExportTypeEnum.dig_obj_ident:
        select_clause += "\n, obh.objid"
    if req.with_internal_ids:
        select_clause += """\n, obh.objid,
                obh.acquisid AS processid_internal, obh.acquisid AS acq_id_internal,
                sam.sampleid AS sample_id_internal,
                obh.classif_id, obh.classif_who, obh.classif_auto_id, txp.name classif_auto_name,
                obh.classif_auto_score, obh.classif_auto_when,
                obh.random_value object_random_value, obh.sunpos object_sunpos """
        if 'S' in req.tsv_entities:
            select_clause += "\n, sam.latitude sample_lat, sam.longitude sample_long "
    # TODO: The condition on o.projid=1 in historical code below prevents any data production
    # if 'H' in req.tsv_entities:
    #     sql1 += " , oh.classif_date AS histoclassif_date, classif_type AS histoclassif_type, " \
    #             "to3.name histoclassif_name, oh.classif_qual histoclassif_qual, uo3.name histoclassif_who, " \
    #             "classif_score histoclassif_score"
    #     sql2 += """ LEFT JOIN (select o.objid, classif_date, classif_type, och.classif_id,
    #                                   och.classif_qual, och.classif_who, classif_score
    #                              from objectsclassifhisto och
    #                              join objects o on o.objid=och.objid and o.projid=1 {0}
    #                            union all
    #                            select o.objid, o.classif_when classif_date, 'C' classif_type, classif_id,
    #                                   classif_qual, classif_who, NULL
    #                              from objects o {0} where o.projid=1
    #                           ) oh on o.objid=oh.objid
    #                 LEFT JOIN taxonomy to3 on oh.classif_id=to3.id
    #                 LEFT JOIN users uo3 on oh.classif_who=uo3.id
    #             """.format(samplefilter)
    order_clause = OrderClause()
    if req.split_by == "sample":
        order_clause.add_expression("sam", "orig_id")
        split_field = "sample_id"  # AKA sam.orig_id, but renamed in select list
    elif req.split_by == "taxo":
        select_clause += "\n, txo.display_name AS taxo_parent_child "
        order_clause.add_expression(None, "taxo_parent_child")
        split_field = "taxo_parent_child"
    else:
        order_clause.add_expression("sam", "orig_id")
        split_field = "object_id"  # this value avoids errors further down, in r[split_field]
    order_clause.add_expression("obh", "objid")
    if req.with_images or (req.exp_type == ExportTypeEnum.backup):
        order_clause.add_expression(None, "img_rank")
    # Base SQL comes from filters
    from_, where, params = object_set.get_sql(
        self._get_owner_id(), order_clause, select_clause,
        all_images=not req.only_first_image)
    sql = select_clause + " FROM " + from_.get_sql() + where.get_sql() + order_clause.get_sql()
    logger.info("Execute SQL : %s" % sql)
    logger.info("Params : %s" % params)
    res = self.ro_session.execute(text(sql), params)

    now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
    self.out_file_name = "export_{0:d}_{1:s}.{2}".format(proj_id, now_txt, "zip")
    produced_path = self.out_path / self.out_file_name
    zfile = zipfile.ZipFile(produced_path, 'w',
                            allowZip64=True,
                            compression=zipfile.ZIP_DEFLATED)

    splitcsv = (req.split_by != "")
    csv_filename = 'data.tsv'  # Just a temp name as there is a rename while filling up the Zip
    if splitcsv:
        # Produce into the same temp file all the time, at zipping time the name in archive will vary
        prev_value = "NotAssigned"  # To trigger a sequence change immediately
    else:
        # The zip will contain a single TSV with same base name as the zip
        prev_value = self.out_file_name.replace('.zip', '')
    csv_path: Path = self.out_path / csv_filename  # Constant path to a (sometimes) changing file
    csv_fd: Optional[IO] = None
    csv_wtr = None
    # Store the images to save in a separate CSV. Useless if not exporting images but who cares.
    temp_img_file = self.out_path / "images.csv"
    img_file_fd = open(temp_img_file, 'w')
    img_wtr = csv.DictWriter(img_file_fd, ["src_path", "dst_path"],
                             delimiter='\t', quotechar='"', lineterminator='\n')
    img_wtr.writeheader()

    # Prepare TSV structure
    col_descs = [
        a_desc for a_desc in res.cursor.description
        if a_desc.name != "img_src_path"
    ]
    # Read the latitude column to get the float DB type
    for a_desc in col_descs:
        if a_desc.name == "object_lat":
            db_float_type = a_desc.type_code
            break
    else:
        raise
    float_cols = set()
    # Prepare float separator conversion, if not required the set will just be empty
    if req.coma_as_separator:
        for a_desc in col_descs:
            if a_desc.type_code == db_float_type:
                float_cols.add(a_desc.name)
    tsv_cols = [a_desc.name for a_desc in col_descs]
    tsv_types_line = {
        name: ('[f]' if a_desc.type_code == db_float_type else '[t]')
        for name, a_desc in zip(tsv_cols, col_descs)
    }

    nb_rows = 0
    nb_images = 0
    used_dst_pathes = set()
    for r in res:
        # Rows from SQLAlchemy are not mutable, so we need a clone for arranging values
        a_row = dict(r)
        if ((splitcsv and (prev_value != a_row[split_field]))  # At each split column values change
                or (nb_rows == 0)):  # And anyway for the first row
            # Start of sequence, eventually end of previous sequence
            if csv_fd:
                csv_fd.close()  # Close previous file
                self.store_csv_into_zip(zfile, prev_value, csv_path)
            if splitcsv:
                prev_value = a_row[split_field]
            logger.info("Writing into file %s", csv_path)
            csv_fd = open(csv_path, 'w', encoding='latin_1')
            csv_wtr = csv.DictWriter(csv_fd, tsv_cols, delimiter='\t',
                                     quotechar='"', lineterminator='\n',
                                     quoting=csv.QUOTE_NONNUMERIC)
            csv_wtr.writeheader()
            if req.exp_type == ExportTypeEnum.backup:
                # Write types line for backup type
                csv_wtr.writerow(tsv_types_line)
        if req.with_images:
            copy_op = {"src_path": a_row.pop("img_src_path")}
            if req.exp_type == ExportTypeEnum.dig_obj_ident:
                # Images will be stored in a per-category directory, but there is a single TSV at the Zip root
                categ = a_row['object_annotation_category']
                # All names cannot directly become directories
                a_row['img_file_name'] = self.get_DOI_imgfile_name(
                    a_row['objid'], a_row['img_rank'], categ, a_row['img_file_name'])
                copy_op["dst_path"] = a_row['img_file_name']
            else:
                # It's a backup
                # Images are stored in the Zip subdirectory per sample/taxo, i.e. at the same place as
                # their referring TSV
                dst_path = "{0}/{1}".format(prev_value, a_row['img_file_name'])
                if dst_path in used_dst_pathes:
                    # Avoid duplicates in zip as only the last entry will be present during unzip
                    # root cause: for UVP6 bundles, the vignette and original image are both stored
                    # with the same name.
                    img_with_rank = "{0}/{1}".format(a_row['img_rank'], a_row['img_file_name'])
                    a_row['img_file_name'] = img_with_rank  # write into TSV the corrected path
                    dst_path = prev_value + "/" + img_with_rank
                used_dst_pathes.add(dst_path)
                copy_op["dst_path"] = dst_path
            img_wtr.writerow(copy_op)
            nb_images += 1
        # Remove CR from comments
        if 'C' in req.tsv_entities and a_row['complement_info']:
            a_row['complement_info'] = ' '.join(a_row['complement_info'].splitlines())
        # Replace decimal separator
        for cname in float_cols:
            if a_row[cname] is not None:
                a_row[cname] = str(a_row[cname]).replace('.', ',')
        assert csv_wtr is not None
        # Produce the row in the TSV
        csv_wtr.writerow(a_row)
        nb_rows += 1
        if nb_rows % self.ROWS_REPORT_EVERY == 0:
            msg = "Row %d of max %d" % (nb_rows, obj_count)
            logger.info(msg)
            self.update_progress(1 + progress_range / obj_count * nb_rows, msg)
    if csv_fd:
        csv_fd.close()  # Close last file
        self.store_csv_into_zip(zfile, prev_value, csv_path)
    logger.info("Extracted %d rows", nb_rows)
    img_file_fd.close()
    if zfile:
        zfile.close()
    return nb_rows, nb_images
class JsonDumper(Service):
    """
        Dump in JSON form a project content, with filters.
            Mapped columns are rendered with their user-visible name (TSV one).
            The fields are reduced to 4(max)-letter names for saving bandwidth and DB fields independence.
        TODO as an option: No numeric primary or foreign key is present so the output can be diff-ed.
            The sub-entities are ordered by their 'natural' key, e.g. sample_id for samples.
    """

    def __init__(self, current_user: int, prj_id: int, filters: ProjectFilters):
        super().__init__()
        self.requester_id = current_user
        self.filters = filters
        prj = self.session.query(Project).get(prj_id)
        # We don't check for existence
        self.prj: Project = prj
        # Work vars
        self.mapping = ProjectMapping()
        if self.prj:
            self.mapping.load_from_project(self.prj)
        self.ids_to_dump: List[int] = []
        self.already_dumped: Set = set()
        self.first_query = True

    def run(self, out_stream: IO):
        """
            Produce the json into given stream.
        """
        if self.prj is None:
            to_stream = {}
        else:
            self._find_what_to_dump()
            # TODO: The result seems unneeded so far, we just need the stuff loaded in SQLA session
            _to_dump = self._db_fetch(self.ids_to_dump)
            # prj = to_dump[0][0]
            if len(self.ids_to_dump) == 0 or len(_to_dump) == 0:
                to_stream = {}
            else:
                to_stream = self.dump_row(out_stream, self.prj)
        json.dump(obj=to_stream, fp=out_stream, indent=" ")

    def dump_row(self, out_stream: IO, a_row: Model) -> Dict[str, Any]:
        """
            Dump inside returned value the fields and contained/linked entities from a_row.
        """
        ret: Dict = {}
        self._dump_into_dict(out_stream, a_row, ret)
        return ret

    def _dump_into_dict(self, out_stream: IO, a_row: Model, tgt_dict: Dict):
        """
            Dump inside the tgt_dict all fields and contained/linked entities from a_row.
        """
        # Ensure there is no infinite loop
        assert a_row not in self.already_dumped
        self.already_dumped.add(a_row)
        # Dump using instructions
        row_class = type(a_row)
        desc: JSONDesc = JSON_FIELDS[row_class]
        for a_field_or_relation, how in desc.items():
            fld_name = a_field_or_relation.key
            # This is where SQLAlchemy does all its magic when it's a relation
            attr = getattr(a_row, fld_name)
            if isinstance(attr, list):
                # Serialize the list of child entities, ordinary relationship
                children: List[Dict] = []
                tgt_dict[how] = children
                for a_child_row in sorted(attr):
                    child_obj = self.dump_row(out_stream, a_child_row)
                    children.append(child_obj)
            elif isinstance(attr, Model):
                # A twin object AKA 'uselist=False' relationship
                if isinstance(attr, Process):
                    # Keep process in single-entity list for now
                    child_obj = self.dump_row(out_stream, attr)
                    tgt_dict[how] = [child_obj]
                else:
                    # Dump into same output dict.
                    self._dump_into_dict(out_stream, attr, tgt_dict)
            else:
                # Ordinary field
                if attr is not None:
                    tgt_dict[how] = attr
        # Dump mapped fields if any
        tbl_mapping = self.mapping.by_table_name.get(row_class.__tablename__)
        if tbl_mapping is not None:
            for a_db_col, a_tsv_col in tbl_mapping.real_cols_to_tsv.items():
                attr = getattr(a_row, a_db_col)
                if attr is not None:
                    tgt_dict[a_tsv_col] = attr

    def _find_what_to_dump(self) -> None:
        """
            Determine the objects to dump.
        """
        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj.projid, self.filters)
        from_, where, params = object_set.get_sql(self.requester_id)
        sql = """ SELECT objid FROM """ + from_.get_sql() + where.get_sql()
        logger.info("SQL=%s", sql)
        logger.info("SQLParam=%s", params)
        with CodeTimer("Get IDs:", logger):
            res: ResultProxy = self.session.execute(sql, params)
        ids = [r['objid'] for r in res]
        logger.info("NB OBJIDS=%d", len(ids))
        self.ids_to_dump = ids

    def _db_fetch(self, objids: List[int]) -> List[DBObjectTupleT]:
        """
            Do a DB read of given objects, with auxiliary objects.
            Thanks to 'contains_eager' calls, the objects are loaded into SQLAlchemy session.
            :param objids:
            :return:
        """
        ret: Query = self.session.query(Project, Sample, Acquisition, Process,
                                        ObjectHeader, ObjectFields, Image)
        ret = ret.join(Sample, Project.all_samples).options(contains_eager(Project.all_samples))
        ret = ret.join(Acquisition, Sample.all_acquisitions)
        ret = ret.join(Process, Acquisition.process)
        ret = ret.join(ObjectHeader, Acquisition.all_objects)
        # Natural joins
        ret = ret.join(ObjectFields)
        ret = ret.join(Image, ObjectHeader.all_images).options(contains_eager(ObjectHeader.all_images))
        ret = ret.filter(ObjectHeader.objid == any_(objids))
        ret = ret.order_by(ObjectHeader.objid)
        ret = ret.order_by(Image.imgrank)
        if self.first_query:
            logger.info("Query: %s", str(ret))
            self.first_query = False
        with CodeTimer("Get Objects:", logger):
            objs = [an_obj for an_obj in ret.all()]
        # We get as many lines as images
        logger.info("NB ROWS JOIN=%d", len(objs))
        return objs
def __init__(self, prj_id: int, req: ImportRealReq):
    super().__init__(prj_id, req)
    # Transcode from serialized form
    self.custom_mapping = ProjectMapping().load_from_dict(req.mappings)
def aggregate_for_sample(
        self, sample: Sample) -> Dict[ClassifIDT, AggregForTaxon]:
    """
        Do the aggregations for the sample for each taxon and return them, they will become emofs
            - 'Abundance' -> CountOfBiologicalEntity -> count of objects group by taxon
            - 'Concentration' -> AbundancePerUnitVolumeOfTheWaterBody
                -> sum(individual_concentration) group by taxon
                   with individual_concentration = 1 / subsample_coef / total_water_volume
            - 'Biovolume' -> BiovolumeOfBiologicalEntity -> sum(individual_biovolume) group by taxon
                   with individual_biovolume = individual_volume / subsample_coef / total_water_volume
        The abundance can always be computed. The 2 other ones depend on availability of values
        for the project and the configuration variable.
    """
    # We return all per taxon.
    ret: Dict[ClassifIDT, EMODnetExport.AggregForTaxon] = {}

    count_per_taxon_per_acquis: Dict[AcquisitionIDT, Dict[ClassifIDT, int]] = {}

    # Start with abundances, simple count and giving its keys to the returned dict.
    acquis_for_sample = SampleBO.get_acquisitions(self.session, sample)
    for an_acquis in acquis_for_sample:
        # Get counts for acquisition (subsample)
        count_per_taxon_for_acquis: Dict[ClassifIDT, int] = AcquisitionBO.get_sums_by_taxon(
            self.session, an_acquis.acquisid)
        if self.auto_morpho:
            self.add_morpho_counts(count_per_taxon_for_acquis)
        count_per_taxon_per_acquis[an_acquis.acquisid] = count_per_taxon_for_acquis
        for an_id, count_4_acquis in count_per_taxon_for_acquis.items():
            aggreg_for_taxon = ret.get(an_id)
            if aggreg_for_taxon is None:
                ret[an_id] = self.AggregForTaxon(count_4_acquis, None, None)
            else:
                aggreg_for_taxon.abundance += count_4_acquis

    if not self.with_computations:
        return ret

    # Enrich with concentrations
    subsampling_coeff_per_acquis: Dict[AcquisitionIDT, float] = {}
    try:
        # Fetch calculation data at sample level
        sample_volume = SampleBO.get_computed_var(sample, DefaultVars.volume_sampled)
    except TypeError as e:
        self.warnings.append("Could not compute volume sampled from sample %s (%s),"
                             " no concentration or biovolume will be computed."
                             % (sample.orig_id, str(e)))
        sample_volume = -1
    if sample_volume > 0:
        # Cumulate for subsamples AKA acquisitions
        for an_acquis in acquis_for_sample:
            try:
                subsampling_coefficient = AcquisitionBO.get_computed_var(
                    an_acquis, DefaultVars.subsample_coeff)
                subsampling_coeff_per_acquis[an_acquis.acquisid] = subsampling_coefficient
            except TypeError as e:
                self.warnings.append("Could not compute subsampling coefficient from acquisition %s (%s),"
                                     " no concentration or biovolume will be computed"
                                     % (an_acquis.orig_id, str(e)))
                logger.info("concentrations: no subsample coeff for '%s' (%s)",
                            an_acquis.orig_id, str(e))
                continue
            # Get counts for acquisition (sub-sample)
            logger.info("computing concentrations for '%s'", an_acquis.orig_id)
            count_per_taxon_for_acquis = count_per_taxon_per_acquis[an_acquis.acquisid]
            for an_id, count_4_acquis in count_per_taxon_for_acquis.items():
                aggreg_for_taxon = ret[an_id]
                concentration_for_taxon = count_4_acquis / subsampling_coefficient / sample_volume
                if aggreg_for_taxon.concentration is None:
                    aggreg_for_taxon.concentration = 0
                aggreg_for_taxon.concentration += concentration_for_taxon

    # Enrich with biovolumes. This needs a computation for each object, so it's likely to be slow.
    if sample_volume > 0:
        # Mappings are constant for the sample
        # noinspection PyTypeChecker
        mapping = ProjectMapping().load_from_project(sample.project)
        # Cumulate for subsamples AKA acquisitions
        for an_acquis in acquis_for_sample:
            subsampling_coefficient = subsampling_coeff_per_acquis.get(an_acquis.acquisid)
            if subsampling_coefficient is None:
                logger.info("biovolumes: no subsample coeff for '%s'", an_acquis.orig_id)
                continue
            # Get pixel size from associated process, it's a constant for individual biovol computations
            try:
                pixel_size, = ProcessBO.get_free_fields(an_acquis.process,
                                                        ["particle_pixel_size_mm"],
                                                        [float],
                                                        [None])
            except TypeError as _e:
                logger.info("biovolumes: no pixel size for '%s'", an_acquis.orig_id)
                continue
            constants = {"pixel_size": pixel_size}
            # Get all objects for the acquisition. The filter on classif_id is useless for now.
            with CodeTimer("Objects IDs for '%s': " % an_acquis.orig_id, logger):
                acq_object_ids = AcquisitionBO.get_all_object_ids(
                    session=self.session,
                    acquis_id=an_acquis.acquisid,
                    classif_ids=list(ret.keys()))
            with CodeTimer("Objects for '%s': " % an_acquis.orig_id, logger):
                objects = ObjectBOSet(self.ro_session, acq_object_ids, mapping.object_mappings)
            nb_biovols = 0
            for an_obj in objects.all:
                # Compute a biovol if possible
                try:
                    biovol = ObjectBO.get_computed_var(
                        an_obj, DefaultVars.equivalent_ellipsoidal_volume,
                        mapping, constants)
                    biovol = -1  # Note: the ellipsoidal result is discarded here, forcing the spherical fallback below
                except TypeError as _e:
                    biovol = -1
                if biovol == -1:
                    try:
                        biovol = ObjectBO.get_computed_var(
                            an_obj, DefaultVars.equivalent_spherical_volume,
                            mapping, constants)
                    except TypeError as _e:
                        continue
                # Aggregate by category/taxon
                aggreg_for_taxon = ret[an_obj.classif_id]
                individual_biovolume = biovol / subsampling_coefficient / sample_volume
                if aggreg_for_taxon.biovolume is None:
                    aggreg_for_taxon.biovolume = 0
                aggreg_for_taxon.biovolume += individual_biovolume
                # Update stats
                nb_biovols += 1
            # A bit of display
            logger.info("%d biovolumes computed for '%s' out of %d objects",
                        nb_biovols, an_acquis.orig_id, len(acq_object_ids))
    return ret
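# Worked numeric sketch of the aggregation formulas from the docstring above, on made-up values:
# concentration = count / subsample_coef / total_water_volume, and each individual biovolume is
# individual_volume / subsample_coef / total_water_volume.
count_for_taxon = 42               # hypothetical: objects of one taxon in one acquisition
subsample_coef = 0.5               # hypothetical subsampling coefficient
total_water_volume = 10.0          # hypothetical sampled volume

concentration = count_for_taxon / subsample_coef / total_water_volume
print(concentration)               # -> 8.4

individual_volumes = [1.2e-3, 0.8e-3, 2.1e-3]    # hypothetical per-object volumes
biovolume = sum(v / subsample_coef / total_water_volume for v in individual_volumes)
print(biovolume)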
class MergeService(Service, LogEmitter):
    """
        Merge operation, move everything from source into destination project.
    """

    def __init__(self, prj_id: int, src_prj_id: int, dry_run: bool):
        super().__init__()
        # params
        self.prj_id = prj_id
        self.src_prj_id = src_prj_id
        self.dry_run = dry_run
        # work vars
        self.remap_operations: Dict[MappedTableTypeT, List[RemapOp]] = {}
        self.dest_augmented_mappings = ProjectMapping()

    def log_file_path(self) -> str:
        return "merge_%d_in_%d.log" % (self.prj_id, self.src_prj_id)

    def run(self, current_user_id: int) -> MergeRsp:
        with LogsSwitcher(self):
            return self.do_run(current_user_id)

    def do_run(self, current_user_id: int) -> MergeRsp:
        """
            Run the service, merge the projects.
            :return:
        """
        # Security check
        RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE, self.prj_id)
        RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE, self.src_prj_id)
        # OK
        prj = self.session.query(Project).get(self.prj_id)
        assert prj is not None
        src_prj = self.session.query(Project).get(self.src_prj_id)
        assert src_prj is not None

        logger.info("Validating Merge of '%s'", prj.title)
        ret = MergeRsp()
        errs = self._verify_possible(prj, src_prj)
        ret.errors = errs
        # Exit if errors or dry run
        if self.dry_run or len(errs) > 0:
            return ret

        logger.info("Remaps: %s", self.remap_operations)
        # Go for real if not dry run AND len(errs) == 0
        logger.info("Starting Merge of '%s'", prj.title)
        self._do_merge(prj)
        self.session.commit()

        # Recompute stats and so on
        ProjectBO.do_after_load(self.session, prj_id=self.prj_id)
        self.session.commit()
        return ret

    def _verify_possible(self, dest_prj: Project, src_prj: Project) -> List[str]:
        """
            Verify that the merge would not mean a loss of information.
                The mappings of src project should be preserved and copied into dest project.
                Augmented mappings should fit in the allowed maximum size for each entity.
            :param dest_prj:
            :param src_prj:
            :return: a list of problems, empty means we can proceed.
        """
        ret = []
        dest_mappings = ProjectMapping().load_from_project(dest_prj)
        src_mappings = ProjectMapping().load_from_project(src_prj)
        a_tbl: MappedTableTypeT
        for a_tbl in MAPPED_TABLES:
            mappings_for_dest_tbl = dest_mappings.by_table[a_tbl]
            mappings_for_src_tbl = src_mappings.by_table[a_tbl]
            # Compute the new mapping and eventual transformations to get there
            aug, remaps, errs = mappings_for_dest_tbl.augmented_with(mappings_for_src_tbl)
            ret.extend(errs)
            if len(remaps) > 0:
                self.remap_operations[a_tbl] = remaps
            # Load future mapping
            self.dest_augmented_mappings.by_table[a_tbl].load_from(aug)

        # Also check problems on consistency of unique orig_id
        dest_parents = InBundle.fetch_existing_parents(self.ro_session, prj_id=self.prj_id)
        src_parents = InBundle.fetch_existing_parents(self.ro_session, prj_id=self.src_prj_id)
        for an_orig_id_container in [Sample.__tablename__, Acquisition.__tablename__]:
            # key=orig_id value, value=full record
            dest_orig_ids = dest_parents[an_orig_id_container]
            src_orig_ids = src_parents[an_orig_id_container]
            common_orig_ids = set(dest_orig_ids.keys()).intersection(src_orig_ids.keys())
            if len(common_orig_ids) != 0:
                logger.info("Common %s orig_ids: %s", an_orig_id_container, common_orig_ids)
            for common_orig_id in common_orig_ids:
                orm_diff = orm_equals(dest_orig_ids[common_orig_id],
                                      src_orig_ids[common_orig_id])
                if orm_diff:
                    msg = ("Data conflict: %s record with orig_id '%s' is different in destination project: %s"
                           % (an_orig_id_container, common_orig_id, str(orm_diff)))
                    # TODO: Should be an error?
                    logger.warning(msg)
        return ret

    def _do_merge(self, dest_prj: Project):
        """
            Real merge operation.
        """
        # Loop over involved tables and remap free columns
        for a_mapped_tbl in MAPPED_TABLES:
            remaps = self.remap_operations.get(a_mapped_tbl)
            # Do the remappings if any
            if remaps is not None:
                logger.info("Doing re-mapping in %s: %s", a_mapped_tbl.__tablename__, remaps)
                ProjectBO.remap(self.session, self.src_prj_id, a_mapped_tbl, remaps)

        # Collect orig_id
        dest_parents = InBundle.fetch_existing_parents(self.ro_session, prj_id=self.prj_id)
        src_parents = InBundle.fetch_existing_parents(self.ro_session, prj_id=self.src_prj_id)

        # Compute needed projections in order to keep orig_id uniqueness
        common_samples = self.get_ids_for_common_orig_id(Sample, dest_parents, src_parents)
        common_acquisitions = self.get_ids_for_common_orig_id(Acquisition, dest_parents, src_parents)

        # Align foreign keys, to Project, Sample and Acquisition
        for a_fk_to_proj_tbl in [Sample, Acquisition, ObjectHeader, ParticleProject]:
            upd: Query = self.session.query(a_fk_to_proj_tbl)
            if a_fk_to_proj_tbl == Sample:
                # Move (i.e. change project) samples which are 'new' from merged project,
                #    so take all of them from src project...
                upd = upd.filter(a_fk_to_proj_tbl.projid == self.src_prj_id)  # type: ignore
                # ...but not the ones with same orig_id, which are presumably equal.
                upd = upd.filter(Sample.sampleid != all_(list(common_samples.keys())))
                # And update the column
                upd_values = {'projid': self.prj_id}
            elif a_fk_to_proj_tbl == Acquisition:
                # Acquisitions which were created, in source, under new samples, will 'follow'
                #    them during above move, thanks to the FK on acq_sample_id.
                # BUT some acquisitions were potentially created in source project, inside
                #    forked samples. They need to be attached to the dest (self) corresponding sample.
                if len(common_samples) > 0:
                    # Build a CTE with values for the update
                    smp_cte = values_cte("upd_smp", ("src_id", "dst_id"),
                                         [(k, v) for k, v in common_samples.items()])
                    smp_subqry = self.session.query(smp_cte.c.column2).filter(
                        smp_cte.c.column1 == Acquisition.acq_sample_id)
                    upd_values = {
                        'acq_sample_id': func.coalesce(
                            smp_subqry.scalar_subquery(),  # type: ignore
                            Acquisition.acq_sample_id)
                    }
                    upd = upd.filter(Acquisition.acq_sample_id == any_(list(common_samples.keys())))
                    # upd = upd.filter(Acquisition.acquisid != all_(list(common_acquisitions.keys())))
                if len(common_samples) == 0:
                    # Nothing to do. There were only new samples, all of them moved to self.
                    continue
            elif a_fk_to_proj_tbl == ObjectHeader:
                # Generated SQL looks like:
                # with upd_acq (src_id, dst_id) as (values (5,6), (7,8))
                # update obj_head
                #    set acquisid = coalesce((select dst_id from upd_acq where acquisid=src_id), acquisid)
                #  where acquisid in (select src_id from upd_acq)
                if len(common_acquisitions) > 0:
                    # Object must follow its acquisition
                    acq_cte = values_cte("upd_acq", ("src_id", "dst_id"),
                                         [(k, v) for k, v in common_acquisitions.items()])
                    acq_subqry = self.session.query(acq_cte.c.column2).filter(
                        acq_cte.c.column1 == ObjectHeader.acquisid)
                    upd_values = {
                        'acquisid': func.coalesce(
                            acq_subqry.scalar_subquery(),  # type:ignore
                            ObjectHeader.acquisid)
                    }
                    upd = upd.filter(ObjectHeader.acquisid == any_(list(common_acquisitions.keys())))
                if len(common_acquisitions) == 0:
                    # Nothing to do. There were only new acquisitions, all of them moved to self.
                    continue
            else:
                # For Particle project
                upd = upd.filter(ParticleProject.projid == self.src_prj_id)  # type: ignore
                upd_values = {'projid': self.prj_id}
            rowcount = upd.update(values=upd_values, synchronize_session=False)
            table_name = a_fk_to_proj_tbl.__tablename__  # type: ignore
            logger.info("Update in %s: %s rows", table_name, rowcount)

        # Acquisition & twin Process have followed their enclosing Sample

        # Remove the parents which are duplicate from orig_id point of view
        for a_fk_to_proj_tbl in [Acquisition, Sample]:
            to_del: Query = self.session.query(a_fk_to_proj_tbl)
            if a_fk_to_proj_tbl == Acquisition:
                # Remove conflicting acquisitions, they should be empty?
                to_del = to_del.filter(Acquisition.acquisid == any_(
                    list(common_acquisitions.keys())))  # type: ignore
            elif a_fk_to_proj_tbl == Sample:
                # Remove conflicting samples
                to_del = to_del.filter(Sample.sampleid == any_(
                    list(common_samples.keys())))  # type: ignore
            rowcount = to_del.delete(synchronize_session=False)
            table_name = a_fk_to_proj_tbl.__tablename__  # type: ignore
            logger.info("Delete in %s: %s rows", table_name, rowcount)

        self.dest_augmented_mappings.write_to_project(dest_prj)

        ProjectPrivilegeBO.generous_merge_into(self.session, self.prj_id, self.src_prj_id)

        # Completely erase the source project
        ProjectBO.delete(self.session, self.src_prj_id)

    @staticmethod
    def get_ids_for_common_orig_id(a_parent_class, dest_parents, src_parents) -> \
            Dict[Union[SampleIDT, AcquisitionIDT], Union[SampleIDT, AcquisitionIDT]]:
        """
            Return a link between IDs for resolving colliding orig_id.
            E.g. sample 'moose2015_ge_leg2_026' is present in source with ID 15482 and also
                 in destination with ID 84678 -> return {15482: 84678}, to read 15482->84678
            :param a_parent_class: Sample/Acquisition
            :param dest_parents:
            :param src_parents:
            :return:
        """
        ret = {}
        dst_orig_ids = dest_parents[a_parent_class.__tablename__]
        src_orig_ids = src_parents[a_parent_class.__tablename__]
        common_orig_ids = set(dst_orig_ids.keys()).intersection(src_orig_ids.keys())
        for a_common_orig_id in common_orig_ids:
            src_orig_id = src_orig_ids[a_common_orig_id].pk()
            dst_orig_id = dst_orig_ids[a_common_orig_id].pk()
            ret[src_orig_id] = dst_orig_id
        return ret