def generous_merge_into(cls, session: Session, dest_prj_id: int, src_prj_id: int):
    """
    Merge privileges from the source project into the destination project.
    """
    # Each user who is present in both projects gets the highest privilege from both projects.
    # TODO: Arguable
    sql = text("""
    UPDATE projectspriv ppdst
       SET privilege = CASE WHEN 'Manage' IN (ppsrc.privilege, ppdst.privilege) THEN 'Manage'
                            WHEN 'Annotate' IN (ppsrc.privilege, ppdst.privilege) THEN 'Annotate'
                            ELSE 'View'
                       END
      FROM projectspriv ppsrc
     WHERE ppsrc.projid = :src_prj
       AND ppdst.projid = :dst_prj
       AND ppsrc.member = ppdst.member""")
    session.execute(sql, {"dst_prj": dest_prj_id, "src_prj": src_prj_id})
    # Users who were only in the source project get their privileges transferred into the destination
    # TODO: Arguable
    sql = text("""
    UPDATE projectspriv
       SET projid = :dst_prj
     WHERE projid = :src_prj
       AND member NOT IN (SELECT member FROM projectspriv WHERE projid = :dst_prj)""")
    session.execute(sql, {"dst_prj": dest_prj_id, "src_prj": src_prj_id})
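# The CASE expression above encodes a simple precedence rule: Manage over Annotate over View.
# A minimal pure-Python sketch of the same rule, handy for a unit test. The PRIVILEGE_ORDER list
# and highest_privilege helper are illustrations only, not part of the codebase.
#
#   PRIVILEGE_ORDER = ["View", "Annotate", "Manage"]  # weakest to strongest
#
#   def highest_privilege(a: str, b: str) -> str:
#       # Return the stronger of the two privileges, as the UPDATE above does per member.
#       return max(a, b, key=PRIVILEGE_ORDER.index)
#
#   assert highest_privilege("Annotate", "Manage") == "Manage"
#   assert highest_privilege("View", "Annotate") == "Annotate"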
def incremental_update_taxo_stats(cls, session: Session, prj_id: int, collated_changes: Dict):
    """
    Do not recompute the full stats for a project (which can be long).
    Instead, apply deltas, because in this context we know them.
    TODO: All SQL to SQLAlchemy form
    """
    needed_ids = list(collated_changes.keys())
    # Lock taxo lines to prevent re-entering; during validation it's often a handful of them.
    pts_sql = """SELECT id
                   FROM taxonomy
                  WHERE id = ANY(:ids)
                    FOR NO KEY UPDATE"""
    session.execute(text(pts_sql), {"ids": needed_ids})
    # Lock the rows we are going to update, including -1 for unclassified
    pts_sql = """SELECT id, nbr
                   FROM projects_taxo_stat
                  WHERE projid = :prj
                    AND id = ANY(:ids)
                    FOR NO KEY UPDATE"""
    res = session.execute(text(pts_sql), {"prj": prj_id, "ids": needed_ids})
    ids_in_db = {classif_id: nbr for (classif_id, nbr) in res.fetchall()}
    ids_not_in_db = set(needed_ids).difference(ids_in_db.keys())
    if len(ids_not_in_db) > 0:
        # Insert rows for missing IDs
        pts_ins = """INSERT INTO projects_taxo_stat(projid, id, nbr, nbr_v, nbr_d, nbr_p)
                     SELECT :prj, COALESCE(obh.classif_id, -1),
                            COUNT(*) nbr,
                            COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
                            COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d,
                            COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p
                       FROM obj_head obh
                       JOIN acquisitions acq ON acq.acquisid = obh.acquisid
                       JOIN samples sam ON sam.sampleid = acq.acq_sample_id AND sam.projid = :prj
                      WHERE COALESCE(obh.classif_id, -1) = ANY(:ids)
                      GROUP BY obh.classif_id"""
        session.execute(text(pts_ins), {'prj': prj_id, 'ids': list(ids_not_in_db)})
    # Apply delta
    for classif_id, chg in collated_changes.items():
        if classif_id in ids_not_in_db:
            # The line was created just above, with OK values
            continue
        if ids_in_db[classif_id] + chg['n'] == 0:
            # The delta means 0 for this taxon in this project, delete the line
            sqlparam = {'prj': prj_id, 'cid': classif_id}
            ts_sql = """DELETE FROM projects_taxo_stat
                         WHERE projid = :prj AND id = :cid"""
        else:
            # General case
            sqlparam = {'prj': prj_id, 'cid': classif_id,
                        'nul': chg['n'], 'val': chg['V'], 'dub': chg['D'], 'prd': chg['P']}
            ts_sql = """UPDATE projects_taxo_stat
                           SET nbr=nbr+:nul, nbr_v=nbr_v+:val, nbr_d=nbr_d+:dub, nbr_p=nbr_p+:prd
                         WHERE projid = :prj AND id = :cid"""
        session.execute(text(ts_sql), sqlparam)
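# For reference, collated_changes is keyed by category id (-1 for unclassified) and carries the
# per-state deltas read above ('n', 'V', 'D', 'P'). A hypothetical example with made-up ids and
# counts, assuming the method lives on a ProjectBO-like class:
#
#   # 3 unclassified objects were just predicted as taxon 4567
#   collated_changes = {
#       4567: {'n': +3, 'V': 0, 'D': 0, 'P': +3},  # taxon 4567 gains 3 objects, all Predicted
#       -1:   {'n': -3, 'V': 0, 'D': 0, 'P': 0},   # the unclassified bucket (-1) loses those 3
#   }
#   ProjectBO.incremental_update_taxo_stats(session, prj_id=123, collated_changes=collated_changes)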
def historize_classification(self, only_qual=None, manual=True):
    """
    Copy current classification information into the history table, for all rows in self.
    :param only_qual: If set, only historize current rows having these classification qualifications.
    :param manual: If set, historize manual entries, otherwise pick automatic ones.
    """
    # Lighten a bit the SQLA expressions
    oh = ObjectHeader
    och = ObjectsClassifHisto
    if manual:
        # What we want to historize, as a subquery
        sel_subqry = select([oh.objid, oh.classif_when, text("'M'"),
                             oh.classif_id, oh.classif_qual, oh.classif_who])
        if only_qual is not None:
            qual_cond = oh.classif_qual.in_(only_qual)
        else:
            qual_cond = true()
        sel_subqry = sel_subqry.where(and_(oh.objid == any_(self.object_ids),
                                           oh.classif_when.isnot(None),
                                           qual_cond))
        ins_columns = [och.objid, och.classif_date, och.classif_type,
                       och.classif_id, och.classif_qual, och.classif_who]
    else:
        # What we want to historize, as a subquery
        sel_subqry = select([oh.objid, oh.classif_auto_when, text("'A'"),
                             oh.classif_auto_id, oh.classif_qual, oh.classif_auto_score])
        sel_subqry = sel_subqry.where(and_(oh.objid == any_(self.object_ids),
                                           oh.classif_auto_id.isnot(None),
                                           oh.classif_auto_when.isnot(None)))
        ins_columns = [och.objid, och.classif_date, och.classif_type,
                       och.classif_id, och.classif_qual, och.classif_score]
    # Insert into the log table
    ins_qry: Insert = pg_insert(och.__table__)
    ins_qry = ins_qry.from_select(ins_columns, sel_subqry)
    ins_qry = ins_qry.on_conflict_do_nothing(constraint='objectsclassifhisto_pkey')
    # TODO: mypy crashes due to pg_dialect below
    # logger.info("Histo query: %s", ins_qry.compile(dialect=pg_dialect()))
    nb_objs = self.session.execute(ins_qry).rowcount
    logger.info(" %d out of %d rows copied to log", nb_objs, len(self.object_ids))
    return oh
def remap(session: Session, prj_id: int, table: MappedTableTypeT, remaps: List[RemapOp]):
    """
    Apply remapping operations onto the given table for the given project.
    """
    # Do the remapping, including blanking of unused columns
    values = {a_remap.to: text(a_remap.frm) if a_remap.frm is not None else a_remap.frm
              for a_remap in remaps}
    qry: Query = session.query(table)
    samples_4_prj: Query
    acqs_4_samples: Query
    if table == Sample:
        qry = qry.filter(Sample.projid == prj_id)  # type: ignore
    elif table == Acquisition:
        samples_4_prj = Query(Sample.sampleid).filter(Sample.projid == prj_id)
        qry = qry.filter(Acquisition.acq_sample_id.in_(samples_4_prj))  # type: ignore
    elif table == Process:
        samples_4_prj = Query(Sample.sampleid).filter(Sample.projid == prj_id)
        acqs_4_samples = Query(Acquisition.acquisid).filter(Acquisition.acq_sample_id.in_(samples_4_prj))
        qry = qry.filter(Process.processid.in_(acqs_4_samples))  # type: ignore
    elif table == ObjectFields:
        samples_4_prj = Query(Sample.sampleid).filter(Sample.projid == prj_id)
        acqs_4_samples = Query(Acquisition.acquisid).filter(Acquisition.acq_sample_id.in_(samples_4_prj))
        objs_for_acqs: Query = Query(ObjectHeader.objid).filter(ObjectHeader.acquisid.in_(acqs_4_samples))
        qry = qry.filter(ObjectFields.objfid.in_(objs_for_acqs))  # type: ignore
    qry = qry.update(values=values, synchronize_session=False)
    logger.info("Remap query for %s: %s", table.__tablename__, qry)
def read_taxo_stats(session: Session, prj_ids: ProjectIDListT,
                    taxa_ids: Union[str, ClassifIDListT]) -> List[ProjectTaxoStats]:
    sql = """
    SELECT pts.projid, ARRAY_AGG(pts.id) as ids,
           SUM(CASE WHEN pts.id = -1 THEN pts.nbr ELSE 0 END) as nb_u,
           SUM(pts.nbr_v) as nb_v, SUM(pts.nbr_d) as nb_d, SUM(pts.nbr_p) as nb_p
      FROM projects_taxo_stat pts
     WHERE pts.projid = ANY(:ids)"""
    params: Dict[str, Any] = {'ids': prj_ids}
    if len(taxa_ids) > 0:
        if taxa_ids == 'all':
            pass
        else:
            sql += " AND pts.id = ANY(:tids)"
            params["tids"] = taxa_ids
    sql += """
    GROUP BY pts.projid"""
    if len(taxa_ids) > 0:
        sql += ", pts.id"
    res: Result = session.execute(text(sql), params)
    with CodeTimer("stats for %d projects:" % len(prj_ids), logger):
        ret = [ProjectTaxoStats(rec) for rec in res.fetchall()]
    for a_stat in ret:
        a_stat.used_taxa.sort()
    return ret
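# A usage sketch showing the three accepted shapes of taxa_ids; project and taxon ids are made up:
#
#   # Per-project totals only (no taxon filter, no per-taxon grouping)
#   stats = read_taxo_stats(session, prj_ids=[101, 102], taxa_ids=[])
#   # Per-taxon rows for all taxa of the projects
#   stats = read_taxo_stats(session, prj_ids=[101, 102], taxa_ids='all')
#   # Per-taxon rows, restricted to two categories
#   stats = read_taxo_stats(session, prj_ids=[101], taxa_ids=[4567, 8901])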
def get_bounding_geo(cls, session: Session, project_ids: ProjectIDListT) -> Iterable[float]:
    # TODO: Why use the view?
    sql = ("SELECT min(o.latitude), max(o.latitude), min(o.longitude), max(o.longitude)"
           "  FROM objects o"
           " WHERE o.projid = ANY(:prj)")
    res: Result = session.execute(text(sql), {"prj": project_ids})
    vals = res.first()
    assert vals
    return [a_val for a_val in vals]
def get_date_range(cls, session: Session, project_ids: ProjectIDListT) -> Iterable[datetime]:
    # TODO: Why use the view?
    sql = ("SELECT min(o.objdate), max(o.objdate)"
           "  FROM objects o"
           " WHERE o.projid = ANY(:prj)")
    res: Result = session.execute(text(sql), {"prj": project_ids})
    vals = res.first()
    assert vals
    return [a_val for a_val in vals]
def create_or_link_slaves(how: ImportHow, session: Session,
                          object_head_to_write, object_fields_to_write,
                          image_to_write) -> int:
    """
    Create, link or update slave entities, i.e. head, fields, image.
    Also update them... TODO: Split/fork the def
    :returns the number of new records
    """
    if object_head_to_write.orig_id in how.existing_objects:
        # Set the objid which will be copied for storing the image; the object itself
        # will not be stored, due to the returned value.
        objid = how.existing_objects[object_head_to_write.orig_id]
        object_head_to_write.objid = objid
        if how.can_update_only:
            # noinspection DuplicatedCode
            for a_cls, its_pk, an_upd in zip([ObjectHeader, ObjectFields],
                                             ['objid', 'objfid'],
                                             [object_head_to_write, object_fields_to_write]):
                filter_for_id = text("%s=%d" % (its_pk, objid))
                # Fetch the record to update
                obj = session.query(a_cls).filter(filter_for_id).first()
                if a_cls == ObjectHeader:
                    # Eventually refresh sun position
                    if an_upd.nb_fields_from(USED_FIELDS_FOR_SUNPOS) > 0:
                        # Give the bean enough data for computation
                        for a_field in USED_FIELDS_FOR_SUNPOS.difference(an_upd.keys()):
                            an_upd[a_field] = getattr(obj, a_field)
                        TSVFile.do_sun_position_field(an_upd)
                updates = TSVFile.update_orm_object(obj, an_upd)  # type: ignore
                if len(updates) > 0:
                    logger.info("Updating '%s' using %s", filter_for_id, updates)
                    session.flush()
            ret = 0  # nothing to write
        else:
            # 'Simply' a line with a complementary image
            logger.info("One more image for %s:%s ", object_head_to_write.orig_id, image_to_write)
            ret = 1  # just a new image
    else:
        if how.can_update_only:
            # No object creation while updating
            logger.info("Object %s not found while updating ", object_head_to_write.orig_id)
            ret = 0
        else:
            # Or create it
            # object_head_to_write.projid = how.prj_id
            object_head_to_write.random_value = random.randint(1, 99999999)
            # Below left NULL @see self.update_counts_and_img0
            # object_head_to_write.img0id = XXXXX
            ret = 3  # new image + new object_head + new object_fields
    return ret
def create_summary(self, src_project: Project):
    req = self.req
    proj_id = src_project.projid
    self.update_progress(1, "Start Summary export")
    now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
    self.out_file_name = "export_summary_{0:d}_{1:s}.tsv".format(src_project.projid, now_txt)
    out_file = self.temp_for_jobs.base_dir_for(self.job_id) / self.out_file_name
    # Prepare a where clause and parameters from the filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.ro_session, proj_id, self.filters)
    # By default, select (and group by) the unambiguous category name
    sels = ["txo.display_name"]
    if self.req.sum_subtotal == "A":
        sels[:0] = ["acq.orig_id"]
    elif self.req.sum_subtotal == "S":
        sels[:0] = ["sam.orig_id", "sam.latitude", "sam.longitude", "MAX(obh.objdate) AS date"]
    sels.append("COUNT(*) AS nbr")
    select_clause = "SELECT " + ", ".join(sels)
    not_aggregated = [a_sel for a_sel in sels if " " not in a_sel]
    group_clause = " GROUP BY " + ", ".join(not_aggregated)
    order_clause = OrderClause()
    for a_sel in not_aggregated:
        alias, col = a_sel.split(".")
        order_clause.add_expression(alias, col)
    # Base SQL comes from filters
    from_, where, params = object_set.get_sql(self._get_owner_id(), order_clause, select_clause)
    sql = select_clause + " FROM " + from_.get_sql() + where.get_sql() + group_clause + order_clause.get_sql()
    logger.info("Execute SQL : %s", sql)
    logger.info("Params : %s", params)
    res = self.ro_session.execute(text(sql), params)
    msg = "Creating file %s" % out_file
    logger.info(msg)
    self.update_progress(50, msg)
    nb_lines = self.write_result_to_csv(res, out_file)
    msg = "Extracted %d rows" % nb_lines
    logger.info(msg)
    self.update_progress(90, msg)
    return nb_lines
def get_sums_by_taxon(cls, session: Session, acquis_id: AcquisitionIDT) \
        -> Dict[ClassifIDT, int]:
    sql = text("SELECT o.classif_id, count(1)"
               "  FROM obj_head o"
               " WHERE o.acquisid = :acq"
               "   AND o.classif_id IS NOT NULL"
               "   AND o.classif_qual = 'V'"
               " GROUP BY o.classif_id")
    res: Result = session.execute(sql, {"acq": acquis_id})
    return {int(classif_id): int(cnt) for (classif_id, cnt) in res.fetchall()}
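# A usage sketch; the owning class name (AcquisitionBO) and the ids are assumptions:
#
#   # Validated-object counts per category for one acquisition, e.g. {4567: 120, 8901: 3}
#   sums = AcquisitionBO.get_sums_by_taxon(session, acquis_id=55)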
def fetch_existing_images(session: Session, prj_id):
    """
    Get all object/image pairs from the project.
    """
    # Must be reloaded from DB, as phase 1 added all objects for duplicates checking
    # TODO: Why use the view?
    sql = text("SELECT concat(o.orig_id,'*',i.orig_file_name)"
               "  FROM images i"
               "  JOIN objects o ON i.objid = o.objid"
               " WHERE o.projid = :prj")
    res: Result = session.execute(sql, {"prj": prj_id})
    ret = {img_id for img_id, in res}
    return ret
def update_taxo_stats(session: Session, projid: int):
    sql = text("""
    DELETE FROM projects_taxo_stat pts
     WHERE pts.projid = :prjid;
    INSERT INTO projects_taxo_stat(projid, id, nbr, nbr_v, nbr_d, nbr_p)
    SELECT sam.projid, COALESCE(obh.classif_id, -1) id,
           COUNT(*) nbr,
           COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
           COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d,
           COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p
      FROM obj_head obh
      JOIN acquisitions acq ON acq.acquisid = obh.acquisid
      JOIN samples sam ON sam.sampleid = acq.acq_sample_id AND sam.projid = :prjid
     GROUP BY sam.projid, obh.classif_id;""")
    session.execute(sql, {'prjid': projid})
def update_stats(session: Session, projid: int):
    sql = text("""
    UPDATE projects
       SET objcount = q.nbr_sum,
           pctclassified = 100.0 * nbrclassified / q.nbr_sum,
           pctvalidated = 100.0 * nbrvalidated / q.nbr_sum
      FROM projects p
      LEFT JOIN (SELECT projid, SUM(nbr) nbr_sum,
                        SUM(CASE WHEN id > 0 THEN nbr END) nbrclassified,
                        SUM(nbr_v) nbrvalidated
                   FROM projects_taxo_stat
                  WHERE projid = :prjid
                  GROUP BY projid) q ON p.projid = q.projid
     WHERE projects.projid = :prjid
       AND p.projid = :prjid""")
    session.execute(sql, {'prjid': projid})
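# The two refresh functions are typically chained: rebuild the per-taxon rows first, then derive
# the project-level counters and percentages from them. A minimal sketch; the ProjectBO class
# name, the project id and the commit placement are assumptions:
#
#   ProjectBO.update_taxo_stats(session, projid=123)  # rebuild projects_taxo_stat for the project
#   ProjectBO.update_stats(session, projid=123)       # recompute objcount / pctclassified / pctvalidated
#   session.commit()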
def read_taxo_stats(self) -> List[SampleTaxoStats]:
    sql = text("""
    SELECT sam.sampleid,
           ARRAY_AGG(DISTINCT COALESCE(obh.classif_id, -1)) as ids,
           SUM(CASE WHEN obh.classif_id <> -1 THEN 0 ELSE 1 END) as nb_u,
           COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
           COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d,
           COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p
      FROM obj_head obh
      JOIN acquisitions acq ON acq.acquisid = obh.acquisid
      JOIN samples sam ON sam.sampleid = acq.acq_sample_id
     WHERE sam.sampleid = ANY(:ids)
     GROUP BY sam.sampleid;""")
    with CodeTimer("Stats for %d samples: " % len(self.ids), logger):
        res = self.session.execute(sql, {'ids': self.ids})
        ret = [SampleTaxoStats(rec) for rec in res]
    return ret
def __init__(self, session: Session, project_ids: ProjectIDListT):
    qry: Query = session.query(Acquisition.instrument)
    qry = qry.join(Sample).join(Project)
    # TODO: Clumsy, this is just for adding a column to the select
    qry = qry.add_columns(text(Project.__table__.name + "." + Project.__table__.c.projid.name))
    # Below, SQLAlchemy complains
    # qry = qry.add_columns(Project.projid)
    if len(project_ids) > 0:
        qry = qry.filter(Project.projid.in_(project_ids))
    qry = qry.distinct()
    instruments_by_proj: Dict[ProjectIDT, Set[InstrumentIDT]] = {}
    instrument_names = set()
    for ins_name, projid in qry.all():
        if ins_name:
            instruments_by_proj.setdefault(projid, set()).add(ins_name)
            instrument_names.add(ins_name)
        else:
            # Filter out NULL & empty strings
            pass
    self.by_project = instruments_by_proj
    self.instrument_names = sorted(list(instrument_names))
def classify_validate(self, user_id: UserIDT, classif_ids: ClassifIDListT, wanted_qualif: str) \
        -> Tuple[int, Dict[Tuple, ObjectIDListT]]:
    """
    Set current classifications in self and/or validate the current classification.
    :param user_id: The User who did these changes.
    :param classif_ids: One category id for each of the object ids in self. -1 means "keep current".
    :param wanted_qualif: Validated or Dubious.
    :returns the number of updated rows and a summary of changes, for MRU and logging.
    """
    # Gather the state of classification, for impacted objects, before the change. Keep a lock on rows.
    present = self._fetch_classifs_and_lock()
    # Cook a diff b/w present and wanted values, both for the update of obj_head and preparing the ones on _stat
    # Group the updates, as lots of them are identical
    updates: Dict[Tuple, EnumeratedObjectSet] = {}
    all_changes: OrderedDict[Tuple, List[int]] = OrderedDict()
    # A bit of obsessive optimization
    classif_id_col = ObjectHeader.classif_id.name
    classif_qual_col = ObjectHeader.classif_qual.name
    classif_who_col = ObjectHeader.classif_who.name
    classif_when_col = ObjectHeader.classif_when.name
    for obj_id, v in zip(self.object_ids, classif_ids):
        prev_obj = present[obj_id]
        prev_classif_id: Optional[int] = prev_obj['classif_id']
        new_classif_id: Optional[int]
        if v == -1:  # Special value from "validate all"
            # Arrange that no change can happen for this field
            # Note: prev_classif_id can be None
            new_classif_id = prev_classif_id
        else:
            new_classif_id = v
        prev_classif_qual = prev_obj['classif_qual']
        if (prev_classif_id == new_classif_id
                and prev_classif_qual == wanted_qualif
                and prev_obj['classif_who'] == user_id):
            continue
        # There was at least 1 field change for this object
        an_update = updates.setdefault((new_classif_id, wanted_qualif),
                                       EnumeratedObjectSet(self.session, []))
        an_update.add_object(obj_id)
        # Compact changes, grouped by operation
        change_key = (prev_classif_id, prev_classif_qual, new_classif_id, wanted_qualif)
        for_this_change = all_changes.setdefault(change_key, [])
        for_this_change.append(obj_id)
        # Keep the most recently used first
        all_changes.move_to_end(change_key, last=False)
    if len(updates) == 0:
        # Nothing to do
        return 0, all_changes
    # Update of obj_head, grouped by similar operations.
    nb_updated = 0
    sql_now = text("now()")
    for (new_classif_id, wanted_qualif), an_obj_set in updates.items():
        # Historize the updated rows (can be a lot!)
        an_obj_set.historize_classification()
        row_upd = {classif_id_col: new_classif_id,
                   classif_qual_col: wanted_qualif,
                   classif_who_col: user_id,
                   classif_when_col: sql_now}
        # Do the update itself
        nb_updated += an_obj_set.update_all(row_upd)
    logger.info("%d rows updated in %d queries", nb_updated, len(updates))
    # Return statuses
    return nb_updated, all_changes
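# A hedged usage sketch: validate three objects while keeping the current category for the second
# one via the -1 sentinel. The object ids, user id and the 'V' literal for "Validated" are
# assumptions for illustration; the object set is built as elsewhere in this module:
#
#   obj_set = EnumeratedObjectSet(session, [1001, 1002, 1003])
#   nb_upd, changes = obj_set.classify_validate(user_id=42,
#                                               classif_ids=[4567, -1, 8901],  # -1 keeps current classif_id
#                                               wanted_qualif='V')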
def classify_auto(self, classif_ids: ClassifIDListT, scores: List[float], keep_logs: bool) \
        -> Tuple[int, Dict[Tuple, ObjectIDListT]]:
    """
    Set automatic classifications in self.
    :param classif_ids: One category id for each of the object ids in self.
    :param scores: One confidence score for each object, from the automatic classification algorithm.
    :param keep_logs: Self-explanatory.
    :returns the number of updated rows and a summary of changes, for stats.
    """
    # Gather the state of classification, for impacted objects, before the change. Keep a lock on rows.
    prev = self._fetch_classifs_and_lock()
    # Cook a diff b/w present and wanted values, both for the update of obj_head and preparing the ones on _stat
    # updates: Dict[Tuple, EnumeratedObjectSet] = {}
    all_changes: OrderedDict[Tuple, List[int]] = OrderedDict()
    # A bit of obsessive optimization
    classif_auto_id_col = ObjectHeader.classif_auto_id.name
    classif_auto_score_col = ObjectHeader.classif_auto_score.name
    classif_id_col = ObjectHeader.classif_id.name
    classif_qual_col = ObjectHeader.classif_qual.name
    overridden_by_prediction = {None, PREDICTED_CLASSIF_QUAL}
    full_updates = []
    partial_updates = []
    objid_param = "_objid"
    for obj_id, classif, score in zip(self.object_ids, classif_ids, scores):
        prev_obj = prev[obj_id]
        prev_classif_id: Optional[int] = prev_obj['classif_id']
        prev_classif_qual = prev_obj['classif_qual']
        # Whatever happens, set the auto_* fields
        an_update: Dict[str, Any] = {objid_param: obj_id,
                                     classif_auto_id_col: classif,
                                     classif_auto_score_col: score}
        if prev_classif_qual in overridden_by_prediction:
            # If not manually modified, go to Predicted state and set the prediction as classification
            an_update[classif_id_col] = classif
            an_update[classif_qual_col] = PREDICTED_CLASSIF_QUAL
            full_updates.append(an_update)
            change_key = (prev_classif_id, prev_classif_qual, classif, PREDICTED_CLASSIF_QUAL)
            # Compact changes, grouped by operation
            for_this_change = all_changes.setdefault(change_key, [])
            for_this_change.append(obj_id)
        else:
            # Just store the prediction, no change on user-visible data
            partial_updates.append(an_update)
    # Historize (auto)
    if keep_logs:
        self.historize_classification(None, True)
    # Bulk (or sort of) update of obj_head
    sql_now = text("now()")
    obj_upd_qry: Update = ObjectHeader.__table__.update()
    obj_upd_qry = obj_upd_qry.where(ObjectHeader.objid == bindparam(objid_param))
    nb_updated = 0
    if len(full_updates) > 0:
        full_upd_qry = obj_upd_qry.values(classif_id=bindparam(classif_id_col),
                                          classif_qual=bindparam(classif_qual_col),
                                          classif_auto_id=bindparam(classif_auto_id_col),
                                          classif_auto_score=bindparam(classif_auto_score_col),
                                          classif_auto_when=sql_now)
        nb_updated += self.session.execute(full_upd_qry, full_updates).rowcount
    # Partial updates
    if len(partial_updates) > 0:
        part_upd_qry = obj_upd_qry.values(classif_auto_id=bindparam(classif_auto_id_col),
                                          classif_auto_score=bindparam(classif_auto_score_col),
                                          classif_auto_when=sql_now)
        nb_updated += self.session.execute(part_upd_qry, partial_updates).rowcount
    # TODO: Cache upd
    logger.info("_auto: %d and %d gives %d rows updated ",
                len(full_updates), len(partial_updates), nb_updated)
    # Return statuses
    return nb_updated, all_changes
def create_tsv(self, src_project: Project, end_progress: int) -> Tuple[int, int]:
    """
    Create the TSV file.
    """
    req = self.req
    proj_id = src_project.projid
    self.update_progress(1, "Start TSV export")
    progress_range = end_progress - 1
    # Get a fast count of the maximum of what to do
    count_sql = "SELECT SUM(nbr) AS cnt FROM projects_taxo_stat WHERE projid = :prj"
    res = self.ro_session.execute(text(count_sql), {"prj": proj_id})
    obj_count = res.first()[0]
    # Prepare a where clause and parameters from the filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.ro_session, proj_id, self.filters)
    # Backup or not, the column namings are taken from the common mapping
    # @See Mapping.py
    # TSV column order
    # field_order = ["object_id", "object_lat", "object_lon", "object_date", "object_time", "object_depth_max",
    #                "object_annotation_status", "object_annotation_person_name", "object_annotation_person_email",
    #                "object_annotation_date", "object_annotation_time", "object_annotation_category"]
    # formats = {"object_date": "TO_CHAR({0},'YYYYMMDD')",
    #            "object_time": "TO_CHAR({0},'HH24MISS')",
    #            "object_annotation_date": "TO_CHAR({0},'YYYYMMDD')",
    #            "object_annotation_time": "TO_CHAR({0},'HH24MISS')",
    #            "object_annotation_status": """
    #                CASE {0}
    #                   WHEN 'V' then 'validated'
    #                   WHEN 'P' then 'predicted'
    #                   WHEN 'D' then 'dubious'
    #                   ELSE {0}
    #                END
    #            """
    #            }
    # prefixes = {ObjectHeader.__tablename__: "obh",
    #             }
    # for a_fld in field_order:
    #     mpg = GlobalMapping.PREDEFINED_FIELDS[a_fld]
    #     mpg[""]
    #     assert a_fld in GlobalMapping.PREDEFINED_FIELDS, "%s is not a mapped column" % a_fld
    date_fmt, time_fmt = "YYYYMMDD", "HH24MISS"
    if req.format_dates_times:
        date_fmt, time_fmt = "YYYY-MM-DD", "HH24:MI:SS"
    select_clause = "select "
    if req.with_images or (req.exp_type == ExportTypeEnum.backup):
        select_clause += "img.orig_file_name AS img_file_name, img.imgrank AS img_rank"
        if req.with_images:
            select_clause += ", img.file_name AS img_src_path"
        select_clause += ",\n"
    select_clause += """obh.orig_id AS object_id, obh.latitude AS object_lat, obh.longitude AS object_lon,
                        TO_CHAR(obh.objdate,'{0}') AS object_date,
                        TO_CHAR(obh.objtime,'{1}') AS object_time,
                        obh.object_link, obh.depth_min AS object_depth_min, obh.depth_max AS object_depth_max,
                        CASE obh.classif_qual
                           WHEN 'V' then 'validated'
                           WHEN 'P' then 'predicted'
                           WHEN 'D' then 'dubious'
                           ELSE obh.classif_qual
                        END AS object_annotation_status,
                        usr.name AS object_annotation_person_name, usr.email AS object_annotation_person_email,
                        TO_CHAR(obh.classif_when,'{0}') AS object_annotation_date,
                        TO_CHAR(obh.classif_when,'{1}') AS object_annotation_time,
                        txo.display_name AS object_annotation_category
                     """.format(date_fmt, time_fmt)
    if req.exp_type == ExportTypeEnum.backup:
        select_clause += ", txo.id AS object_annotation_category_id"
    else:
        select_clause += "," + TaxonomyBO.parents_sql("obh.classif_id") + " AS object_annotation_hierarchy"
    if 'C' in req.tsv_entities:
        select_clause += "\n, obh.complement_info"
    # Deal with mappings; the goal is to emit SQL which will reconstitute the TSV structure
    src_mappings = ProjectMapping().load_from_project(src_project)
    if 'O' in req.tsv_entities:
        select_clause += "\n " + src_mappings.object_mappings.as_select_list("obf")
    if 'S' in req.tsv_entities:
        select_clause += "\n, sam.orig_id AS sample_id, sam.dataportal_descriptor AS sample_dataportal_descriptor "
        select_clause += src_mappings.sample_mappings.as_select_list("sam")
    if 'P' in req.tsv_entities:
        select_clause += "\n, prc.orig_id AS process_id "
        select_clause += src_mappings.process_mappings.as_select_list("prc")
    if 'A' in req.tsv_entities:
        select_clause += "\n, acq.orig_id AS acq_id, acq.instrument AS acq_instrument "
        select_clause += src_mappings.acquisition_mappings.as_select_list("acq")
    if req.exp_type == ExportTypeEnum.dig_obj_ident:
        select_clause += "\n, obh.objid"
    if req.with_internal_ids:
        select_clause += """\n, obh.objid, obh.acquisid AS processid_internal, obh.acquisid AS acq_id_internal,
                sam.sampleid AS sample_id_internal, obh.classif_id, obh.classif_who, obh.classif_auto_id,
                txp.name classif_auto_name, obh.classif_auto_score, obh.classif_auto_when,
                obh.random_value object_random_value, obh.sunpos object_sunpos """
        if 'S' in req.tsv_entities:
            select_clause += "\n, sam.latitude sample_lat, sam.longitude sample_long "
    # TODO: The condition on o.projid=1 in the historical code below prevents any data production
    # if 'H' in req.tsv_entities:
    #     sql1 += " , oh.classif_date AS histoclassif_date, classif_type AS histoclassif_type, " \
    #             "to3.name histoclassif_name, oh.classif_qual histoclassif_qual, uo3.name histoclassif_who, " \
    #             "classif_score histoclassif_score"
    #     sql2 += """ LEFT JOIN (select o.objid, classif_date, classif_type, och.classif_id,
    #                        och.classif_qual, och.classif_who, classif_score
    #                   from objectsclassifhisto och
    #                   join objects o on o.objid=och.objid and o.projid=1 {0}
    #                  union all
    #                 select o.objid, o.classif_when classif_date, 'C' classif_type, classif_id,
    #                        classif_qual, classif_who, NULL
    #                   from objects o {0} where o.projid=1
    #                ) oh on o.objid=oh.objid
    #             LEFT JOIN taxonomy to3 on oh.classif_id=to3.id
    #             LEFT JOIN users uo3 on oh.classif_who=uo3.id
    #             """.format(samplefilter)
    order_clause = OrderClause()
    if req.split_by == "sample":
        order_clause.add_expression("sam", "orig_id")
        split_field = "sample_id"  # AKA sam.orig_id, but renamed in select list
    elif req.split_by == "taxo":
        select_clause += "\n, txo.display_name AS taxo_parent_child "
        order_clause.add_expression(None, "taxo_parent_child")
        split_field = "taxo_parent_child"
    else:
        order_clause.add_expression("sam", "orig_id")
        split_field = "object_id"  # This value avoids errors further down, in r[split_field]
    order_clause.add_expression("obh", "objid")
    if req.with_images or (req.exp_type == ExportTypeEnum.backup):
        order_clause.add_expression(None, "img_rank")
    # Base SQL comes from filters
    from_, where, params = object_set.get_sql(self._get_owner_id(), order_clause, select_clause,
                                              all_images=not req.only_first_image)
    sql = select_clause + " FROM " + from_.get_sql() + where.get_sql() + order_clause.get_sql()
    logger.info("Execute SQL : %s" % sql)
    logger.info("Params : %s" % params)
    res = self.ro_session.execute(text(sql), params)
    now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
    self.out_file_name = "export_{0:d}_{1:s}.{2}".format(proj_id, now_txt, "zip")
    produced_path = self.out_path / self.out_file_name
    zfile = zipfile.ZipFile(produced_path, 'w', allowZip64=True, compression=zipfile.ZIP_DEFLATED)
    splitcsv = (req.split_by != "")
    csv_filename = 'data.tsv'  # Just a temp name, as there is a rename while filling up the Zip
    if splitcsv:
        # Produce into the same temp file all the time; at zipping time the name in the archive will vary
        prev_value = "NotAssigned"  # To trigger a sequence change immediately
    else:
        # The zip will contain a single TSV with the same base name as the zip
        prev_value = self.out_file_name.replace('.zip', '')
    csv_path: Path = self.out_path / csv_filename  # Constant path to a (sometimes) changing file
    csv_fd: Optional[IO] = None
    csv_wtr = None
    # Store the images to save in a separate CSV. Useless if not exporting images, but who cares.
    temp_img_file = self.out_path / "images.csv"
    img_file_fd = open(temp_img_file, 'w')
    img_wtr = csv.DictWriter(img_file_fd, ["src_path", "dst_path"],
                             delimiter='\t', quotechar='"', lineterminator='\n')
    img_wtr.writeheader()
    # Prepare TSV structure
    col_descs = [a_desc for a_desc in res.cursor.description
                 if a_desc.name != "img_src_path"]
    # Read the latitude column to get the float DB type
    for a_desc in col_descs:
        if a_desc.name == "object_lat":
            db_float_type = a_desc.type_code
            break
    else:
        raise
    float_cols = set()
    # Prepare float separator conversion; if not required the set will just be empty
    if req.coma_as_separator:
        for a_desc in col_descs:
            if a_desc.type_code == db_float_type:
                float_cols.add(a_desc.name)
    tsv_cols = [a_desc.name for a_desc in col_descs]
    tsv_types_line = {name: ('[f]' if a_desc.type_code == db_float_type else '[t]')
                      for name, a_desc in zip(tsv_cols, col_descs)}
    nb_rows = 0
    nb_images = 0
    used_dst_pathes = set()
    for r in res:
        # Rows from SQLAlchemy are not mutable, so we need a clone for arranging values
        a_row = dict(r)
        if ((splitcsv and (prev_value != a_row[split_field]))  # At each change of the split column value
                or (nb_rows == 0)):  # And anyway for the first row
            # Start of sequence, eventually end of previous sequence
            if csv_fd:
                csv_fd.close()  # Close previous file
                self.store_csv_into_zip(zfile, prev_value, csv_path)
            if splitcsv:
                prev_value = a_row[split_field]
            logger.info("Writing into file %s", csv_path)
            csv_fd = open(csv_path, 'w', encoding='latin_1')
            csv_wtr = csv.DictWriter(csv_fd, tsv_cols, delimiter='\t', quotechar='"',
                                     lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC)
            csv_wtr.writeheader()
            if req.exp_type == ExportTypeEnum.backup:
                # Write the types line for backup type
                csv_wtr.writerow(tsv_types_line)
        if req.with_images:
            copy_op = {"src_path": a_row.pop("img_src_path")}
            if req.exp_type == ExportTypeEnum.dig_obj_ident:
                # Images will be stored in a per-category directory, but there is a single TSV at the Zip root
                categ = a_row['object_annotation_category']
                # All names cannot directly become directories
                a_row['img_file_name'] = self.get_DOI_imgfile_name(a_row['objid'], a_row['img_rank'],
                                                                   categ, a_row['img_file_name'])
                copy_op["dst_path"] = a_row['img_file_name']
            else:
                # It's a backup
                # Images are stored in the Zip subdirectory per sample/taxo, i.e. at the same place as
                # their referring TSV
                dst_path = "{0}/{1}".format(prev_value, a_row['img_file_name'])
                if dst_path in used_dst_pathes:
                    # Avoid duplicates in the zip, as only the last entry will be present during unzip.
                    # Root cause: for UVP6 bundles, the vignette and the original image are both stored
                    # with the same name.
                    img_with_rank = "{0}/{1}".format(a_row['img_rank'], a_row['img_file_name'])
                    a_row['img_file_name'] = img_with_rank  # Write the corrected path into the TSV
                    dst_path = prev_value + "/" + img_with_rank
                used_dst_pathes.add(dst_path)
                copy_op["dst_path"] = dst_path
            img_wtr.writerow(copy_op)
            nb_images += 1
        # Remove CR from comments
        if 'C' in req.tsv_entities and a_row['complement_info']:
            a_row['complement_info'] = ' '.join(a_row['complement_info'].splitlines())
        # Replace decimal separator
        for cname in float_cols:
            if a_row[cname] is not None:
                a_row[cname] = str(a_row[cname]).replace('.', ',')
        assert csv_wtr is not None
        # Produce the row in the TSV
        csv_wtr.writerow(a_row)
        nb_rows += 1
        if nb_rows % self.ROWS_REPORT_EVERY == 0:
            msg = "Row %d of max %d" % (nb_rows, obj_count)
            logger.info(msg)
            self.update_progress(1 + progress_range / obj_count * nb_rows, msg)
    if csv_fd:
        csv_fd.close()  # Close last file
        self.store_csv_into_zip(zfile, prev_value, csv_path)
    logger.info("Extracted %d rows", nb_rows)
    img_file_fd.close()
    if zfile:
        zfile.close()
    return nb_rows, nb_images
def projects_for_user(session: Session, user: User,
                      for_managing: bool = False,
                      not_granted: bool = False,
                      title_filter: str = '',
                      instrument_filter: str = '',
                      filter_subset: bool = False) -> List[ProjectIDT]:
    """
    :param session:
    :param user: The user for which the list is needed.
    :param for_managing: If set, list the projects that the user can manage.
    :param not_granted: If set, list (only) the projects on which the given user has no right,
                        so the user can request access to them.
    :param title_filter: If set, filter out the projects whose title does not match the required string,
                         or, if set to a number, filter out the projects whose ID does not match.
    :param instrument_filter: If set, filter out the projects which do not have the given instrument
                              in at least one sample.
    :param filter_subset: If set, filter out any project whose title contains 'subset'.
    :return: The project IDs.
    """
    sql_params: Dict[str, Any] = {"user_id": user.id}
    # Default query: all projects, eventually with first manager information
    # noinspection SqlResolve
    sql = """SELECT p.projid
               FROM projects p
               LEFT JOIN ( """ + ProjectPrivilegeBO.first_manager_by_project() + """ ) fpm
                 ON fpm.projid = p.projid """
    if not_granted:
        # Add the projects for which no entry is found in ProjectPrivilege
        sql += """
               LEFT JOIN projectspriv pp ON p.projid = pp.projid AND pp.member = :user_id
              WHERE pp.member is null """
        if for_managing:
            sql += " AND False "
    else:
        if not user.has_role(Role.APP_ADMINISTRATOR):
            # Not an admin, so restrict to projects which the current user can work on, or view
            sql += """
                JOIN projectspriv pp
                  ON p.projid = pp.projid
                 AND pp.member = :user_id """
            if for_managing:
                sql += """
                 AND pp.privilege = '%s' """ % ProjectPrivilegeBO.MANAGE
        sql += " WHERE 1 = 1 "
    if title_filter != '':
        sql += """
                AND ( title ILIKE '%%'|| :title ||'%%'
                      OR TO_CHAR(p.projid,'999999') LIKE '%%'|| :title ) """
        sql_params["title"] = title_filter
    if instrument_filter != '':
        sql += """
                AND p.projid IN (SELECT DISTINCT sam.projid
                                   FROM samples sam, acquisitions acq
                                  WHERE acq.acq_sample_id = sam.sampleid
                                    AND acq.instrument ILIKE '%%'|| :instrum ||'%%' ) """
        sql_params["instrum"] = instrument_filter
    if filter_subset:
        sql += """
                AND NOT title ILIKE '%%subset%%' """
    with CodeTimer("Projects query:", logger):
        res: Result = session.execute(text(sql), sql_params)
        # single-element tuple :( DBAPI
        ret = [an_id for an_id, in res.fetchall()]
    return ret  # type:ignore
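# A usage sketch, assuming the function is exposed on a ProjectBO-like class; the class name,
# current_user object and filter values are assumptions:
#
#   # Projects the current user can manage, excluding subsets, with a UVP instrument in at least one sample
#   manageable_ids = ProjectBO.projects_for_user(session, current_user,
#                                                for_managing=True,
#                                                instrument_filter='uvp',
#                                                filter_subset=True)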