def find_or_create_target(db: DatabaseSession, identifier: str, control: bool) -> Any:
    """
    Select presence_absence test target by *identifier*, or insert it if it
    doesn't exist.

    Returns the target row with its ``id`` and ``identifier``.
    """
    LOG.debug(f"Looking up target «{identifier}»")

    target = db.fetch_row("""
        select target_id as id, identifier
          from warehouse.target
         where identifier = %s
        """, (identifier,))

    if not target:
        # Not present yet: insert and return the new row.
        LOG.debug(f"Target «{identifier}» not found, adding")

        target = db.fetch_row("""
            insert into warehouse.target (identifier, control)
                values (%(identifier)s, %(control)s)
            returning target_id as id, identifier
            """, {"identifier": identifier, "control": control})

        LOG.info(f"Created target {target.id} «{target.identifier}»")
    else:
        LOG.info(f"Found target {target.id} «{target.identifier}»")

    return target
def ls():
    """List users."""
    session = DatabaseSession()

    with session.cursor() as cursor:
        cursor.execute("""
            select usename as name,
                   pg_catalog.shobj_description(usesysid, 'pg_authid') as description,
                   coalesce(array_agg(groname order by groname) filter (where groname is not null), '{}') as roles
              from pg_catalog.pg_user
              left join pg_catalog.pg_group on (grolist @> array[usesysid])
             where usename not in ('postgres', 'rdsadmin')
             group by name, usesysid
             order by name
            """)
        users = list(cursor)

    # Line up name + description nicely into a column
    def widest(attr):
        return max(map(len, filter(None, map(attrgetter(attr), users))), default=0)

    template = "{:<%d}" % (widest("name") + 3)

    for user in users:
        click.secho(template.format(user.name), bold=True, nl=False)
        click.echo(", ".join(user.roles))
def find_or_create_site(db: DatabaseSession, identifier: str, details: dict) -> Any:
    """
    Select encounter site by *identifier*, or insert it if it doesn't exist.
    """
    LOG.debug(f"Looking up site «{identifier}»")

    site = db.fetch_row("""
        select site_id as id, identifier
          from warehouse.site
         where identifier = %s
        """, (identifier,))

    if site:
        LOG.info(f"Found site {site.id} «{site.identifier}»")
        return site

    # Not present yet: insert and return the new row.
    LOG.debug(f"Site «{identifier}» not found, adding")

    site = db.fetch_row("""
        insert into warehouse.site (identifier, details)
            values (%(identifier)s, %(details)s)
        returning site_id as id, identifier
        """, {"identifier": identifier, "details": Json(details)})

    LOG.info(f"Created site {site.id} «{site.identifier}»")
    return site
def refresh_materialized_view(schema_name, view_name, db: DatabaseSession):
    """
    Refresh materialized view <schema>.<view> in ID3C.
    """
    LOG.info(f"Refreshing materialized view «{schema_name}.{view_name}»")

    # Refresh is implemented as a database function; parameters are passed
    # through the driver, never interpolated into the SQL text.
    db.cursor().execute(
        "select refresh_materialized_view(%s, %s)",
        (schema_name, view_name))

    LOG.info("Successfully refreshed materialized view")
def update_sample(db: DatabaseSession,
                  identifier: str = None,
                  collection_identifier: str = None,
                  additional_details: dict = None) -> Any:
    """
    Find sample by *identifier* and update with any *additional_details*.

    The provided *additional_details* are merged (at the top-level only) into
    the existing sample details, if any.

    Raises an :class:`SampleNotFoundError` if there is no sample known by
    *identifier* (or *collection_identifier*).
    """
    # Fix: previously `sample` was only assigned inside the if/elif branches,
    # so calling with neither identifier raised UnboundLocalError instead of
    # the intended SampleNotFoundError.
    sample = None

    if identifier:
        LOG.debug(f"Looking up sample with identifier «{identifier}»")

        sample = db.fetch_row("""
            select sample_id as id, identifier, details
              from warehouse.sample
             where identifier = %s
               for update
            """, (identifier,))

    elif collection_identifier:
        LOG.debug(f"Looking up sample with collection_identifier «{collection_identifier}»")

        sample = db.fetch_row("""
            select sample_id as id, collection_identifier as identifier, details
              from warehouse.sample
             where collection_identifier = %s
               for update
            """, (collection_identifier,))

    if not sample:
        LOG.error(f"No sample found with identifier «{identifier}» or collection identifier «{collection_identifier}»")
        raise SampleNotFoundError(identifier)

    LOG.info(f"Found sample {sample.id} «{sample.identifier}»")

    if additional_details:
        LOG.info(f"Updating sample {sample.id} «{sample.identifier}» details")

        update_details_nwgc_id(sample, additional_details)

        # Top-level merge of the new details into any existing JSON details.
        sample = db.fetch_row("""
            update warehouse.sample
               set details = coalesce(details, '{}') || %s
             where sample_id = %s
            returning sample_id as id, identifier
            """, (Json(additional_details), sample.id))

        assert sample.id, "Updating details affected no rows!"

    return sample
def etl_fhir(*, db: DatabaseSession):
    LOG.debug(f"Starting the FHIR ETL routine, revision {REVISION}")

    # Fetch and iterate over FHIR documents that aren't processed yet.
    #
    # A named (server-side) cursor with itersize of one fetches a single
    # record at a time, keeping the local process small.
    #
    # Rows are locked for update so two instances of this command don't try
    # to process the same FHIR documents.
    LOG.debug("Fetching unprocessed FHIR documents")

    fhir_documents = db.cursor("fhir")
    fhir_documents.itersize = 1
    fhir_documents.execute("""
        select fhir_id as id, document
          from receiving.fhir
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for record in fhir_documents:
        with db.savepoint(f"FHIR document {record.id}"):
            LOG.info(f"Processing FHIR document {record.id}")

            assert_bundle_collection(record.document)
            bundle = Bundle(record.document)
            resources = extract_resources(bundle)

            # Loop over every Resource in the Bundle entry, processing what
            # is needed along the way; skip the whole bundle when required
            # resource types are missing.
            try:
                assert_required_resource_types_present(resources)
                process_bundle_entries(db, bundle)
            except SkipBundleError as error:
                LOG.warning(f"Skipping bundle in FHIR document «{record.id}»: {error}")
                mark_skipped(db, record.id)
                continue

            mark_processed(db, record.id, {"status": "processed"})
            LOG.info(f"Finished processing FHIR document {record.id}")
def find_sample(db: DatabaseSession, identifier: str, for_update=True) -> Any:
    """
    Find sample by *identifier* and return sample.

    Matches on either the sample identifier or the collection identifier.
    When *for_update* is true, the selected row is locked.
    """
    LOG.debug(f"Looking up sample «{identifier}»")

    lock_clause = "for update" if for_update else ""

    sample = db.fetch_row("""
        select sample_id as id, identifier, encounter_id
          from warehouse.sample
         where identifier = %s
            or collection_identifier = %s
        """ + lock_clause, (identifier, identifier))

    if not sample:
        LOG.info(f"No sample with identifier «{identifier}» found")
        return None

    LOG.info(f"Found sample {sample.id} «{sample.identifier}»")
    return sample
def upload(sequence_read_set_file, unknown_sample_output, db: DatabaseSession):
    """
    Upload sequence read sets into the database warehouse.

    <sequence-read-set.ndjson> must be a newline delimited JSON file produced
    by this command's sibling command.

    Sequence read sets with NWGC sample IDs that cannot be found within the
    database warehouse are printed out as newline delimited JSON file
    <unknown-sample-output.ndjson>.
    """
    for line in sequence_read_set_file:
        document = json.loads(line)
        nwgc_id = document.get("sample")
        urls = document.get("urls")

        with db.savepoint(f"sequence read set {nwgc_id}"):
            LOG.info(f"Processing sequence read set for sample {nwgc_id}")

            # find_sample returns the matching sample record, or None.
            sample = find_sample(db, nwgc_id)

            if sample is None:
                LOG.warning(f"Skipping sample with NWGC ID «{nwgc_id}» because it was not found within warehouse.")
                unknown_sample_output.write(line)
                continue

            insert_sequence_read_set(db, sample, urls)

            LOG.info(f"Finished uploading sequence read set for sample {nwgc_id}")
def upsert_individual(db: DatabaseSession, identifier: str, sex: str = None) -> Any:
    """
    Upsert individual by their *identifier*.
    """
    LOG.debug(f"Upserting individual «{identifier}»")

    individual = db.fetch_row("""
        insert into warehouse.individual (identifier, sex)
            values (%(identifier)s, %(sex)s)
        on conflict (identifier) do update
            set sex = excluded.sex
        returning individual_id as id, identifier
        """, {"identifier": identifier, "sex": sex})

    assert individual.id, "Upsert affected no rows!"

    LOG.info(f"Upserted individual {individual.id} «{individual.identifier}»")
    return individual
def update_sample(
        db: DatabaseSession,
        sample,
        encounter_id: Optional[int] = None) -> Optional[MinimalSampleRecord]:
    """
    Update sample's encounter_id.

    Returns the updated sample record, or None when the sample is already
    linked to the given encounter.
    """
    LOG.debug(f"Updating sample {sample.id}, linked to encounter {encounter_id}")

    if sample.encounter_id:
        # Already linked; only acceptable if it's the same encounter.
        assert sample.encounter_id == encounter_id, \
            f"Sample {sample.id} already linked to another encounter {sample.encounter_id}"
        return None

    updated = db.fetch_row("""
        update warehouse.sample
           set encounter_id = %s
         where sample_id = %s
        returning sample_id as id, identifier
        """, (encounter_id, sample.id))

    assert updated.id, "Updating encounter_id affected no rows!"

    LOG.info(f"Updated sample {updated.id}")
    return updated
def upsert_sample(db: DatabaseSession, collection_identifier: str, encounter_id: int, details: dict) -> Any:
    """
    Upsert collected sample by its *collection_identifier*.

    The provided *details* are merged (at the top-level only) into the
    existing sample details, if any.
    """
    LOG.debug(f"Upserting sample collection «{collection_identifier}»")

    sample = db.fetch_row("""
        insert into warehouse.sample (collection_identifier, encounter_id, details)
            values (%(collection_identifier)s, %(encounter_id)s, %(details)s)
        on conflict (collection_identifier) do update
            set encounter_id = excluded.encounter_id,
                details = coalesce(sample.details, '{}') || %(details)s
        returning sample_id as id, identifier, collection_identifier, encounter_id
        """, {
            "collection_identifier": collection_identifier,
            "encounter_id": encounter_id,
            "details": Json(details),
        })

    assert sample.id, "Upsert affected no rows!"

    LOG.info(f"Upserted sample {sample.id} with collection identifier «{sample.collection_identifier}»")
    return sample
def upload(longitudinal_file):
    """
    Upload longitudinal records into the database receiving area.

    <longitudinal.ndjson> must be a newline-delimited JSON file produced by
    this command's sibling commands.

    Once records are uploaded, the longitudinal ETL routine will reconcile the
    longitudinal records with known sites, individuals, encounters and
    samples.
    """
    db = DatabaseSession()

    try:
        LOG.info(f"Copying longitudinal records from {longitudinal_file.name}")

        row_count = db.copy_from_ndjson(
            ("receiving", "longitudinal", "document"), longitudinal_file)

        LOG.info(f"Received {row_count:,} longitudinal records")
        LOG.info("Committing all changes")
        db.commit()

    except:
        # Roll back on any failure (including interrupts), then re-raise.
        LOG.info("Rolling back all changes; the database will not be modified")
        db.rollback()
        raise
def mint(set_name, count, *, labels, quiet):
    """
    Mint new identifiers and make barcode labels.

    <set name> is an existing identifier set, e.g. as output by the `id3c
    identifier set ls` command.

    <count> is the number of new identifiers to mint.

    If --labels are requested, a PDF of printable barcode labels is generated
    using the Lab Labels¹ instance <https://backoffice.seattleflu.org/labels/>.
    An alternative instance may be used by setting the LABEL_API environment
    variable to the instance URL.

    ¹ https://github.com/MullinsLab/Lab-Labels
    """
    session = DatabaseSession()

    # NOTE(review): unlike the sibling mint command, the session here is not
    # used as a context manager (`with session:`) — confirm whether the minted
    # identifiers are actually committed by this code path.
    minted = db.mint_identifiers(session, set_name, count)

    # Print barcode/uuid pairs unless suppressed.
    if not quiet:
        for identifier in minted:
            print(identifier.barcode, identifier.uuid, sep = "\t")

    # Optionally render the minted identifiers as a printable PDF of labels.
    if labels:
        layout = labelmaker.layout_identifiers(set_name, minted)
        pdf = labelmaker.generate_pdf(layout)
        labels.write(pdf)
def lookup(filename: click.File, scale: str, lat_column: str, lng_column: str, drop_latlng_columns: bool):
    """
    Lookup locations containing a given latitude and longitude.

    <filename.{csv,tsv,xls,xlsx}> accepts `-` as a special file that refers to
    stdin, assuming data is formatted as comma-separated values. This is
    expected when piping output from `id3c geocode` directly into this
    command.

    Lookup results are output to stdout as comma-separated values, with
    location identifier as <scale>_identifier.
    """
    input_df = load_input_from_file_or_stdin(filename)
    lat_lngs = extract_lat_lng_from_input(input_df, lat_column, lng_column)

    db = DatabaseSession()

    # One lookup per row; None when no containing location is found.
    def identifier_of(lat_lng):
        location = location_lookup(db, lat_lng, scale)
        return location.identifier if location else None

    locations = [identifier_of(lat_lng) for lat_lng in lat_lngs]

    output_df = input_df.copy()
    output_df[f"{scale}_identifier"] = locations

    if drop_latlng_columns:
        try:
            output_df.drop(columns=[lat_column, lng_column], inplace=True)
        except KeyError as error:
            LOG.error(f"{error}. Columns are: {list(output_df.columns)}")
            raise error from None

    output_df.to_csv(sys.stdout, index=False)
def find_or_create_sequence_read_set(
        db: DatabaseSession, document: dict, sample: MinimalSampleRecord) -> SequenceReadSetRecord:
    """
    Find sequence read set given a *sample* and consensus genome record
    *document*, inserting the sequence read set if it does not exist.

    Return the sequence read set.
    """
    urls = document['metadata']['urls']

    LOG.debug(dedent(f"""
        Looking up sequence read set with sample ID «{sample.id}» and urls {urls}
        """))

    # Equality of url arrays is expressed as mutual containment (@> both ways)
    # so element order doesn't matter.
    sequence_read_set: SequenceReadSetRecord = db.fetch_row("""
        select sequence_read_set_id as id, sample_id, urls
          from warehouse.sequence_read_set
         where sample_id = %s
           and urls @> %s
           and %s @> urls
        """, (sample.id, urls, urls))

    if sequence_read_set:
        LOG.info(f"Found sequence read set {sequence_read_set.id}")
        return sequence_read_set

    LOG.debug(dedent(f"""
        Sequence read set not found for sample id «{sample.id}» and urls {urls}
        """))

    sequence_read_set = db.fetch_row("""
        insert into warehouse.sequence_read_set (sample_id, urls)
            values (%(sample_id)s, %(urls)s)
        returning sequence_read_set_id as id, sample_id, urls
        """, {"sample_id": sample.id, "urls": urls})

    LOG.info(f"Created sequence read set {sequence_read_set.id}")
    return sequence_read_set
def upload(manifest_file):
    """
    Upload manifest records into the database receiving area.

    <manifest.ndjson> must be a newline-delimited JSON file produced by this
    command's sibling commands.

    Once records are uploaded, the manifest ETL routine will reconcile the
    manifest records with known identifiers and existing samples.
    """
    db = DatabaseSession()

    try:
        # Fix: file objects expose .name, not .path — the sibling upload
        # commands all use .name; .path raised AttributeError here.
        LOG.info(f"Copying sample manifest records from {manifest_file.name}")

        row_count = db.copy_from_ndjson(("receiving", "manifest", "document"), manifest_file)

        LOG.info(f"Received {row_count:,} manifest records")
        LOG.info("Committing all changes")
        db.commit()

    except:
        # Roll back on any failure (including interrupts), then re-raise.
        LOG.info("Rolling back all changes; the database will not be modified")
        db.rollback()
        raise
def upload(table_name, document_file):
    """
    Upload documents into a receiving table.

    <table> must be the name of a table in the receiving schema which has a
    "document" column. All other columns in the table must be optional on
    insert, as only "document" is provided.

    <documents.ndjson> must be a newline-delimited JSON file containing one
    document per line to insert as a table row.
    """
    db = DatabaseSession()

    try:
        LOG.info(f"Copying documents from {document_file.name}")

        row_count = db.copy_from_ndjson(
            ("receiving", table_name, "document"), document_file)

        LOG.info(f"Received {row_count:,} {table_name} records")
        LOG.info("Committing all changes")
        db.commit()

    except:
        # Roll back on any failure (including interrupts), then re-raise.
        LOG.info("Rolling back all changes; the database will not be modified")
        db.rollback()
        raise
def upload(consensus_genome_file):
    """
    Upload consensus genomes and summary statistics to the warehouse receiving
    area.

    Consensus genomes and summary statistics should be in newline-delimited
    JSON format that matches those generated by the assembly pipeline.
    """
    db = DatabaseSession()

    try:
        LOG.info(f"Copying consensus genome records from {consensus_genome_file.name}")

        row_count = db.copy_from_ndjson(
            ("receiving", "consensus_genome", "document"), consensus_genome_file)

        LOG.info(f"Received {row_count:,} consensus genome records")
        LOG.info("Committing all changes")
        db.commit()

    except:
        # Roll back on any failure (including interrupts), then re-raise.
        LOG.info("Rolling back all changes; the database will not be modified")
        db.rollback()
        raise
def set_ls():
    """List identifier sets."""
    session = DatabaseSession()

    with session.cursor() as cursor:
        cursor.execute("""
            select name, description
              from warehouse.identifier_set
             order by lower(name)
            """)

        sets = list(cursor)

    # Line up names nicely into a column.
    #
    # Fix: max() with default=0 avoids a ValueError when there are no
    # identifier sets (matches the sibling `ls` command's behavior).
    template = "{:<%d}" % (max((len(s.name) for s in sets), default=0) + 3)

    # Renamed loop variable so it no longer shadows the builtin set().
    for identifier_set in sets:
        click.secho(template.format(identifier_set.name), bold = True, nl = False)
        click.echo(identifier_set.description)
def set_use_create(use, description):
    """
    Create a new identifier set use.

    \b
    <use> is the name of the new use.
    <description> is a comment explaining the purpose of the use.
    """
    session = DatabaseSession()

    with session:
        use, = session.fetch_row("""
            insert into warehouse.identifier_set_use (use, description)
                values (%s, %s)
            returning use
            """, (use, description))

    click.echo("Created identifier set use " + click.style(use, bold = True))
def set_create(name, description):
    """
    Create a new identifier set.

    \b
    <name> is the name of the new set.
    <description> is a comment explaining the purpose of the set.
    """
    session = DatabaseSession()

    with session:
        identifier_set_id, = session.fetch_row("""
            insert into warehouse.identifier_set (name, description)
                values (%s, %s)
            returning identifier_set_id
            """, (name, description))

    click.echo(
        "Created identifier set "
        + click.style(name, bold = True)
        + f" (#{identifier_set_id})")
def find_kit(db: DatabaseSession, identifier: str) -> KitRecord:
    """
    Look for kit using *identifier* within the database

    The selected row (if any) is locked for update.
    """
    kit: KitRecord = db.fetch_row("""
        select kit_id as id, identifier, encounter_id, rdt_sample_id, utm_sample_id
          from warehouse.kit
         where identifier = %s
           for update
        """, (identifier,))

    return kit
def set_create(name, use, description):
    """
    Create a new identifier set.

    \b
    <name> is the name of the new set.
    <use> is the use classification for this set, valid values can be found
    using `id3c identifier set-use ls` command.
    <description> is a comment explaining the purpose of the set.
    """
    session = DatabaseSession()

    with session:
        identifier_set_id, = session.fetch_row("""
            insert into warehouse.identifier_set (name, use, description)
                values (%s, %s, %s)
            returning identifier_set_id
            """, (name, use, description))

    click.echo(
        "Created identifier set "
        + click.style(name, bold = True)
        + f" (#{identifier_set_id})")
def find_or_create_individual(db: DatabaseSession, identifier: str, sex: str, details: dict = None) -> Any:
    """
    Select individual by *identifier*, or insert it if it doesn't exist.
    """
    LOG.debug(f"Looking up individual «{identifier}»")

    individual = db.fetch_row("""
        select individual_id as id, identifier
          from warehouse.individual
         where identifier = %s
        """, (identifier,))

    if individual:
        LOG.info(f"Found individual {individual.id} «{individual.identifier}»")
        return individual

    # Not present yet: insert and return the new row.
    LOG.debug(f"individual «{identifier}» not found, adding")

    individual = db.fetch_row("""
        insert into warehouse.individual (identifier, sex, details)
            values (%(identifier)s, %(sex)s, %(details)s)
        returning individual_id as id, identifier
        """, {
            "identifier": identifier,
            "sex": sex,
            "details": Json(details),
        })

    LOG.info(f"Created individual {individual.id} «{individual.identifier}»")
    return individual
def upsert_encounter(db: DatabaseSession,
                     identifier: str,
                     encountered: str,
                     individual_id: int,
                     site_id: int,
                     age: Optional[str],
                     details: dict) -> Any:
    """
    Upsert encounter by its *identifier*.
    """
    LOG.debug(f"Upserting encounter «{identifier}»")

    encounter = db.fetch_row("""
        insert into warehouse.encounter (
                identifier, individual_id, site_id, encountered, age, details)
            values (
                %(identifier)s, %(individual_id)s, %(site_id)s,
                %(encountered)s::timestamp with time zone, %(age)s, %(details)s)
        on conflict (identifier) do update
            set individual_id = excluded.individual_id,
                site_id       = excluded.site_id,
                encountered   = excluded.encountered,
                age           = excluded.age,
                details       = excluded.details
        returning encounter_id as id, identifier
        """, {
            "identifier": identifier,
            "encountered": encountered,
            "individual_id": individual_id,
            "site_id": site_id,
            "age": age,
            "details": Json(details),
        })

    assert encounter.id, "Upsert affected no rows!"

    LOG.info(f"Upserted encounter {encounter.id} «{encounter.identifier}»")
    return encounter
def upsert_presence_absence(db: DatabaseSession,
                            identifier: str,
                            sample_id: int,
                            target_id: int,
                            present: bool,
                            details: dict) -> Any:
    """
    Upsert presence_absence by its *identifier*.

    Confirmed with Samplify that their numeric identifier for each test is
    stable and persistent.
    """
    LOG.debug(f"Upserting presence_absence «{identifier}»")

    data = {
        "identifier": identifier,
        "sample_id": sample_id,
        "target_id": target_id,
        "present": present,
        "details": Json(details)
    }

    presence_absence = db.fetch_row("""
        insert into warehouse.presence_absence (
                identifier, sample_id, target_id, present, details)
            values (
                %(identifier)s, %(sample_id)s, %(target_id)s, %(present)s, %(details)s)
        on conflict (identifier) do update
            set sample_id = excluded.sample_id,
                target_id = excluded.target_id,
                present   = excluded.present,
                details = coalesce(presence_absence.details, '{}') || excluded.details
        returning presence_absence_id as id, identifier
        """, data)

    assert presence_absence.id, "Upsert affected no rows!"

    # Fix: the previous backslash line-continuation inside the f-string
    # embedded a run of indentation whitespace into the log message.
    LOG.info(f"Upserted presence_absence {presence_absence.id} «{presence_absence.identifier}»")
    return presence_absence
def insert_dets(db: DatabaseSession, project: Project, offers: List[dict]):
    """
    Inserts synthetic DETs into ``receiving.redcap_det`` for the REDCap record
    *offers* made for *project*.
    """
    # One single-element tuple per row, as execute_values expects.
    dets = [(Json(det(project, offer, TESTING_INSTRUMENT)),) for offer in offers]

    LOG.info(f"Inserting {len(dets):,} synthetic REDCap DETs for {project}")

    with db.cursor() as cursor:
        execute_values(cursor, """
            insert into receiving.redcap_det (document) values %s
            """, dets)
def mint(set_name, count, *, labels, layout, quiet, dry_run):
    """
    Mint new identifiers and make barcode labels.

    <set name> is an existing identifier set, e.g. as output by the `id3c
    identifier set ls` command.

    <count> is the number of new identifiers to mint.

    If --labels are requested, a PDF of printable barcode labels is generated
    using the Lab Labels¹ instance <https://backoffice.seattleflu.org/labels/>.
    An alternative instance may be used by setting the LABEL_API environment
    variable to the instance URL.

    If --layout is requested, the printable barcode labels will use the given
    version of the layout, if available.

    ¹ https://github.com/MullinsLab/Lab-Labels
    """
    session = DatabaseSession()

    with session:
        minted = db.mint_identifiers(session, set_name, count)

        # A dry run still mints (so labels can be previewed) but rolls the
        # database back.
        if dry_run:
            LOG.info("Rolling back all changes; the database will not be modified")
            session.rollback()

    if not quiet:
        for identifier in minted:
            print(identifier.barcode, identifier.uuid, sep = "\t")

    if labels:
        label_layout = labelmaker.layout_identifiers(set_name, minted, layout)
        labels.write(labelmaker.generate_pdf(label_layout))
def reset_password(name):
    """
    Reset a user's password.

    <username> is the login name of the new user.

    The newly generated random password will be displayed on stdout.
    """
    session = DatabaseSession()

    with session:
        new_password = db.reset_password(session, name)

    click.echo(
        click.style("New password is ", bold=True)
        + click.style(new_password, fg="red"))
def insert_fhir_bundle(db: DatabaseSession, bundle: dict) -> None:
    """
    Insert FHIR bundles into the receiving area of the database.
    """
    LOG.debug(f"Inserting FHIR bundle «{bundle['id']}»")

    fhir = db.fetch_row("""
        insert into receiving.fhir(document)
            values (%s)
        returning fhir_id as id
        """, (Json(bundle),))

    assert fhir.id, "Insert affected no rows!"

    LOG.info(f"Inserted FHIR document {fhir.id} «{bundle['id']}»")