def refresh_materialized_view(schema_name, view_name, db: DatabaseSession):
    """
    Refresh materialized view <schema>.<view> in ID3C.
    """
    LOG.info(f"Refreshing materialized view «{schema_name}.{view_name}»")

    db.cursor().execute("""
        select refresh_materialized_view(%s, %s)
        """, (schema_name, view_name))

    LOG.info("Successfully refreshed materialized view")
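# Illustrative usage sketch, not part of the original module: refreshing a view
# from a one-off script or scheduled job.  It assumes the DatabaseSession used
# throughout these routines connects from environment configuration and that
# changes must be committed explicitly (as notify() below does); the schema and
# view names here are hypothetical placeholders.
def example_refresh():
    db = DatabaseSession()
    refresh_materialized_view("shipping", "example_view_v1", db)
    db.commit()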
def ls():
    """List users."""
    session = DatabaseSession()

    with session.cursor() as cursor:
        cursor.execute("""
            select usename as name,
                   pg_catalog.shobj_description(usesysid, 'pg_authid') as description,
                   coalesce(array_agg(groname order by groname) filter (where groname is not null), '{}') as roles
              from pg_catalog.pg_user
              left join pg_catalog.pg_group on (grolist @> array[usesysid])
             where usename not in ('postgres', 'rdsadmin')
             group by name, usesysid
             order by name
            """)

        users = list(cursor)

    # Line up name + description nicely into a column
    def maxlen(attr):
        return max(map(len, filter(None, map(attrgetter(attr), users))), default = 0)

    template = "{:<%d}" % (maxlen("name") + 3)

    for user in users:
        click.secho(template.format(user.name), bold = True, nl = False)
        click.echo(", ".join(user.roles))
def insert_dets(db: DatabaseSession, project: Project, offers: List[dict]):
    """
    Inserts synthetic DETs into ``receiving.redcap_det`` for the REDCap
    record *offers* made for *project*.
    """
    dets = [(Json(det(project, offer, TESTING_INSTRUMENT)),) for offer in offers]

    LOG.info(f"Inserting {len(dets):,} synthetic REDCap DETs for {project}")

    with db.cursor() as cursor:
        execute_values(cursor, """
            insert into receiving.redcap_det (document) values %s
            """, dets)
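# Illustrative sketch, not part of the original module: each element of `dets`
# above is a one-tuple wrapping a psycopg2 Json adapter, so execute_values()
# expands "values %s" into one row per DET with the JSON document as the sole
# column.  The documents below are hypothetical stand-ins for whatever det()
# actually builds.
def example_det_rows() -> list:
    documents = [
        {"instrument": "example_instrument", "record": "1"},
        {"instrument": "example_instrument", "record": "2"},
    ]
    return [(Json(document),) for document in documents]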
def etl_fhir(*, db: DatabaseSession):
    LOG.debug(f"Starting the FHIR ETL routine, revision {REVISION}")

    # Fetch and iterate over FHIR documents that aren't processed
    #
    # Use a server-side cursor by providing a name and limit to one fetched
    # record at a time, to limit local process size.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same FHIR documents.
    LOG.debug("Fetching unprocessed FHIR documents")

    fhir_documents = db.cursor("fhir")
    fhir_documents.itersize = 1
    fhir_documents.execute("""
        select fhir_id as id, document
          from receiving.fhir
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for record in fhir_documents:
        with db.savepoint(f"FHIR document {record.id}"):
            LOG.info(f"Processing FHIR document {record.id}")

            assert_bundle_collection(record.document)
            bundle = Bundle(record.document)
            resources = extract_resources(bundle)

            # Loop over every Resource in the Bundle entry, processing what is
            # needed along the way.
            try:
                assert_required_resource_types_present(resources)
                process_bundle_entries(db, bundle)

            except SkipBundleError as error:
                LOG.warning(f"Skipping bundle in FHIR document «{record.id}»: {error}")
                mark_skipped(db, record.id)
                continue

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing FHIR document {record.id}")
def upsert_encounter_location(db: DatabaseSession,
                              encounter_id: int,
                              relation: str,
                              location_id: int) -> Any:
    """
    Upserts an encounter location by its *encounter_id* and *relation*.
    """
    LOG.debug(f"Upserting encounter {relation} location")

    with db.cursor() as cursor:
        cursor.execute("""
            insert into warehouse.encounter_location (encounter_id, relation, location_id)
            values (%s, %s, %s)
              on conflict (encounter_id, relation) do update
                set location_id = excluded.location_id
            """, (encounter_id, relation, location_id))

        assert cursor.rowcount == 1, "Upsert affected no rows!"
def mark_processed(db: DatabaseSession, det_id: int, entry = {}) -> None:
    LOG.debug(f"Appending to processing log of REDCap DET record {det_id}")

    data = {
        "det_id": det_id,
        "log_entry": Json({
            **entry,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.redcap_det
               set processing_log = processing_log || %(log_entry)s
             where redcap_det_id = %(det_id)s
            """, data)
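# Illustrative sketch, not part of the original module: how the entry appended
# above pairs with the `not processing_log @> %s` filters in the ETL selects.
# Once a log entry containing the ETL name and revision is present, JSONB
# containment makes the row drop out of later fetches.  This helper is
# hypothetical; it assumes (as the attribute access on rows elsewhere suggests)
# that cursors return named-tuple rows, and the log-entry keys shown mirror the
# ones used by the routines above.
def example_would_be_fetched(db: DatabaseSession, det_id: int) -> bool:
    """
    Return True if the given DET would still be picked up by an ETL using the
    name/revision pair below, i.e. its processing log does not yet contain it.
    """
    with db.cursor() as cursor:
        cursor.execute("""
            select not processing_log @> %s as unprocessed
              from receiving.redcap_det
             where redcap_det_id = %s
            """, (Json([{"etl": ETL_NAME, "revision": REVISION}]), det_id))

        row = cursor.fetchone()

    return bool(row and row.unprocessed)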
def set_ls():
    """List identifier sets."""
    session = DatabaseSession()

    with session.cursor() as cursor:
        cursor.execute("""
            select name, description
              from warehouse.identifier_set
             order by lower(name)
            """)

        sets = list(cursor)

    # Line up names nicely into a column
    template = "{:<%d}" % (max((len(s.name) for s in sets), default = 0) + 3)

    for set in sets:
        click.secho(template.format(set.name), bold = True, nl = False)
        click.echo(set.description)
def labels(filename, layout: str = 'default'):
    """
    Make barcode labels for an existing batch of identifiers.

    A PDF of printable barcode labels is generated using the Lab Labels¹
    instance <https://backoffice.seattleflu.org/labels/>.  An alternative
    instance may be used by setting the LABEL_API environment variable to the
    instance URL.

    The batch of identifiers to make labels for is selected interactively
    based on the identifier set and time of original generation.

    If --layout is requested, the printable barcode labels will use the given
    version of the layout, if available.

    ¹ https://github.com/MullinsLab/Lab-Labels
    """
    session = DatabaseSession()

    # Fetch batches of identifiers
    with session.cursor() as cursor:
        cursor.execute("""
            select identifier_set.name as set_name,
                   to_char(generated, 'FMDD Mon YYYY') as generated_date,
                   generated,
                   count(*)
              from warehouse.identifier
              join warehouse.identifier_set using (identifier_set_id)
             group by generated, identifier_set.name
             order by generated, identifier_set.name
            """)

        batches = list(cursor)

    # Print batches for selection
    click.echo("\nThe following batches of identifiers exist:\n")

    for index, batch in enumerate(batches, 1):
        click.secho(f"{index:2d}) ", bold = True, nl = False)
        click.echo(f"{batch.generated_date:>11} — {batch.count:>5,} {batch.set_name}")

    # Which batch do we want?
    choice = click.prompt(
        "\nFor which batch would you like to make labels",
        prompt_suffix = "? ",
        type = click.IntRange(1, len(batches)),
        default = str(len(batches)))

    chosen_batch = batches[int(choice) - 1]

    # Fetch identifiers for the chosen batch
    with session.cursor() as cursor:
        cursor.execute("""
            select uuid, barcode
              from warehouse.identifier
              join warehouse.identifier_set using (identifier_set_id)
             where identifier_set.name = %s
               and generated = %s
            """, (chosen_batch.set_name, chosen_batch.generated))

        identifiers = list(cursor)

    assert len(identifiers) == chosen_batch.count

    label_layout = labelmaker.layout_identifiers(chosen_batch.set_name, identifiers, layout)
    pdf = labelmaker.generate_pdf(label_layout)

    filename.write(pdf)
def etl_longitudinal(*, db: DatabaseSession):
    LOG.debug(f"Starting the longitudinal ETL routine, revision {REVISION}")

    # Fetch and iterate over longitudinal records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same longitudinal records.
    LOG.debug("Fetching unprocessed longitudinal records")

    longitudinal = db.cursor("longitudinal")
    longitudinal.execute("""
        select longitudinal_id as id, document
          from receiving.longitudinal
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for record in longitudinal:
        with db.savepoint(f"longitudinal record {record.id}"):
            LOG.info(f"Processing longitudinal record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document)

            # Check sample exists in database
            if received_sample_identifier:
                sample = find_sample(db, identifier=received_sample_identifier)
            else:
                sample = None

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier=site_identifier(record.document),
                details={"type": record.document['type']})

            # Most of the time we expect to see existing individuals and new
            # encounters.  Encounters we see more than once are presumed to be
            # corrections.
            individual = find_or_create_individual(db,
                identifier=record.document["individual"],
                sex=sex(record.document))

            encounter = upsert_encounter(db,
                identifier=record.document["identifier"],
                encountered=record.document["encountered"],
                individual_id=individual.id,
                site_id=site.id,
                age=age(record.document),
                details=encounter_details(record.document))

            if sample:
                sample = update_sample(db,
                    sample=sample,
                    encounter_id=encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                tract = find_location(db, "tract", str(tract_identifier))
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id=encounter.id,
                    relation="residence",
                    location_id=tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing longitudinal record {record.id}")
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them as
    # an option or config (and validate option choices against what's actually
    # in the database).  We won't want to validate using click.option(),
    # because that would necessitate a database connection simply to run
    # bin/id3c at all.
    #   -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-haarvi",
            "samples-haarvi",
        },
        "rdt": {"collections-fluathome.org"},
    }

    # Fetch and iterate over samples that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Convert sample barcode to full identifier, ensuring it's
            # known and from the correct identifier set.
            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            if not sample_identifier:
                LOG.warning(f"Skipping sample with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            if (manifest_record.document.get("sample_type")
                    and manifest_record.document["sample_type"] == "rdt"):
                assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                     f"not {expected_identifier_sets['rdt']}")
            else:
                assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                     f"not {expected_identifier_sets['samples']}")

            # Optionally, convert the collection barcode to full
            # identifier, ensuring it's known and from the correct
            # identifier set.
            collection_barcode = manifest_record.document.pop("collection", None)
            collection_identifier = find_identifier(db, collection_barcode) if collection_barcode else None

            if collection_barcode and not collection_identifier:
                LOG.warning(f"Skipping sample with unknown collection barcode «{collection_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            assert not collection_identifier \
                or collection_identifier.set_name in expected_identifier_sets["collections"], \
                f"Collection identifier found in set «{collection_identifier.set_name}», not {expected_identifier_sets['collections']}"  # type: ignore

            # Sample collection date
            collection_date = manifest_record.document.get("date")

            # Upsert sample cooperatively with enrollments ETL routine
            #
            # The details document was intentionally modified by two pop()s
            # earlier to remove barcodes that were looked up.  The rationale is
            # that we want just one clear place in the warehouse for each piece
            # of information.
            sample, status = upsert_sample(db,
                identifier              = sample_identifier.uuid,
                collection_identifier   = collection_identifier.uuid if collection_identifier else None,
                collection_date         = collection_date,
                additional_details      = manifest_record.document)

            mark_loaded(db, manifest_record.id,
                status = status,
                sample_id = sample.id)

            LOG.info(f"Finished processing manifest record {manifest_record.id}")
def upsert_sample(db: DatabaseSession,
                  identifier: str,
                  collection_identifier: Optional[str],
                  collection_date: Optional[str],
                  additional_details: dict) -> Tuple[Any, str]:
    """
    Upsert sample by its *identifier* and/or *collection_identifier*.

    An existing sample has its *identifier*, *collection_identifier*, and
    *collection_date* updated, and the provided *additional_details* are
    merged (at the top-level only) into the existing sample details, if any.

    Raises an exception if there is more than one matching sample.
    """
    data = {
        "identifier": identifier,
        "collection_identifier": collection_identifier,
        "collection_date": collection_date,
        "additional_details": Json(additional_details),
    }

    # Look for existing sample(s)
    with db.cursor() as cursor:
        cursor.execute("""
            select sample_id as id, identifier, collection_identifier, encounter_id
              from warehouse.sample
             where identifier = %(identifier)s
                or collection_identifier = %(collection_identifier)s
               for update
            """, data)

        samples = list(cursor)

    # Nothing found → create
    if not samples:
        LOG.info("Creating new sample")
        status = 'created'
        sample = db.fetch_row("""
            insert into warehouse.sample (identifier, collection_identifier, collected, details)
                values (%(identifier)s,
                        %(collection_identifier)s,
                        date_or_null(%(collection_date)s),
                        %(additional_details)s)
            returning sample_id as id, identifier, collection_identifier, encounter_id
            """, data)

    # One found → update
    elif len(samples) == 1:
        status = 'updated'
        sample = samples[0]

        LOG.info(f"Updating existing sample {sample.id}")
        sample = db.fetch_row("""
            update warehouse.sample
               set identifier = %(identifier)s,
                   collection_identifier = %(collection_identifier)s,
                   collected = date_or_null(%(collection_date)s),
                   details = coalesce(details, '{}') || %(additional_details)s
             where sample_id = %(sample_id)s
            returning sample_id as id, identifier, collection_identifier, encounter_id
            """, {**data, "sample_id": sample.id})

        assert sample.id, "Update affected no rows!"

    # More than one found → error
    else:
        raise Exception(f"More than one sample matching sample and/or collection barcodes: {samples}")

    return sample, status
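# Illustrative usage sketch, not part of the original module: upserting a
# sample after resolving its barcodes, mirroring how the manifest ETL above
# calls this helper.  The barcodes and date are hypothetical placeholders;
# find_identifier is the lookup helper used elsewhere in these routines.
def example_upsert_sample(db: DatabaseSession) -> None:
    sample_identifier = find_identifier(db, "AAAAAAAA")        # hypothetical barcode
    collection_identifier = find_identifier(db, "BBBBBBBB")    # hypothetical barcode

    assert sample_identifier, "Unknown sample barcode"

    sample, status = upsert_sample(db,
        identifier              = sample_identifier.uuid,
        collection_identifier   = collection_identifier.uuid if collection_identifier else None,
        collection_date         = "2019-05-13",
        additional_details      = {"note": "example"})

    LOG.info(f"{status} sample {sample.id}")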
def kit_enrollments(*, db: DatabaseSession):
    LOG.debug(f"Starting the kit enrollments ETL routine, revision {ENROLLMENTS_REVISION}")

    expected_barcode_types = {"ScannedSelfSwab", "ManualSelfSwab"}

    LOG.debug("Fetching unprocessed enrollments")

    enrollments = db.cursor("enrollments")
    enrollments.execute("""
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": ENROLLMENTS_REVISION}]),))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Find the encounter that should have been created from this
            # enrollment record by the enrollments ETL.
            encounter = find_encounter(db, enrollment.document["id"])

            # Error out the kit ETL process if no encounter was found.
            # The kit ETL can try again starting with this record next time,
            # with the idea that the encounter will have been created by then.
            if not encounter:
                raise EncounterNotFoundError(
                    f"No encounter with identifier «{enrollment.document['id']}» found")

            # Skip and mark the enrollment document as processed if the
            # encounter found is linked to a site that is not self-test
            if encounter.site != "self-test":
                LOG.debug(f"Found encounter {encounter.id} «{encounter.identifier}» " +
                          f"linked to site «{encounter.site}», not 'self-test'")
                mark_enrollment_processed(db, enrollment.id)
                continue

            for code in enrollment.document["sampleCodes"]:
                barcode = code.get("code")

                # Kit must have a barcode
                if not barcode:
                    LOG.warning(f"No barcode found in sampleCodes {code}")
                    continue

                # Barcode must be of expected barcode type
                if code["type"] not in expected_barcode_types:
                    LOG.debug(f"Skipping barcode with type {code['type']}")
                    continue

                # Convert kit barcode to full identifier
                kit_identifier = find_identifier(db, barcode)

                if not kit_identifier:
                    LOG.warning(f"Skipping kit with unknown barcode «{barcode}»")
                    continue

                if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                    LOG.warning(f"Skipping kit with identifier found in " +
                                f"set «{kit_identifier.set_name}», not {expected_identifier_sets['kits']}")
                    continue

                details = {"type": code["type"]}

                kit, status = upsert_kit_with_encounter(db,
                    identifier=kit_identifier.uuid,
                    encounter_id=encounter.id,
                    additional_details=details)

                if status == "updated":
                    update_kit_samples(db, kit)

            mark_enrollment_processed(db, enrollment.id)

            LOG.info(f"Finished processing enrollment {enrollment.id}")
def kit_manifests(*, db: DatabaseSession):
    LOG.debug(f"Starting the kits manifests ETL routine, revision {MANIFEST_REVISION}")

    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": MANIFEST_REVISION}]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Mark record as skipped if it does not contain a kit-related sample
            if "kit" not in manifest_record.document:
                LOG.info(f"Skipping manifest record {manifest_record.id} without kit data")
                mark_skipped(db, manifest_record.id)
                continue

            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            # Mark record as skipped if it has an unknown sample barcode
            if not sample_identifier:
                LOG.warning(f"Skipping manifest record with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if sample identifier set is unexpected
            if sample_identifier.set_name not in expected_identifier_sets["samples"]:
                LOG.warning(f"Skipping manifest record with sample identifier found in " +
                            f"set «{sample_identifier.set_name}», not {expected_identifier_sets['samples']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Find the sample that should have been created from this
            # manifest record by the manifest ETL.
            sample = find_sample(db, sample_identifier.uuid)

            # Error out the kit ETL process if no sample was found.
            # The kit ETL can try again starting with this record next time,
            # with the idea that the sample will have been created by then.
            if not sample:
                raise SampleNotFoundError(f"No sample with «{sample_identifier.uuid}» found")

            # Mark record as skipped if the sample does not have a known
            # sample type (utm or rdt)
            if sample.type not in {"utm", "rdt"}:
                LOG.info(f"Skipping manifest record {manifest_record.id} " +
                         f"with unknown sample type {sample.type}")
                mark_skipped(db, manifest_record.id)
                continue

            kit_barcode = manifest_record.document.pop("kit")
            kit_identifier = find_identifier(db, kit_barcode)

            # Mark record as skipped if it has an unknown kit barcode
            if not kit_identifier:
                LOG.warning(f"Skipping kit with unknown barcode «{kit_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if kit identifier set is unexpected
            if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                LOG.warning(f"Skipping kit with identifier found in " +
                            f"set «{kit_identifier.set_name}», not {expected_identifier_sets['kits']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Extra data that isn't needed for the kit record and can be
            # removed before adding the manifest document to kit details
            extra_data = ["collection", "sample_type", "aliquot_date", "aliquots", "racks"]
            for key in extra_data:
                manifest_record.document.pop(key, None)

            # Try to find an identifier for the test-strip barcode of rdt samples
            if sample.type == "rdt":
                update_test_strip(db, manifest_record.document)

            kit, status = upsert_kit_with_sample(db,
                identifier=kit_identifier.uuid,
                sample=sample,
                additional_details=manifest_record.document)

            if status == "updated":
                update_sample(db, sample, kit.encounter_id)

            mark_loaded(db, manifest_record.id, status, kit.id)
def etl_presence_absence(*, db: DatabaseSession):
    LOG.debug(f"Starting the presence_absence ETL routine, revision {REVISION}")

    # Fetch and iterate over presence-absence tests that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same presence-absence tests.
    LOG.debug("Fetching unprocessed presence-absence tests")

    presence_absence = db.cursor("presence_absence")
    presence_absence.itersize = 1
    presence_absence.execute("""
        select presence_absence_id as id,
               document,
               received::date as received_date
          from receiving.presence_absence
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for group in presence_absence:
        with db.savepoint(f"presence_absence group {group.id}"):
            LOG.info(f"Processing presence_absence group {group.id}")

            # Samplify will now send documents with a top-level key
            # "samples".  The new format also includes a "chip" key for each
            # sample which is then included in the unique identifier for
            # each presence/absence result.
            #   -Jover, 14 Nov 2019
            try:
                received_samples = group.document["samples"]
            except KeyError as error:
                # Skip documents in the old format because they do not
                # include the "chip" key which is needed for the
                # unique identifier for each result.
                #   -Jover, 14 Nov 2019
                #
                # Also skip the old format to avoid ingesting wrong data from
                # plate-swapped data!  This would lead to 188 samples with the
                # wrong nwgc_id associated with them.
                #   -Jover, 06 Dec 2019
                if (group.document.get("store") is not None
                        or group.document.get("Update") is not None):
                    LOG.info("Skipping presence_absence record that is in old format")
                    mark_processed(db, group.id)
                    continue

                else:
                    raise error from None

            for received_sample in received_samples:
                received_sample_barcode = received_sample.get("investigatorId")
                if not received_sample_barcode:
                    LOG.info(f"Skipping sample «{received_sample['sampleId']}» without SFS barcode")
                    continue

                # Don't go any further if the sample is marked as Failed
                sample_failed = received_sample.get("sampleFailed")
                if sample_failed is True:
                    LOG.info(f"Skipping sample «{received_sample_barcode}» that has been failed")
                    continue

                # Don't go any further if there are no results to import.
                test_results = received_sample["targetResults"]
                if not test_results:
                    LOG.warning(f"Skipping sample «{received_sample_barcode}» without any results")
                    continue

                received_sample_id = str(received_sample["sampleId"])
                chip = received_sample.get("chip")
                extraction_date = received_sample.get("extractionDate")
                assay_name = received_sample.get("assayName")
                assay_date = received_sample.get("assayDate")

                # The assayType field will be removed after Samplify starts
                # sending us OpenArray results with target.clinicalStatus.
                #   -kfay, 28 Dec 2020
                assay_type = received_sample.get("assayType")

                # Guard against empty chip values
                assert chip or "chip" not in received_sample, "Received bogus chip id"

                # Must be current results
                LOG.info(f"Processing sample «{received_sample_barcode}»")

                if not received_sample.get("isCurrentExpressionResult"):
                    LOG.warning(f"Skipping out-of-date results for sample «{received_sample_barcode}»")
                    continue

                # Barcode must match a known identifier
                received_sample_identifier = sample_identifier(db, received_sample_barcode)

                if not received_sample_identifier:
                    LOG.warning(f"Skipping results for sample without a known identifier «{received_sample_barcode}»")
                    continue

                # Track Samplify's internal ids for our samples, which is
                # unfortunately necessary for linking genomic data NWGC also
                # sends.
                sample = update_sample(db,
                    identifier=received_sample_identifier,
                    additional_details=sample_details(received_sample))

                # Finally, process all results.
                for test_result in test_results:
                    test_result_target_id = test_result["geneTarget"]
                    LOG.debug(f"Processing target «{test_result_target_id}» for " +
                              f"sample «{received_sample_barcode}»")

                    # Skip this result if it's actually a non-result
                    present = target_present(test_result)

                    if present is ...:
                        LOG.debug(f"No test result for «{test_result_target_id}», skipping")
                        continue

                    # Most of the time we expect to see existing targets so a
                    # select-first approach makes the most sense to avoid
                    # useless updates.
                    target = find_or_create_target(db,
                        identifier=test_result_target_id,
                        control=target_control(test_result["controlStatus"]))

                    # The unique identifier for each result.  If chip is
                    # applicable, then it's included to differentiate the same
                    # sample being run on multiple chips (uncommon, but it
                    # happens).
                    if chip:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}/{chip}"
                    else:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}"

                    # Most of the time we expect to see new samples and new
                    # presence_absence tests, so an insert-first approach makes
                    # more sense.  Presence-absence tests we see more than once
                    # are presumed to be corrections.
                    upsert_presence_absence(db,
                        identifier=identifier,
                        sample_id=sample.id,
                        target_id=target.id,
                        present=present,
                        details=presence_absence_details(
                            test_result,
                            group.received_date,
                            chip,
                            extraction_date,
                            assay_name,
                            assay_date,
                            assay_type))

            mark_processed(db, group.id)

            LOG.info(f"Finished processing presence_absence group {group.id}")
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them as
    # an option or config (and validate option choices against what's actually
    # in the database).  We won't want to validate using click.option(),
    # because that would necessitate a database connection simply to run
    # bin/id3c at all.
    #   -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-scan-kiosks",
            "collections-haarvi",
            "samples-haarvi",
            "collections-validation",
            "collections-uw-home",
            "collections-uw-observed",
            "collections-household-general",
            "collections-childcare",
            "collections-school-testing-home",
            "collections-school-testing-observed",
            "collections-apple-respiratory",
            "collections-apple-respiratory-serial",
            "collections-adult-family-home-outbreak",
            "collections-workplace-outbreak",
        },
        "rdt": {"collections-fluathome.org"},
    }

    # Fetch and iterate over samples that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # When updating an existing row, update the identifiers only if
            # the record has both the 'sample' and 'collection' keys.
            should_update_identifiers = "sample" in manifest_record.document \
                and "collection" in manifest_record.document

            # Sample collection date
            #
            # Don't pop this entry off the document.  For backwards
            # compatibility reasons, keep it in the document so that 'date'
            # also gets written to the 'details' column in warehouse.sample.
            collected_date = manifest_record.document.get("date", None)

            # Attempt to find barcodes and their related identifiers
            sample_barcode = manifest_record.document.pop("sample", None)
            sample_identifier = find_identifier(db, sample_barcode) if sample_barcode else None

            collection_barcode = manifest_record.document.pop("collection", None)
            collection_identifier = find_identifier(db, collection_barcode) if collection_barcode else None

            # Skip a record if it has no associated barcodes
            if not sample_barcode and not collection_barcode:
                LOG.warning(f"Skipping record «{manifest_record.id}» because it has neither a sample "
                            "barcode nor a collection barcode")
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if it has a sample barcode but the barcode doesn't match an identifier
            if sample_barcode and not sample_identifier:
                LOG.warning(f"Skipping sample with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if it has a collection barcode but the barcode doesn't match an identifier
            if collection_barcode and not collection_identifier:
                LOG.warning(f"Skipping sample with unknown collection barcode «{collection_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if the collection identifier is from an unexpected set
            if collection_identifier and collection_identifier.set_name not in expected_identifier_sets["collections"]:
                LOG.warning(f"Skipping sample because collection identifier found in "
                            f"set «{collection_identifier.set_name}», not {expected_identifier_sets['collections']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Validate the sample identifier and assert if a record fails
            if sample_identifier:
                if (manifest_record.document.get("sample_type")
                        and manifest_record.document["sample_type"] == "rdt"):
                    assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                        (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                         f"not {expected_identifier_sets['rdt']}")
                else:
                    assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                        (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                         f"not {expected_identifier_sets['samples']}")

            # Upsert sample cooperatively with enrollments ETL routine
            #
            # The details document was intentionally modified by two pop()s
            # earlier to remove barcodes that were looked up.  The rationale is
            # that we want just one clear place in the warehouse for each piece
            # of information.
            sample, status = upsert_sample(db,
                update_identifiers=should_update_identifiers,
                identifier=sample_identifier.uuid if sample_identifier else None,
                collection_identifier=collection_identifier.uuid if collection_identifier else None,
                collection_date=collected_date,
                additional_details=manifest_record.document)

            mark_loaded(db, manifest_record.id,
                status=status,
                sample_id=sample.id)

            LOG.info(f"Finished processing manifest record {manifest_record.id}")
def etl_consensus_genome(*, db: DatabaseSession):
    LOG.debug(f"Starting the consensus genome ETL routine, revision {REVISION}")

    # Fetch and iterate over consensus genome records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same consensus genome records.
    LOG.debug("Fetching unprocessed consensus genome records")

    consensus_genome = db.cursor("consensus genome")
    consensus_genome.execute("""
        select consensus_genome_id as id, document, received
          from receiving.consensus_genome
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for record in consensus_genome:
        with db.savepoint(f"consensus genome record {record.id}"):
            LOG.info(f"Processing consensus genome record {record.id}")

            # Verify sample identifier is in the database
            sample = find_sample(db, record.document["sample_identifier"], for_update=False)

            assert sample, f"No sample found with identifier «{record.document['sample_identifier']}»"

            # Most of the time we expect to see existing sequence read sets,
            # but we also want to update the details log.  However, the only
            # unique constraint on the sequence_read_set table is defined
            # within a trigger function, as Postgres does not allow unique
            # constraints on array columns.  Therefore, perform a
            # find-or-create followed by an update to the details column to
            # avoid conflict.
            sequence_read_set = find_or_create_sequence_read_set(db, record.document, sample)

            status = record.document.get("status")

            # Find the matching organism within the warehouse for the
            # reference organism
            organism_name = get_lineage(db, record.document)
            organism = find_organism(db, organism_name)

            assert organism, f"No organism found with name «{organism_name}»"

            # Only upsert genome and genomic sequences if the assembly job
            # was marked as complete.
            if status == 'complete':
                # Most of the time we expect to see new sequences, so an
                # insert-first approach makes the most sense to avoid useless
                # queries.
                genome = upsert_genome(db,
                    sequence_read_set=sequence_read_set,
                    organism=organism,
                    document=record.document)

                for masked_consensus in record.document['masked_consensus']:
                    genomic_sequence = upsert_genomic_sequence(db,
                        genome=genome,
                        masked_consensus=masked_consensus)

            update_sequence_read_set_details(db, sequence_read_set.id, organism, status)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing consensus genome record {record.id}")
def notify(*, action: str):
    LOG.debug(f"Starting the reportable conditions notification routine, revision {REVISION}")

    db = DatabaseSession()

    def webhook(suffix):
        return os.environ.get("SLACK_WEBHOOK_ALERTS_TEST") \
            or os.environ[f"SLACK_WEBHOOK_REPORTING_{suffix}"]

    slack_webhooks = {
        "ncov-reporting": webhook("HCOV19"),
    }

    projects = [
        {
            "collection_sets": {"collections-childcare"},
            "slack_channel_name": "ncov-reporting-childcare",
            "slack_webhook": webhook("HCOV19_CHILDCARE"),
        },
        {
            "collection_sets": {"collections-apple-respiratory"},
            "slack_channel_name": "ncov-reporting-apple",
            "slack_webhook": webhook("HCOV19_APPLE"),
        },
        {
            "collection_sets": {
                "collections-school-testing-home",
                "collections-school-testing-observed",
                "collections-radxup-yakima-schools-home",
                "collections-radxup-yakima-schools-observed",
            },
            "slack_channel_name": "ncov-reporting-schools",
            "slack_webhook": webhook("HCOV19_SCHOOLS"),
        },
        {
            "collection_sets": {"collections-adult-family-home-outbreak"},
            "slack_channel_name": "ncov-reporting-afh",
            "slack_webhook": webhook("HCOV19_AFH"),
        },
        {
            "collection_sets": {"collections-workplace-outbreak"},
            "slack_channel_name": "ncov-reporting-workplace",
            "slack_webhook": webhook("HCOV19_WORKPLACE"),
        },
    ]

    # Fetch and iterate over reportable condition records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same reportable condition records.
    LOG.debug("Fetching unprocessed reportable conditions records")

    reportable_conditions = db.cursor("reportable_conditions")
    reportable_conditions.execute("""
        select reportable_condition_v1.*, presence_absence_id as id
          from shipping.reportable_condition_v1
          join warehouse.presence_absence using (presence_absence_id)
         where details @> %s is not true
         order by id
           for update of presence_absence;
        """, (Json({"reporting_log": [{"revision": REVISION}]}),))

    processed_without_error = None

    try:
        for record in reportable_conditions:
            with db.savepoint(f"reportable condition presence_absence_id {record.id}"):
                LOG.info(f"Processing reportable condition, presence_absence_id «{record.id}»")

                if not record.site:
                    LOG.info(f"No site found for presence_absence_id «{record.id}». " +
                             "Inferring site from manifest data.")

                responses = {
                    'ncov-reporting': send_slack_post_request(record, slack_webhooks['ncov-reporting']),
                }

                # Also send study-specific results to their respective channels
                for project in projects:
                    if record.collection_set_name in project['collection_sets']:
                        responses[project['slack_channel_name']] = \
                            send_slack_post_request(record, project['slack_webhook'])

                # Check that all POSTs to Slack were successful before marking
                # the record as processed.  This does mean that if one fails
                # but others succeed, there will be duplicate POSTs to the
                # already succeeded channels.  The chance of this happening is
                # pretty small, but we can revisit this if it becomes a common
                # problem.
                #   -Jover, 21 October 2020
                if all(response.status_code == 200 for response in responses.values()):
                    mark_processed(db, record.id, {"status": "sent Slack notification"})
                    LOG.info(f"Finished processing presence_absence_id «{record.id}»")
                else:
                    for channel, response in responses.items():
                        if response.status_code != 200:
                            LOG.error(
                                "Error: A Slack notification could not "
                                f"be sent to the channel «{channel}» for "
                                f"presence_absence_id «{record.id}».\n"
                                f"Slack API returned status code {response.status_code}: "
                                f"{response.text}")

    except Exception as error:
        processed_without_error = False

        LOG.error(f"Aborting with error: {error}")
        raise error from None

    else:
        processed_without_error = True

    finally:
        if action == "prompt":
            ask_to_commit = \
                "Commit all changes?" if processed_without_error else \
                "Commit successfully processed reportable condition records up to this point?"

            commit = click.confirm(ask_to_commit)
        else:
            commit = action == "commit"

        if commit:
            LOG.info(
                "Committing all changes" if processed_without_error else
                "Committing successfully processed reportable condition records up to this point")

            db.commit()

        else:
            LOG.info("Rolling back all changes; the database will not be modified")
            db.rollback()
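# Minimal sketch, not the project's actual implementation: the real
# send_slack_post_request() used above is defined elsewhere in this codebase
# and builds a richer message.  This hypothetical stand-in only illustrates the
# contract notify() relies on: POST to an incoming-webhook URL and return the
# requests.Response whose status_code is checked before marking the record as
# processed.
import requests

def example_send_slack_post_request(record: Any, slack_webhook_url: str) -> requests.Response:
    message = {
        "text": f"Reportable condition for presence_absence_id «{record.id}»",
    }
    return requests.post(slack_webhook_url, json = message, timeout = 10)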
def decorated(*args, db: DatabaseSession, log_output: bool, **kwargs):
    LOG.debug(f"Starting the REDCap DET ETL routine {name}, revision {revision}")

    redcap_det = db.cursor(f"redcap-det {name}")
    redcap_det.execute("""
        select redcap_det_id as id, document
          from receiving.redcap_det
         where not processing_log @> %s
           and document::jsonb @> %s
         order by id
           for update
        """, (Json([etl_id]), Json(det_contains)))

    with pickled_cache(CACHE_FILE) as cache:
        for det in redcap_det:
            with db.savepoint(f"redcap_det {det.id}"):
                LOG.info(f"Processing REDCap DET {det.id}")

                instrument = det.document['instrument']

                # Only pull REDCap record if `include_incomplete` flag was not
                # included and the current instrument is complete
                if not include_incomplete and not is_complete(instrument, det.document):
                    LOG.debug(f"Skipping incomplete or unverified REDCap DET {det.id}")
                    mark_skipped(db, det.id, etl_id)
                    continue

                redcap_record = get_redcap_record_from_det(det.document, raw_coded_values)

                if not redcap_record:
                    LOG.debug(f"REDCap record is missing or invalid. Skipping REDCap DET {det.id}")
                    mark_skipped(db, det.id, etl_id)
                    continue

                # Only process REDCap record if all required instruments are complete
                incomplete_instruments = {
                    instrument
                    for instrument in required_instruments
                    if not is_complete(instrument, redcap_record)
                }

                if incomplete_instruments:
                    LOG.debug(f"The following required instruments «{incomplete_instruments}» are not yet marked complete. " +
                              f"Skipping REDCap DET {det.id}")
                    mark_skipped(db, det.id, etl_id)
                    continue

                bundle = routine(db=db, cache=cache, det=det, redcap_record=redcap_record)

                if not bundle:
                    mark_skipped(db, det.id, etl_id)
                    continue

                if log_output:
                    print(as_json(bundle))

                insert_fhir_bundle(db, bundle)
                mark_loaded(db, det.id, etl_id, bundle['id'])
def etl_enrollments(*, db: DatabaseSession):
    LOG.debug(f"Starting the enrollment ETL routine, revision {REVISION}")

    # Fetch and iterate over enrollments that aren't processed
    #
    # Use a server-side cursor by providing a name.  This ensures we limit how
    # much data we fetch at once, to limit local process size.  Each enrollment
    # document is ~10 KB and the default fetch size (cursor.itersize) is 2,000,
    # thus we'll get ~20 MB on each fetch of 2,000 enrollments.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same enrollments.
    LOG.debug("Fetching unprocessed enrollments")

    enrollments = db.cursor("enrollments")
    enrollments.execute("""
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Out of an abundance of caution, fail when the schema version
            # of the enrollment document changes.  This ensures manual
            # intervention happens on document structure changes.  After
            # seeing how versions are handled over time, this restriction
            # may be toned down a bit.
            known_versions = {"1.1.0", "1.0.0"}

            assert enrollment.document["schemaVersion"] in known_versions, \
                f"Document schema version {enrollment.document['schemaVersion']} is not in {known_versions}"

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier=enrollment.document["site"]["name"],
                details=site_details(enrollment.document["site"]))

            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(db,
                identifier=enrollment.document["participant"],
                sex=assigned_sex(enrollment.document))

            encounter = upsert_encounter(db,
                identifier=enrollment.document["id"],
                encountered=enrollment.document["startTimestamp"],
                individual_id=individual.id,
                site_id=site.id,
                age=age(enrollment.document),
                details=encounter_details(enrollment.document))

            process_samples(db, encounter.id, enrollment.document)
            process_locations(db, encounter.id, enrollment.document)

            mark_processed(db, enrollment.id)

            LOG.info(f"Finished processing enrollment {enrollment.id}")
def etl_clinical(*, db: DatabaseSession):
    LOG.debug(f"Starting the clinical ETL routine, revision {REVISION}")

    # Fetch and iterate over clinical records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same clinical records.
    LOG.debug("Fetching unprocessed clinical records")

    clinical = db.cursor("clinical")
    clinical.execute("""
        select clinical_id as id, document
          from receiving.clinical
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for record in clinical:
        with db.savepoint(f"clinical record {record.id}"):
            LOG.info(f"Processing clinical record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document["barcode"])

            # Skip row if no matching identifier found
            if received_sample_identifier is None:
                LOG.info("Skipping due to unknown barcode " +
                         f"{record.document['barcode']}")
                mark_skipped(db, record.id)
                continue

            # Check sample exists in database
            sample = find_sample(db, identifier = received_sample_identifier)

            # Skip row if sample does not exist
            if sample is None:
                LOG.info("Skipping due to missing sample with identifier " +
                         f"{received_sample_identifier}")
                mark_skipped(db, record.id)
                continue

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = site_identifier(record.document["site"]),
                details    = {"type": "retrospective"})

            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(db,
                identifier  = record.document["individual"],
                sex         = sex(record.document["AssignedSex"]))

            encounter = upsert_encounter(db,
                identifier      = record.document["identifier"],
                encountered     = record.document["encountered"],
                individual_id   = individual.id,
                site_id         = site.id,
                age             = age(record.document),
                details         = encounter_details(record.document))

            sample = update_sample(db,
                sample       = sample,
                encounter_id = encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                # Special-case float-like identifiers in earlier data
                tract_identifier = re.sub(r'\.0$', '', str(tract_identifier))
                tract = find_location(db, "tract", tract_identifier)
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id = encounter.id,
                    relation     = "residence",
                    location_id  = tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing clinical record {record.id}")
def decorated(*args,
              db: DatabaseSession,
              log_output: bool,
              det_limit: int = None,
              redcap_api_batch_size: int,
              geocoding_cache: str = None,
              **kwargs):
    LOG.debug(f"Starting the REDCap DET ETL routine {name}, revision {revision}")

    project = Project(redcap_url, project_id)

    if det_limit:
        LOG.debug(f"Processing up to {det_limit:,} pending DETs")
        limit = sql.Literal(det_limit)
    else:
        LOG.debug("Processing all pending DETs")
        limit = sql.SQL("all")

    redcap_det = db.cursor(f"redcap-det {name}")
    redcap_det.execute(sql.SQL("""
        select redcap_det_id as id, document
          from receiving.redcap_det
         where not processing_log @> %s
           and document::jsonb @> %s
         order by id
         limit {}
           for update
        """).format(limit), (Json([etl_id]), Json(det_contains)))

    # First loop over the DETs to determine how to process each one.
    # Uses `first_complete_dets` to keep track of which DET to use to process
    # a unique REDCap record.
    # Uses `all_dets` to keep track of the status for each DET record so that
    # they can be processed in order of `redcap_det_id` later.
    #   --Jover, 21 May 2020
    first_complete_dets: Dict[str, Any] = {}
    all_dets: List[Dict[str, str]] = []

    for det in redcap_det:
        instrument = det.document['instrument']
        record_id = det.document['record']

        # Assume we are loading all DETs.
        # Status will be updated to "skip" if the DET does not need to be processed.
        det_record = {"id": det.id, "status": "load"}

        # Only pull the REDCap record if the `include_incomplete` flag was not
        # included and the current instrument is complete
        if not include_incomplete and not is_complete(instrument, det.document):
            det_record.update({
                "status": "skip",
                "reason": "incomplete/unverified DET"
            })

        # Check if this record has an older DET.
        # Skip the latest DET in favor of the first DET.
        # This is done to continue the first-in-first-out semantics of our
        # receiving tables.
        elif first_complete_dets.get(record_id):
            det_record.update({
                "status": "skip",
                "reason": "repeat REDCap record"
            })

        else:
            first_complete_dets[record_id] = det
            det_record["record_id"] = record_id

        all_dets.append(det_record)

    if not first_complete_dets:
        LOG.info("No new complete DETs found.")
    else:
        # Batch request records from REDCap
        LOG.info(f"Fetching REDCap project {project_id}")
        record_ids = list(first_complete_dets.keys())

        LOG.info(f"Fetching {len(record_ids):,} REDCap records from project {project.id}")

        # Convert the list of REDCap records to a dict so that records can be
        # looked up by record id.  Records with repeating instruments or
        # longitudinal events will have multiple entries in the list.
        redcap_records: DefaultDict[str, List[dict]] = defaultdict(list)

        batches = list(chunked(record_ids, redcap_api_batch_size))

        for i, batch in enumerate(batches, 1):
            LOG.info(f"Fetching REDCap record batch {i:,}/{len(batches):,} of size {len(batch):,}")

            for record in project.records(ids=batch, raw=raw_coded_values):
                redcap_records[record.id].append(record)

    # Process all DETs in order of redcap_det_id
    with pickled_cache(geocoding_cache) as cache:
        for det in all_dets:
            with db.savepoint(f"redcap_det {det['id']}"):
                LOG.info(f"Processing REDCap DET {det['id']}")

                if det["status"] == "skip":
                    LOG.debug(f"Skipping REDCap DET {det['id']} due to {det['reason']}")
                    mark_skipped(db, det["id"], etl_id, det["reason"])
                    continue

                received_det = first_complete_dets.pop(det["record_id"])

                redcap_record_instances = redcap_records.get(received_det.document["record"])

                if not redcap_record_instances:
                    LOG.debug(f"REDCap record is missing or invalid. Skipping REDCap DET {received_det.id}")
                    mark_skipped(db, received_det.id, etl_id, "invalid REDCap record")
                    continue

                bundle = routine(
                    db=db,
                    cache=cache,
                    det=received_det,
                    redcap_record_instances=redcap_record_instances)

                if not bundle:
                    LOG.debug(f"Skipping REDCap DET {received_det.id} due to insufficient data in REDCap record.")
                    mark_skipped(db, received_det.id, etl_id, "insufficient data in record")
                    continue

                if log_output:
                    print(as_json(bundle))

                insert_fhir_bundle(db, bundle)
                mark_loaded(db, received_det.id, etl_id, bundle['id'])