def upload(sequence_read_set_file, unknown_sample_output, db: DatabaseSession):
    """
    Upload sequence read sets into the database warehouse.

    <sequence-read-set.ndjson> must be a newline-delimited JSON file produced
    by this command's sibling command.

    Sequence read sets with NWGC sample IDs that cannot be found within the
    database warehouse are written out as a newline-delimited JSON file
    <unknown-sample-output.ndjson>.
    """
    for sequence_read_set in sequence_read_set_file:
        sample_set = json.loads(sequence_read_set)
        nwgc_id = sample_set.get("sample")
        urls = sample_set.get("urls")

        with db.savepoint(f"sequence read set {nwgc_id}"):
            LOG.info(f"Processing sequence read set for sample {nwgc_id}")

            sample_id = find_sample(db, nwgc_id)

            if sample_id is None:
                LOG.warning(f"Skipping sample with NWGC ID «{nwgc_id}» "
                            "because it was not found within warehouse.")
                unknown_sample_output.write(sequence_read_set)
                continue

            insert_sequence_read_set(db, sample_id, urls)

        LOG.info(f"Finished uploading sequence read set for sample {nwgc_id}")
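# find_sample() above is defined elsewhere in this module.  A minimal sketch
# of what such a lookup might look like, assuming warehouse.sample keeps the
# NWGC ID in its details JSONB column and that DatabaseSession provides a
# fetch_row() convenience; both are assumptions, not the verbatim id3c API.
def find_sample_sketch(db: DatabaseSession, nwgc_id: int):
    """
    Hypothetical lookup of a warehouse sample by its NWGC sample ID.
    Returns the sample id, or None if no matching sample exists.
    """
    sample = db.fetch_row("""
        select sample_id as id
          from warehouse.sample
         where details @> %s
        """, (Json({"nwgc_id": [nwgc_id]}),))

    return sample.id if sample else None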
def etl_fhir(*, db: DatabaseSession):
    LOG.debug(f"Starting the FHIR ETL routine, revision {REVISION}")

    # Fetch and iterate over FHIR documents that aren't processed.
    #
    # Use a server-side cursor by providing a name, and limit to one fetched
    # record at a time, to limit local process size.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same FHIR documents.
    LOG.debug("Fetching unprocessed FHIR documents")

    fhir_documents = db.cursor("fhir")
    fhir_documents.itersize = 1
    fhir_documents.execute("""
        select fhir_id as id, document
          from receiving.fhir
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for record in fhir_documents:
        with db.savepoint(f"FHIR document {record.id}"):
            LOG.info(f"Processing FHIR document {record.id}")

            assert_bundle_collection(record.document)
            bundle = Bundle(record.document)
            resources = extract_resources(bundle)

            # Loop over every Resource in the Bundle's entries, processing
            # what is needed along the way.
            try:
                assert_required_resource_types_present(resources)
                process_bundle_entries(db, bundle)
            except SkipBundleError as error:
                LOG.warning(f"Skipping bundle in FHIR document «{record.id}»: {error}")
                mark_skipped(db, record.id)
                continue

            mark_processed(db, record.id, {"status": "processed"})
            LOG.info(f"Finished processing FHIR document {record.id}")
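# mark_processed()/mark_skipped() are defined elsewhere.  A minimal sketch of
# the bookkeeping they imply: appending this run's entry to the JSONB
# processing_log so the `not processing_log @> %s` filter above excludes the
# row next time.  The cursor usage and column handling here are illustrative
# assumptions, not the verbatim id3c implementation.
from datetime import datetime, timezone

def mark_processed_sketch(db: DatabaseSession, fhir_id: int, entry: dict) -> None:
    LOG.debug(f"Marking FHIR document {fhir_id} as processed")

    log_entry = {
        **entry,
        "etl": ETL_NAME,
        "revision": REVISION,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.fhir
               set processing_log = processing_log || %s
             where fhir_id = %s
            """, (Json([log_entry]), fhir_id))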
def etl_clinical(*, db: DatabaseSession):
    LOG.debug(f"Starting the clinical ETL routine, revision {REVISION}")

    # Fetch and iterate over clinical records that aren't processed.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same clinical records.
    LOG.debug("Fetching unprocessed clinical records")

    clinical = db.cursor("clinical")
    clinical.execute("""
        select clinical_id as id, document
          from receiving.clinical
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for record in clinical:
        with db.savepoint(f"clinical record {record.id}"):
            LOG.info(f"Processing clinical record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document["barcode"])

            # Skip row if no matching identifier found
            if received_sample_identifier is None:
                LOG.info("Skipping due to unknown barcode "
                         f"{record.document['barcode']}")
                mark_skipped(db, record.id)
                continue

            # Check sample exists in database
            sample = find_sample(db, identifier = received_sample_identifier)

            # Skip row if sample does not exist
            if sample is None:
                LOG.info("Skipping due to missing sample with identifier "
                         f"{received_sample_identifier}")
                mark_skipped(db, record.id)
                continue

            # Most of the time we expect to see existing sites, so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = site_identifier(record.document["site"]),
                details    = {"type": "retrospective"})

            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(db,
                identifier = record.document["individual"],
                sex        = sex(record.document["AssignedSex"]))

            encounter = upsert_encounter(db,
                identifier    = record.document["identifier"],
                encountered   = record.document["encountered"],
                individual_id = individual.id,
                site_id       = site.id,
                age           = age(record.document),
                details       = encounter_details(record.document))

            sample = update_sample(db,
                sample       = sample,
                encounter_id = encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                # Special-case float-like identifiers in earlier data
                tract_identifier = re.sub(r'\.0$', '', str(tract_identifier))

                tract = find_location(db, "tract", tract_identifier)
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id = encounter.id,
                    relation     = "residence",
                    location_id  = tract.id)

            mark_processed(db, record.id, {"status": "processed"})
            LOG.info(f"Finished processing clinical record {record.id}")
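# sample_identifier() resolves a barcode to a full identifier.  A plausible
# sketch, assuming a find_identifier() helper like the one used by the
# manifest ETL below; the exact set-name policy is an assumption.
from typing import Optional

def sample_identifier_sketch(db: DatabaseSession, barcode: str) -> Optional[str]:
    """
    Find the identifier for *barcode*, returning its UUID if the barcode is
    known and belongs to the "samples" identifier set, else None.
    """
    identifier = find_identifier(db, barcode)

    if not identifier:
        return None

    assert identifier.set_name == "samples", \
        f"Identifier found in set «{identifier.set_name}», not «samples»"

    return identifier.uuid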
def etl_presence_absence(*, db: DatabaseSession):
    LOG.debug(f"Starting the presence_absence ETL routine, revision {REVISION}")

    # Fetch and iterate over presence-absence tests that aren't processed.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same presence-absence tests.
    LOG.debug("Fetching unprocessed presence-absence tests")

    presence_absence = db.cursor("presence_absence")
    presence_absence.itersize = 1
    presence_absence.execute("""
        select presence_absence_id as id, document,
               received::date as received_date
          from receiving.presence_absence
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for group in presence_absence:
        with db.savepoint(f"presence_absence group {group.id}"):
            LOG.info(f"Processing presence_absence group {group.id}")

            # Samplify now sends documents with a top-level key "samples".
            # The new format also includes a "chip" key for each sample,
            # which is then included in the unique identifier for each
            # presence/absence result.
            # -Jover, 14 Nov 2019
            try:
                received_samples = group.document["samples"]
            except KeyError as error:
                # Skip documents in the old format because they do not
                # include the "chip" key, which is needed for the unique
                # identifier of each result.
                # -Jover, 14 Nov 2019
                #
                # Also skip the old format to avoid ingesting wrong data from
                # plate-swapped data!  Doing so would lead to 188 samples
                # with the wrong nwgc_id associated with them.
                # -Jover, 06 Dec 2019
                if (group.document.get("store") is not None or
                    group.document.get("Update") is not None):

                    LOG.info("Skipping presence_absence record that is in old format")
                    mark_processed(db, group.id)
                    continue
                else:
                    raise error from None

            for received_sample in received_samples:
                received_sample_barcode = received_sample.get("investigatorId")
                if not received_sample_barcode:
                    LOG.info(f"Skipping sample «{received_sample['sampleId']}» without SFS barcode")
                    continue

                # Don't go any further if the sample is marked as failed.
                sample_failed = received_sample.get("sampleFailed")
                if sample_failed is True:
                    LOG.info(f"Skipping sample «{received_sample_barcode}» that has been failed")
                    continue

                # Don't go any further if there are no results to import.
                test_results = received_sample["targetResults"]
                if not test_results:
                    LOG.warning(f"Skipping sample «{received_sample_barcode}» without any results")
                    continue

                received_sample_id = str(received_sample["sampleId"])
                chip = received_sample.get("chip")
                extraction_date = received_sample.get("extractionDate")
                assay_name = received_sample.get("assayName")
                assay_date = received_sample.get("assayDate")

                # The assayType field will be removed after Samplify starts
                # sending us OpenArray results with target.clinicalStatus.
                # -kfay, 28 Dec 2020
                assay_type = received_sample.get("assayType")

                # Guard against empty chip values
                assert chip or "chip" not in received_sample, "Received bogus chip id"

                LOG.info(f"Processing sample «{received_sample_barcode}»")

                # Results must be current.
                if not received_sample.get("isCurrentExpressionResult"):
                    LOG.warning(f"Skipping out-of-date results for sample «{received_sample_barcode}»")
                    continue

                # Barcode must match a known identifier.
                received_sample_identifier = sample_identifier(db, received_sample_barcode)
                if not received_sample_identifier:
                    LOG.warning(f"Skipping results for sample without a known identifier «{received_sample_barcode}»")
                    continue

                # Track Samplify's internal ids for our samples, which is
                # unfortunately necessary for linking genomic data NWGC also
                # sends.
                sample = update_sample(db,
                    identifier         = received_sample_identifier,
                    additional_details = sample_details(received_sample))

                # Finally, process all results.
                for test_result in test_results:
                    test_result_target_id = test_result["geneTarget"]
                    LOG.debug(f"Processing target «{test_result_target_id}» for "
                              f"sample «{received_sample_barcode}»")

                    # Skip this result if it's actually a non-result.
                    present = target_present(test_result)

                    if present is ...:
                        LOG.debug(f"No test result for «{test_result_target_id}», skipping")
                        continue

                    # Most of the time we expect to see existing targets, so
                    # a select-first approach makes the most sense to avoid
                    # useless updates.
                    target = find_or_create_target(db,
                        identifier = test_result_target_id,
                        control    = target_control(test_result["controlStatus"]))

                    # The unique identifier for each result.  If a chip is
                    # applicable, it's included to differentiate the same
                    # sample being run on multiple chips (uncommon, but it
                    # happens).
                    if chip:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}/{chip}"
                    else:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}"

                    # Most of the time we expect to see new samples and new
                    # presence_absence tests, so an insert-first approach
                    # makes more sense.  Presence-absence tests we see more
                    # than once are presumed to be corrections.
                    upsert_presence_absence(db,
                        identifier = identifier,
                        sample_id  = sample.id,
                        target_id  = target.id,
                        present    = present,
                        details    = presence_absence_details(
                            test_result,
                            group.received_date,
                            chip,
                            extraction_date,
                            assay_name,
                            assay_date,
                            assay_type))

            mark_processed(db, group.id)
            LOG.info(f"Finished processing presence_absence group {group.id}")
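# target_present() reports True/False for definitive results and the
# Ellipsis sentinel (...) for non-results, which the loop above checks with
# «present is ...».  A self-contained sketch; the "presentAbsent" field name
# and its values are assumptions about the Samplify payload.
def target_present_sketch(test_result: dict):
    """
    Returns True or False for a definitive result and ... for a non-result,
    so callers can tell "tested, negative" apart from "no usable result".
    """
    results = {
        "Present": True,
        "Absent": False,
        "Indeterminate": ...,
    }

    return results.get(test_result.get("presentAbsent"), ...)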
def etl_longitudinal(*, db: DatabaseSession):
    LOG.debug(f"Starting the longitudinal ETL routine, revision {REVISION}")

    # Fetch and iterate over longitudinal records that aren't processed.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same longitudinal records.
    LOG.debug("Fetching unprocessed longitudinal records")

    longitudinal = db.cursor("longitudinal")
    longitudinal.execute("""
        select longitudinal_id as id, document
          from receiving.longitudinal
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for record in longitudinal:
        with db.savepoint(f"longitudinal record {record.id}"):
            LOG.info(f"Processing longitudinal record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document)

            # Check sample exists in database
            if received_sample_identifier:
                sample = find_sample(db, identifier=received_sample_identifier)
            else:
                sample = None

            # Most of the time we expect to see existing sites, so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = site_identifier(record.document),
                details    = {"type": record.document['type']})

            # Most of the time we expect to see existing individuals and new
            # encounters.  Encounters we see more than once are presumed to
            # be corrections.
            individual = find_or_create_individual(db,
                identifier = record.document["individual"],
                sex        = sex(record.document))

            encounter = upsert_encounter(db,
                identifier    = record.document["identifier"],
                encountered   = record.document["encountered"],
                individual_id = individual.id,
                site_id       = site.id,
                age           = age(record.document),
                details       = encounter_details(record.document))

            if sample:
                sample = update_sample(db,
                    sample       = sample,
                    encounter_id = encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                tract = find_location(db, "tract", str(tract_identifier))
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id = encounter.id,
                    relation     = "residence",
                    location_id  = tract.id)

            mark_processed(db, record.id, {"status": "processed"})
            LOG.info(f"Finished processing longitudinal record {record.id}")
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them
    # as an option or config (and validate option choices against what's
    # actually in the database).  We won't want to validate using
    # click.option(), because that would necessitate a database connection
    # simply to run bin/id3c at all.
    # -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-haarvi",
            "samples-haarvi",
        },
        "rdt": {"collections-fluathome.org"},
    }

    # Fetch and iterate over samples that aren't processed.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Convert sample barcode to full identifier, ensuring it's
            # known and from the correct identifier set.
            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            if not sample_identifier:
                LOG.warning(f"Skipping sample with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            if (manifest_record.document.get("sample_type") and
                manifest_record.document["sample_type"] == "rdt"):
                assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                     f"not {expected_identifier_sets['rdt']}")
            else:
                assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                     f"not {expected_identifier_sets['samples']}")

            # Optionally, convert the collection barcode to a full
            # identifier, ensuring it's known and from the correct
            # identifier set.
            collection_barcode = manifest_record.document.pop("collection", None)
            collection_identifier = find_identifier(db, collection_barcode) if collection_barcode else None

            if collection_barcode and not collection_identifier:
                LOG.warning(f"Skipping sample with unknown collection barcode «{collection_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            assert not collection_identifier \
                or collection_identifier.set_name in expected_identifier_sets["collections"], \
                f"Collection identifier found in set «{collection_identifier.set_name}», not {expected_identifier_sets['collections']}"  # type: ignore

            # Sample collection date
            collection_date = manifest_record.document.get("date")

            # Upsert sample cooperatively with the enrollments ETL routine.
            #
            # The details document was intentionally modified by the two
            # pop()s earlier to remove barcodes that were looked up.  The
            # rationale is that we want just one clear place in the
            # warehouse for each piece of information.
            sample, status = upsert_sample(db,
                identifier            = sample_identifier.uuid,
                collection_identifier = collection_identifier.uuid if collection_identifier else None,
                collection_date       = collection_date,
                additional_details    = manifest_record.document)

            mark_loaded(db, manifest_record.id,
                status    = status,
                sample_id = sample.id)

            LOG.info(f"Finished processing manifest record {manifest_record.id}")
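# The `not processing_log @> %s` filters in these routines rely on JSONB
# containment: an array contains another array when every element of the
# right-hand side is contained in some element of the left.  A quick psql
# illustration (values are examples only):
#
#   select '[{"etl": "manifest", "revision": 2}]'::jsonb
#       @> '[{"etl": "manifest", "revision": 2}]'::jsonb;   -- true: row is skipped
#
#   select '[{"etl": "manifest", "revision": 1}]'::jsonb
#       @> '[{"etl": "manifest", "revision": 2}]'::jsonb;   -- false: row is processed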
def kit_enrollments(*, db: DatabaseSession):
    LOG.debug(f"Starting the kit enrollments ETL routine, revision {ENROLLMENTS_REVISION}")

    expected_barcode_types = {"ScannedSelfSwab", "ManualSelfSwab"}

    LOG.debug("Fetching unprocessed enrollments")

    enrollments = db.cursor("enrollments")
    enrollments.execute("""
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": ENROLLMENTS_REVISION}]),))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Find the encounter that should have been created from this
            # enrollment record by the enrollments ETL.
            encounter = find_encounter(db, enrollment.document["id"])

            # Error out the kit ETL process if no encounter was found.
            # The kit ETL process can try again, starting with this record,
            # next time, with the idea that the encounter will have been
            # created by then.
            if not encounter:
                raise EncounterNotFoundError(
                    f"No encounter with identifier «{enrollment.document['id']}» found")

            # Skip and mark the enrollment document as processed if the
            # encounter found is linked to a site that is not self-test.
            if encounter.site != "self-test":
                LOG.debug(f"Found encounter {encounter.id} «{encounter.identifier}» "
                          f"linked to site «{encounter.site}», not 'self-test'")
                mark_enrollment_processed(db, enrollment.id)
                continue

            for code in enrollment.document["sampleCodes"]:
                barcode = code.get("code")

                # Kit must have a barcode
                if not barcode:
                    LOG.warning(f"No barcode found in sampleCodes {code}")
                    continue

                # Barcode must be of an expected barcode type
                if code["type"] not in expected_barcode_types:
                    LOG.debug(f"Skipping barcode with type {code['type']}")
                    continue

                # Convert kit barcode to full identifier
                kit_identifier = find_identifier(db, barcode)

                if not kit_identifier:
                    LOG.warning(f"Skipping kit with unknown barcode «{barcode}»")
                    continue

                if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                    LOG.warning(f"Skipping kit with identifier found in "
                                f"set «{kit_identifier.set_name}», not {expected_identifier_sets['kits']}")
                    continue

                details = {"type": code["type"]}

                kit, status = upsert_kit_with_encounter(db,
                    identifier         = kit_identifier.uuid,
                    encounter_id       = encounter.id,
                    additional_details = details)

                if status == "updated":
                    update_kit_samples(db, kit)

            mark_enrollment_processed(db, enrollment.id)
            LOG.info(f"Finished processing enrollment {enrollment.id}")
def kit_manifests(*, db: DatabaseSession):
    LOG.debug(f"Starting the kit manifests ETL routine, revision {MANIFEST_REVISION}")

    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": MANIFEST_REVISION}]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Mark the record as skipped if it does not contain a
            # kit-related sample.
            if "kit" not in manifest_record.document:
                LOG.info(f"Skipping manifest record {manifest_record.id} without kit data")
                mark_skipped(db, manifest_record.id)
                continue

            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            # Mark the record as skipped if it has an unknown sample
            # barcode.
            if not sample_identifier:
                LOG.warning(f"Skipping manifest record with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark the record as skipped if the sample identifier set is
            # unexpected.
            if sample_identifier.set_name not in expected_identifier_sets["samples"]:
                LOG.warning(f"Skipping manifest record with sample identifier found in "
                            f"set «{sample_identifier.set_name}», not {expected_identifier_sets['samples']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Find the sample that should have been created from this
            # manifest record by the manifest ETL.
            sample = find_sample(db, sample_identifier.uuid)

            # Error out the kit ETL process if no sample was found.
            # The kit ETL process can try again, starting with this record,
            # next time, with the idea that the sample will have been
            # created by then.
            if not sample:
                raise SampleNotFoundError(f"No sample with «{sample_identifier.uuid}» found")

            # Mark the record as skipped if the sample does not have a known
            # sample type (utm or rdt).
            if sample.type not in {"utm", "rdt"}:
                LOG.info(f"Skipping manifest record {manifest_record.id} "
                         f"with unknown sample type {sample.type}")
                mark_skipped(db, manifest_record.id)
                continue

            kit_barcode = manifest_record.document.pop("kit")
            kit_identifier = find_identifier(db, kit_barcode)

            # Mark the record as skipped if it has an unknown kit barcode.
            if not kit_identifier:
                LOG.warning(f"Skipping kit with unknown barcode «{kit_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark the record as skipped if the kit identifier set is
            # unexpected.
            if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                LOG.warning(f"Skipping kit with identifier found in "
                            f"set «{kit_identifier.set_name}», not {expected_identifier_sets['kits']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Extra data that isn't needed for the kit record can be removed
            # before adding the manifest document to the kit details.
            extra_data = ["collection", "sample_type", "aliquot_date", "aliquots", "racks"]
            for key in extra_data:
                manifest_record.document.pop(key, None)

            # Try to find an identifier for the test-strip barcode of rdt
            # samples.
            if sample.type == "rdt":
                update_test_strip(db, manifest_record.document)

            kit, status = upsert_kit_with_sample(db,
                identifier         = kit_identifier.uuid,
                sample             = sample,
                additional_details = manifest_record.document)

            if status == "updated":
                update_sample(db, sample, kit.encounter_id)

            mark_loaded(db, manifest_record.id, status, kit.id)
def decorated(*args,
              db: DatabaseSession,
              log_output: bool,
              det_limit: int = None,
              redcap_api_batch_size: int,
              geocoding_cache: str = None,
              **kwargs):
    LOG.debug(f"Starting the REDCap DET ETL routine {name}, revision {revision}")

    project = Project(redcap_url, project_id)

    if det_limit:
        LOG.debug(f"Processing up to {det_limit:,} pending DETs")
        limit = sql.Literal(det_limit)
    else:
        LOG.debug("Processing all pending DETs")
        limit = sql.SQL("all")

    redcap_det = db.cursor(f"redcap-det {name}")
    redcap_det.execute(sql.SQL("""
        select redcap_det_id as id, document
          from receiving.redcap_det
         where not processing_log @> %s
           and document::jsonb @> %s
         order by id
         limit {}
           for update
        """).format(limit), (Json([etl_id]), Json(det_contains)))

    # First loop over the DETs to determine how to process each one.
    # Uses `first_complete_dets` to keep track of which DET to use to
    # process a unique REDCap record.
    # Uses `all_dets` to keep track of the status of each DET record so
    # that they can be processed in order of `redcap_det_id` later.
    # -Jover, 21 May 2020
    first_complete_dets: Dict[str, Any] = {}
    all_dets: List[Dict[str, str]] = []

    for det in redcap_det:
        instrument = det.document['instrument']
        record_id = det.document['record']

        # Assume we are loading all DETs.
        # The status will be updated to "skip" if the DET does not need to
        # be processed.
        det_record = {"id": det.id, "status": "load"}

        # Only pull the REDCap record if the `include_incomplete` flag was
        # not included and the current instrument is complete.
        if not include_incomplete and not is_complete(instrument, det.document):
            det_record.update({
                "status": "skip",
                "reason": "incomplete/unverified DET",
            })

        # Check if this record already has an earlier DET.
        # Skip the latest DET in favor of the first DET.
        # This is done to continue the first-in-first-out semantics of our
        # receiving tables.
        elif first_complete_dets.get(record_id):
            det_record.update({
                "status": "skip",
                "reason": "repeat REDCap record",
            })

        else:
            first_complete_dets[record_id] = det
            det_record["record_id"] = record_id

        all_dets.append(det_record)

    if not first_complete_dets:
        LOG.info("No new complete DETs found.")
    else:
        # Batch request records from REDCap
        LOG.info(f"Fetching REDCap project {project_id}")
        record_ids = list(first_complete_dets.keys())

        LOG.info(f"Fetching {len(record_ids):,} REDCap records from project {project.id}")

        # Convert the list of REDCap records to a dict so that records can
        # be looked up by record id.  Records with repeating instruments or
        # longitudinal events will have multiple entries in the list.
        redcap_records: DefaultDict[str, List[dict]] = defaultdict(list)

        batches = list(chunked(record_ids, redcap_api_batch_size))

        for i, batch in enumerate(batches, 1):
            LOG.info(f"Fetching REDCap record batch {i:,}/{len(batches):,} of size {len(batch):,}")

            for record in project.records(ids=batch, raw=raw_coded_values):
                redcap_records[record.id].append(record)

    # Process all DETs in order of redcap_det_id
    with pickled_cache(geocoding_cache) as cache:
        for det in all_dets:
            with db.savepoint(f"redcap_det {det['id']}"):
                LOG.info(f"Processing REDCap DET {det['id']}")

                if det["status"] == "skip":
                    LOG.debug(f"Skipping REDCap DET {det['id']} due to {det['reason']}")
                    mark_skipped(db, det["id"], etl_id, det["reason"])
                    continue

                received_det = first_complete_dets.pop(det["record_id"])
                redcap_record_instances = redcap_records.get(received_det.document["record"])

                if not redcap_record_instances:
                    LOG.debug(f"REDCap record is missing or invalid. "
                              f"Skipping REDCap DET {received_det.id}")
                    mark_skipped(db, received_det.id, etl_id, "invalid REDCap record")
                    continue

                bundle = routine(
                    db=db,
                    cache=cache,
                    det=received_det,
                    redcap_record_instances=redcap_record_instances)

                if not bundle:
                    LOG.debug(f"Skipping REDCap DET {received_det.id} due to "
                              "insufficient data in REDCap record.")
                    mark_skipped(db, received_det.id, etl_id, "insufficient data in record")
                    continue

                if log_output:
                    print(as_json(bundle))

                insert_fhir_bundle(db, bundle)
                mark_loaded(db, received_det.id, etl_id, bundle['id'])
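# chunked() above comes from more_itertools and drives the batched REDCap
# API requests.  A self-contained illustration of the batching behavior:
from more_itertools import chunked

def demo_batching():
    record_ids = ["1", "2", "3", "4", "5"]

    # With a batch size of 2 this yields [["1", "2"], ["3", "4"], ["5"]];
    # the final batch may be smaller than redcap_api_batch_size.
    assert list(chunked(record_ids, 2)) == [["1", "2"], ["3", "4"], ["5"]]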
def notify(*, action: str):
    LOG.debug(f"Starting the reportable conditions notification routine, revision {REVISION}")

    db = DatabaseSession()

    def webhook(suffix):
        return os.environ.get("SLACK_WEBHOOK_ALERTS_TEST") \
            or os.environ[f"SLACK_WEBHOOK_REPORTING_{suffix}"]

    slack_webhooks = {
        "ncov-reporting": webhook("HCOV19"),
    }

    projects = [
        {
            "collection_sets": {"collections-childcare"},
            "slack_channel_name": "ncov-reporting-childcare",
            "slack_webhook": webhook("HCOV19_CHILDCARE"),
        },
        {
            "collection_sets": {"collections-apple-respiratory"},
            "slack_channel_name": "ncov-reporting-apple",
            "slack_webhook": webhook("HCOV19_APPLE"),
        },
        {
            "collection_sets": {
                "collections-school-testing-home",
                "collections-school-testing-observed",
                "collections-radxup-yakima-schools-home",
                "collections-radxup-yakima-schools-observed",
            },
            "slack_channel_name": "ncov-reporting-schools",
            "slack_webhook": webhook("HCOV19_SCHOOLS"),
        },
        {
            "collection_sets": {"collections-adult-family-home-outbreak"},
            "slack_channel_name": "ncov-reporting-afh",
            "slack_webhook": webhook("HCOV19_AFH"),
        },
        {
            "collection_sets": {"collections-workplace-outbreak"},
            "slack_channel_name": "ncov-reporting-workplace",
            "slack_webhook": webhook("HCOV19_WORKPLACE"),
        },
    ]

    # Fetch and iterate over reportable condition records that aren't
    # processed.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same reportable condition records.
    LOG.debug("Fetching unprocessed reportable conditions records")

    reportable_conditions = db.cursor("reportable_conditions")
    reportable_conditions.execute("""
        select reportable_condition_v1.*, presence_absence_id as id
          from shipping.reportable_condition_v1
          join warehouse.presence_absence using (presence_absence_id)
         where details @> %s is not true
         order by id
           for update of presence_absence;
        """, (Json({"reporting_log": [{"revision": REVISION}]}),))

    processed_without_error = None

    try:
        for record in reportable_conditions:
            with db.savepoint(f"reportable condition presence_absence_id {record.id}"):
                LOG.info(f"Processing reportable condition, presence_absence_id «{record.id}»")

                if not record.site:
                    LOG.info(f"No site found for presence_absence_id «{record.id}». "
                             "Inferring site from manifest data.")

                responses = {
                    'ncov-reporting': send_slack_post_request(
                        record, slack_webhooks['ncov-reporting'])
                }

                # Also send study-specific results to their respective
                # channels.
                for project in projects:
                    if record.collection_set_name in project['collection_sets']:
                        responses[project['slack_channel_name']] = send_slack_post_request(
                            record, project['slack_webhook'])

                # Check that all POSTs to Slack were successful before
                # marking the record as processed.  This does mean that if
                # one POST fails but others succeed, there will be duplicate
                # POSTs to the channels that already succeeded.  The chance
                # of this happening is pretty small, but we can revisit this
                # if it becomes a common problem.
                # -Jover, 21 October 2020
                if all(response.status_code == 200 for response in responses.values()):
                    mark_processed(db, record.id, {"status": "sent Slack notification"})
                    LOG.info(f"Finished processing presence_absence_id «{record.id}»")
                else:
                    for channel, response in responses.items():
                        if response.status_code != 200:
                            LOG.error("Error: A Slack notification could not "
                                      f"be sent to the channel «{channel}» for "
                                      f"presence_absence_id «{record.id}».\n"
                                      f"Slack API returned status code {response.status_code}: "
                                      f"{response.text}")

    except Exception as error:
        processed_without_error = False

        LOG.error("Aborting with error")
        raise error from None

    else:
        processed_without_error = True

    finally:
        if action == "prompt":
            ask_to_commit = \
                "Commit all changes?" if processed_without_error else \
                "Commit successfully processed reportable condition records up to this point?"

            commit = click.confirm(ask_to_commit)
        else:
            commit = action == "commit"

        if commit:
            LOG.info(
                "Committing all changes" if processed_without_error else
                "Committing successfully processed reportable condition records up to this point")
            db.commit()
        else:
            LOG.info("Rolling back all changes; the database will not be modified")
            db.rollback()
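# send_slack_post_request() is defined elsewhere.  A minimal sketch assuming
# a standard Slack incoming-webhook POST; the real message built from
# *record* is richer than this placeholder text.
import requests

def send_slack_post_request_sketch(record, slack_webhook: str) -> requests.Response:
    """
    POST a notification for *record* to *slack_webhook*, returning the
    response so the caller can check status_code as above.
    """
    payload = {
        "text": f"New reportable condition result, presence_absence_id «{record.id}»",
    }

    return requests.post(slack_webhook, json=payload, timeout=10)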
def decorated(*args, db: DatabaseSession, log_output: bool, **kwargs):
    LOG.debug(f"Starting the REDCap DET ETL routine {name}, revision {revision}")

    redcap_det = db.cursor(f"redcap-det {name}")
    redcap_det.execute("""
        select redcap_det_id as id, document
          from receiving.redcap_det
         where not processing_log @> %s
           and document::jsonb @> %s
         order by id
           for update
        """, (Json([etl_id]), Json(det_contains)))

    with pickled_cache(CACHE_FILE) as cache:
        for det in redcap_det:
            with db.savepoint(f"redcap_det {det.id}"):
                LOG.info(f"Processing REDCap DET {det.id}")

                instrument = det.document['instrument']

                # Only pull the REDCap record if the `include_incomplete`
                # flag was not included and the current instrument is
                # complete.
                if not include_incomplete and not is_complete(instrument, det.document):
                    LOG.debug(f"Skipping incomplete or unverified REDCap DET {det.id}")
                    mark_skipped(db, det.id, etl_id)
                    continue

                redcap_record = get_redcap_record_from_det(det.document, raw_coded_values)

                if not redcap_record:
                    LOG.debug(f"REDCap record is missing or invalid. Skipping REDCap DET {det.id}")
                    mark_skipped(db, det.id, etl_id)
                    continue

                # Only process the REDCap record if all required instruments
                # are complete.
                incomplete_instruments = {
                    instrument
                        for instrument in required_instruments
                        if not is_complete(instrument, redcap_record)
                }

                if incomplete_instruments:
                    LOG.debug(f"The following required instruments «{incomplete_instruments}» "
                              f"are not yet marked complete. Skipping REDCap DET {det.id}")
                    mark_skipped(db, det.id, etl_id)
                    continue

                bundle = routine(db=db, cache=cache, det=det, redcap_record=redcap_record)

                if not bundle:
                    mark_skipped(db, det.id, etl_id)
                    continue

                if log_output:
                    print(as_json(bundle))

                insert_fhir_bundle(db, bundle)
                mark_loaded(db, det.id, etl_id, bundle['id'])
def etl_enrollments(*, db: DatabaseSession):
    LOG.debug(f"Starting the enrollment ETL routine, revision {REVISION}")

    # Fetch and iterate over enrollments that aren't processed.
    #
    # Use a server-side cursor by providing a name.  This ensures we limit
    # how much data we fetch at once, to limit local process size.  Each
    # enrollment document is ~10 KB and the default fetch size
    # (cursor.itersize) is 2,000, thus we'll get ~20 MB on each fetch of
    # 2,000 enrollments.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same enrollments.
    LOG.debug("Fetching unprocessed enrollments")

    enrollments = db.cursor("enrollments")
    enrollments.execute("""
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Out of an abundance of caution, fail when the schema version
            # of the enrollment document changes.  This ensures manual
            # intervention happens on document structure changes.  After
            # seeing how versions are handled over time, this restriction
            # may be toned down a bit.
            known_versions = {"1.1.0", "1.0.0"}

            assert enrollment.document["schemaVersion"] in known_versions, \
                f"Document schema version {enrollment.document['schemaVersion']} is not in {known_versions}"

            # Most of the time we expect to see existing sites, so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = enrollment.document["site"]["name"],
                details    = site_details(enrollment.document["site"]))

            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(db,
                identifier = enrollment.document["participant"],
                sex        = assigned_sex(enrollment.document))

            encounter = upsert_encounter(db,
                identifier    = enrollment.document["id"],
                encountered   = enrollment.document["startTimestamp"],
                individual_id = individual.id,
                site_id       = site.id,
                age           = age(enrollment.document),
                details       = encounter_details(enrollment.document))

            process_samples(db, encounter.id, enrollment.document)
            process_locations(db, encounter.id, enrollment.document)

            mark_processed(db, enrollment.id)
            LOG.info(f"Finished processing enrollment {enrollment.id}")
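# db.savepoint() is what lets each record in these routines fail
# independently: work inside the block is rolled back to the savepoint on
# error instead of poisoning the whole transaction.  A minimal sketch of
# such a context manager over a psycopg2-style connection; id3c's real
# implementation may differ, e.g. in how it sanitizes names or whether it
# suppresses the exception instead of re-raising.
from contextlib import contextmanager

@contextmanager
def savepoint_sketch(connection, name: str):
    # Savepoint names are SQL identifiers; this sketch assumes *name* is
    # safe to embed inside a quoted identifier.
    with connection.cursor() as cursor:
        cursor.execute(f'savepoint "{name}"')
        try:
            yield
        except Exception:
            cursor.execute(f'rollback to savepoint "{name}"')
            raise
        else:
            cursor.execute(f'release savepoint "{name}"')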
def etl_consensus_genome(*, db: DatabaseSession):
    LOG.debug(f"Starting the consensus genome ETL routine, revision {REVISION}")

    # Fetch and iterate over consensus genome records that aren't processed.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same consensus genome records.
    LOG.debug("Fetching unprocessed consensus genome records")

    consensus_genome = db.cursor("consensus genome")
    consensus_genome.execute("""
        select consensus_genome_id as id, document, received
          from receiving.consensus_genome
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for record in consensus_genome:
        with db.savepoint(f"consensus genome record {record.id}"):
            LOG.info(f"Processing consensus genome record {record.id}")

            # Verify that the sample identifier is in the database
            sample = find_sample(db,
                record.document["sample_identifier"], for_update=False)

            assert sample, f"No sample found with identifier «{record.document['sample_identifier']}»"

            # Most of the time we expect to see existing sequence read sets,
            # but we also want to update the details log.  However, the only
            # unique constraint on the sequence_read_set table is defined
            # within a trigger function, as Postgres does not allow unique
            # constraints on array columns.  Therefore, perform a
            # find-or-create followed by an update to the details column to
            # avoid conflict.
            sequence_read_set = find_or_create_sequence_read_set(db, record.document, sample)

            status = record.document.get("status")

            # Find the matching organism within the warehouse for the
            # reference organism
            organism_name = get_lineage(db, record.document)
            organism = find_organism(db, organism_name)

            assert organism, f"No organism found with name «{organism_name}»"

            # Only upsert the genome and genomic sequences if the assembly
            # job was marked as complete.
            if status == 'complete':
                # Most of the time we expect to see new sequences, so an
                # insert-first approach makes the most sense to avoid
                # useless queries.
                genome = upsert_genome(db,
                    sequence_read_set = sequence_read_set,
                    organism          = organism,
                    document          = record.document)

                for masked_consensus in record.document['masked_consensus']:
                    genomic_sequence = upsert_genomic_sequence(db,
                        genome           = genome,
                        masked_consensus = masked_consensus)

            update_sequence_read_set_details(db, sequence_read_set.id, organism, status)

            mark_processed(db, record.id, {"status": "processed"})
            LOG.info(f"Finished processing consensus genome record {record.id}")
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them
    # as an option or config (and validate option choices against what's
    # actually in the database).  We won't want to validate using
    # click.option(), because that would necessitate a database connection
    # simply to run bin/id3c at all.
    # -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-scan-kiosks",
            "collections-haarvi",
            "samples-haarvi",
            "collections-validation",
            "collections-uw-home",
            "collections-uw-observed",
            "collections-household-general",
            "collections-childcare",
            "collections-school-testing-home",
            "collections-school-testing-observed",
            "collections-apple-respiratory",
            "collections-apple-respiratory-serial",
            "collections-adult-family-home-outbreak",
            "collections-workplace-outbreak",
        },
        "rdt": {"collections-fluathome.org"},
    }

    # Fetch and iterate over samples that aren't processed.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # When updating an existing row, update the identifiers only if
            # the record has both the 'sample' and 'collection' keys.
            should_update_identifiers = "sample" in manifest_record.document \
                and "collection" in manifest_record.document

            # Sample collection date.
            #
            # Don't pop this entry off the document.  For backwards
            # compatibility reasons, keep it in the document so that 'date'
            # also gets written to the 'details' column in warehouse.sample.
            collected_date = manifest_record.document.get("date", None)

            # Attempt to find barcodes and their related identifiers
            sample_barcode = manifest_record.document.pop("sample", None)
            sample_identifier = find_identifier(db, sample_barcode) if sample_barcode else None

            collection_barcode = manifest_record.document.pop("collection", None)
            collection_identifier = find_identifier(db, collection_barcode) if collection_barcode else None

            # Skip a record if it has no associated barcodes
            if not sample_barcode and not collection_barcode:
                LOG.warning(f"Skipping record «{manifest_record.id}» because it has neither a sample "
                            "barcode nor a collection barcode")
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if it has a sample barcode but the barcode
            # doesn't match an identifier
            if sample_barcode and not sample_identifier:
                LOG.warning(f"Skipping sample with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if it has a collection barcode but the barcode
            # doesn't match an identifier
            if collection_barcode and not collection_identifier:
                LOG.warning(f"Skipping sample with unknown collection barcode «{collection_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if the collection identifier is from an
            # unexpected set
            if collection_identifier and collection_identifier.set_name not in expected_identifier_sets["collections"]:
                LOG.warning(f"Skipping sample because collection identifier found in "
                            f"set «{collection_identifier.set_name}», not {expected_identifier_sets['collections']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Validate the sample identifier and assert if a record fails
            if sample_identifier:
                if (manifest_record.document.get("sample_type") and
                    manifest_record.document["sample_type"] == "rdt"):
                    assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                        (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                         f"not {expected_identifier_sets['rdt']}")
                else:
                    assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                        (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                         f"not {expected_identifier_sets['samples']}")

            # Upsert sample cooperatively with the enrollments ETL routine.
            #
            # The details document was intentionally modified by the two
            # pop()s earlier to remove barcodes that were looked up.  The
            # rationale is that we want just one clear place in the
            # warehouse for each piece of information.
            sample, status = upsert_sample(db,
                update_identifiers    = should_update_identifiers,
                identifier            = sample_identifier.uuid if sample_identifier else None,
                collection_identifier = collection_identifier.uuid if collection_identifier else None,
                collection_date       = collected_date,
                additional_details    = manifest_record.document)

            mark_loaded(db, manifest_record.id,
                status    = status,
                sample_id = sample.id)

            LOG.info(f"Finished processing manifest record {manifest_record.id}")
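# upsert_sample() and the other upsert_*() helpers implement the
# "insert-first" approach described in the comments above.  A simplified
# sketch using Postgres INSERT ... ON CONFLICT; the real helper also reports
# whether the row was created or updated, and the column and constraint
# names here are illustrative assumptions.
def upsert_sample_sketch(db: DatabaseSession, identifier: str, details: dict):
    return db.fetch_row("""
        insert into warehouse.sample (identifier, details)
        values (%s, %s)
        on conflict (identifier) do update
            set details = sample.details || excluded.details
        returning sample_id as id, identifier
        """, (identifier, Json(details)))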