def find_or_create_site(db: DatabaseSession, identifier: str, details: dict) -> Any:
    """
    Select encounter site by *identifier*, or insert it if it doesn't exist.
    """
    LOG.debug(f"Looking up site «{identifier}»")

    site = db.fetch_row("""
        select site_id as id, identifier
          from warehouse.site
         where identifier = %s
        """, (identifier,))

    if site:
        LOG.info(f"Found site {site.id} «{site.identifier}»")
    else:
        LOG.debug(f"Site «{identifier}» not found, adding")

        data = {
            "identifier": identifier,
            "details": Json(details),
        }

        site = db.fetch_row("""
            insert into warehouse.site (identifier, details)
                values (%(identifier)s, %(details)s)
            returning site_id as id, identifier
            """, data)

        LOG.info(f"Created site {site.id} «{site.identifier}»")

    return site
def update_sequence_read_set_details(db, sequence_read_set_id: int, organism: OrganismRecord, status: str) -> None:
    """
    This function is a workaround to the order-specific unique constraint for
    arrays in Postgres.

    It searches for an existing *sequence_read_set* by its ID and updates the
    details column with the given *entry* details.
    """
    LOG.debug(f"Marking sequence read set data {sequence_read_set_id} as received")

    entry = {organism.lineage: {"status": status}}

    data = {
        "sequence_read_set_id": sequence_read_set_id,
        "log_entry": Json(entry),
    }

    # Postgres allows array columns to be defined as unique, but the ordering of
    # the arrays must be the same for the arrays to match.  We are only
    # interested in matching on array content, not array ordering.  This prevents
    # us from being able to use the built-in unique constraint on urls, thereby
    # preventing `ON CONFLICT` updates to a table when urls is the only unique
    # column in the table.
    with db.cursor() as cursor:
        cursor.execute("""
            update warehouse.sequence_read_set
               set details = coalesce(details, '{}') || %(log_entry)s
             where sequence_read_set_id = %(sequence_read_set_id)s
            """, data)
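# A minimal illustration of the constraint described above (hypothetical, not part
# of the codebase): Postgres compares array values element by element, in order, so
# a unique index on an array column treats re-ordered url lists as distinct values
# and `on conflict (urls)` never matches the existing row.
#
#   select array['a', 'b'] = array['b', 'a'];   -- false
#   select array['a', 'b'] = array['a', 'b'];   -- true
#
# Hence the plain UPDATE keyed on sequence_read_set_id above, instead of an upsert.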
def upsert_sample(db: DatabaseSession, collection_identifier: str, encounter_id: int, details: dict) -> Any:
    """
    Upsert collected sample by its *collection_identifier*.

    The provided *details* are merged (at the top-level only) into the existing
    sample details, if any.
    """
    LOG.debug(f"Upserting sample collection «{collection_identifier}»")

    data = {
        "collection_identifier": collection_identifier,
        "encounter_id": encounter_id,
        "details": Json(details),
    }

    sample = db.fetch_row("""
        insert into warehouse.sample (collection_identifier, encounter_id, details)
            values (%(collection_identifier)s, %(encounter_id)s, %(details)s)

        on conflict (collection_identifier) do update
            set encounter_id = excluded.encounter_id,
                details = coalesce(sample.details, '{}') || %(details)s

        returning sample_id as id, identifier, collection_identifier, encounter_id
        """, data)

    assert sample.id, "Upsert affected no rows!"

    LOG.info(f"Upserted sample {sample.id} with collection identifier «{sample.collection_identifier}»")

    return sample
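# Illustration only (not part of the codebase): the jsonb `||` operator used by these
# upserts replaces whole top-level keys rather than deep-merging nested objects, which
# is what the docstrings mean by "merged (at the top-level only)".
#
#   select '{"a": {"x": 1}, "b": 1}'::jsonb || '{"a": {"y": 2}}'::jsonb;
#   -- {"a": {"y": 2}, "b": 1}     ("a" is replaced wholesale, not merged)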
def update_sample(db: DatabaseSession,
                  identifier: str = None,
                  collection_identifier: str = None,
                  additional_details: dict = None) -> Any:
    """
    Find sample by *identifier* and update with any *additional_details*.

    The provided *additional_details* are merged (at the top-level only) into
    the existing sample details, if any.

    Raises a :class:`SampleNotFoundError` if there is no sample known by
    *identifier*.
    """
    if identifier:
        LOG.debug(f"Looking up sample with identifier «{identifier}»")

        sample = db.fetch_row("""
            select sample_id as id, identifier, details
              from warehouse.sample
             where identifier = %s
               for update
            """, (identifier,))

    elif collection_identifier:
        LOG.debug(f"Looking up sample with collection_identifier «{collection_identifier}»")

        sample = db.fetch_row("""
            select sample_id as id, collection_identifier as identifier, details
              from warehouse.sample
             where collection_identifier = %s
               for update
            """, (collection_identifier,))

    if not sample:
        LOG.error(f"No sample found with identifier «{identifier}» or collection identifier «{collection_identifier}»")
        raise SampleNotFoundError(identifier)

    LOG.info(f"Found sample {sample.id} «{sample.identifier}»")

    if additional_details:
        LOG.info(f"Updating sample {sample.id} «{sample.identifier}» details")

        update_details_nwgc_id(sample, additional_details)

        sample = db.fetch_row("""
            update warehouse.sample
               set details = coalesce(details, '{}') || %s
             where sample_id = %s
            returning sample_id as id, identifier
            """, (Json(additional_details), sample.id))

        assert sample.id, "Updating details affected no rows!"

    return sample
def upsert_presence_absence(db: DatabaseSession,
                            identifier: str,
                            sample_id: int,
                            target_id: int,
                            present: bool,
                            details: dict) -> Any:
    """
    Upsert presence_absence by its *identifier*.

    Confirmed with Samplify that their numeric identifier for each test is
    stable and persistent.
    """
    LOG.debug(f"Upserting presence_absence «{identifier}»")

    data = {
        "identifier": identifier,
        "sample_id": sample_id,
        "target_id": target_id,
        "present": present,
        "details": Json(details),
    }

    presence_absence = db.fetch_row("""
        insert into warehouse.presence_absence (
                identifier,
                sample_id,
                target_id,
                present,
                details)
            values (
                %(identifier)s,
                %(sample_id)s,
                %(target_id)s,
                %(present)s,
                %(details)s)

        on conflict (identifier) do update
            set sample_id = excluded.sample_id,
                target_id = excluded.target_id,
                present   = excluded.present,
                details   = coalesce(presence_absence.details, '{}') || excluded.details

        returning presence_absence_id as id, identifier
        """, data)

    assert presence_absence.id, "Upsert affected no rows!"

    LOG.info(f"Upserted presence_absence {presence_absence.id} «{presence_absence.identifier}»")

    return presence_absence
def upsert_encounter(db: DatabaseSession,
                     identifier: str,
                     encountered: str,
                     individual_id: int,
                     site_id: int,
                     age: Optional[str],
                     details: dict) -> Any:
    """
    Upsert encounter by its *identifier*.
    """
    LOG.debug(f"Upserting encounter «{identifier}»")

    data = {
        "identifier": identifier,
        "encountered": encountered,
        "individual_id": individual_id,
        "site_id": site_id,
        "age": age,
        "details": Json(details),
    }

    encounter = db.fetch_row("""
        insert into warehouse.encounter (
                identifier,
                individual_id,
                site_id,
                encountered,
                age,
                details)
            values (
                %(identifier)s,
                %(individual_id)s,
                %(site_id)s,
                %(encountered)s::timestamp with time zone,
                %(age)s,
                %(details)s)

        on conflict (identifier) do update
            set individual_id = excluded.individual_id,
                site_id       = excluded.site_id,
                encountered   = excluded.encountered,
                age           = excluded.age,
                details       = excluded.details

        returning encounter_id as id, identifier
        """, data)

    assert encounter.id, "Upsert affected no rows!"

    LOG.info(f"Upserted encounter {encounter.id} «{encounter.identifier}»")

    return encounter
def insert_dets(db: DatabaseSession, project: Project, offers: List[dict]):
    """
    Inserts synthetic DETs into ``receiving.redcap_det`` for the REDCap record
    *offers* made for *project*.
    """
    dets = [(Json(det(project, offer, TESTING_INSTRUMENT)),) for offer in offers]

    LOG.info(f"Inserting {len(dets):,} synthetic REDCap DETs for {project}")

    with db.cursor() as cursor:
        execute_values(cursor, """
            insert into receiving.redcap_det (document) values %s
            """, dets)
def etl_fhir(*, db: DatabaseSession):
    LOG.debug(f"Starting the FHIR ETL routine, revision {REVISION}")

    # Fetch and iterate over FHIR documents that aren't processed
    #
    # Use a server-side cursor by providing a name and limit to one fetched
    # record at a time, to limit local process size.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same FHIR documents.
    LOG.debug("Fetching unprocessed FHIR documents")

    fhir_documents = db.cursor("fhir")
    fhir_documents.itersize = 1
    fhir_documents.execute("""
        select fhir_id as id, document
          from receiving.fhir
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for record in fhir_documents:
        with db.savepoint(f"FHIR document {record.id}"):
            LOG.info(f"Processing FHIR document {record.id}")

            assert_bundle_collection(record.document)
            bundle = Bundle(record.document)
            resources = extract_resources(bundle)

            # Loop over every Resource in the Bundle entries, processing what
            # is needed along the way.
            try:
                assert_required_resource_types_present(resources)
                process_bundle_entries(db, bundle)
            except SkipBundleError as error:
                LOG.warning(f"Skipping bundle in FHIR document «{record.id}»: {error}")
                mark_skipped(db, record.id)
                continue

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing FHIR document {record.id}")
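# Illustration only (values are hypothetical): the `not processing_log @> %s` filter
# used above, and in the other ETL routines below, relies on jsonb array containment,
# so a document counts as processed once its processing_log contains an entry matching
# the current ETL name and revision.
#
#   select '[{"etl": "fhir", "revision": 1, "timestamp": "2020-01-01"}]'::jsonb
#            @> '[{"etl": "fhir", "revision": 1}]'::jsonb;   -- true
#
# Bumping REVISION therefore makes previously processed documents eligible again.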
def insert_fhir_bundle(db: DatabaseSession, bundle: dict) -> None:
    """
    Insert FHIR bundles into the receiving area of the database.
    """
    LOG.debug(f"Inserting FHIR bundle «{bundle['id']}»")

    fhir = db.fetch_row("""
        insert into receiving.fhir (document)
            values (%s)
        returning fhir_id as id
        """, (Json(bundle),))

    assert fhir.id, "Insert affected no rows!"

    LOG.info(f"Inserted FHIR document {fhir.id} «{bundle['id']}»")
def mark_processed(db, group_id: int) -> None:
    LOG.debug(f"Marking presence_absence group {group_id} as processed")

    data = {
        "group_id": group_id,
        "log_entry": Json({
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.presence_absence
               set processing_log = processing_log || %(log_entry)s
             where presence_absence_id = %(group_id)s
            """, data)
def mark_processed(db: DatabaseSession, det_id: int, entry = {}) -> None:
    LOG.debug(f"Appending to processing log of REDCap DET record {det_id}")

    data = {
        "det_id": det_id,
        "log_entry": Json({
            **entry,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.redcap_det
               set processing_log = processing_log || %(log_entry)s
             where redcap_det_id = %(det_id)s
            """, data)
def mark_processed(db, clinical_id: int, entry: Mapping) -> None:
    LOG.debug(f"Marking clinical document {clinical_id} as processed")

    data = {
        "clinical_id": clinical_id,
        "log_entry": Json({
            **entry,
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.clinical
               set processing_log = processing_log || %(log_entry)s
             where clinical_id = %(clinical_id)s
            """, data)
def mark_processed(db, fhir_id: int, entry = {}) -> None:
    LOG.debug(f"Marking FHIR document {fhir_id} as processed")

    data = {
        "fhir_id": fhir_id,
        "log_entry": Json({
            **entry,
            "etl": ETL_NAME,
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.fhir
               set processing_log = processing_log || %(log_entry)s
             where fhir_id = %(fhir_id)s
            """, data)
def mark_processed(db, manifest_id: int, entry = {}) -> None:
    LOG.debug(f"Appending to processing log of sample manifest record {manifest_id}")

    data = {
        "manifest_id": manifest_id,
        "log_entry": Json({
            **entry,
            "etl": ETL_NAME,
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.manifest
               set processing_log = processing_log || %(log_entry)s
             where manifest_id = %(manifest_id)s
            """, data)
def mark_processed(db, presence_absence_id: int, entry: Mapping) -> None:
    LOG.debug(dedent(f"""
        Marking reportable condition «{presence_absence_id}» as processed in the
        presence_absence table"""))

    data = {
        "presence_absence_id": presence_absence_id,
        "log_entry": Json({
            **entry,
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update warehouse.presence_absence
               set details = jsonb_insert(
                       '{"reporting_log":[]}' || coalesce(details, '{}'),
                       '{reporting_log, -1}',
                       %(log_entry)s,
                       insert_after => true)
             where presence_absence_id = %(presence_absence_id)s
            """, data)
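# Sketch of what the statement above produces (example values are hypothetical).
# In `'{"reporting_log":[]}' || coalesce(details, '{}')` the right-hand operand of
# `||` wins for duplicate keys, so the empty default array is only used when details
# has no reporting_log yet; jsonb_insert then appends the log entry after the last
# (-1) element of that array.
#
#   details before:  {"reporting_log": [{"revision": 1}]}
#   details after:   {"reporting_log": [{"revision": 1}, {"revision": 2, "timestamp": "…"}]}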
def upsert_genome(db: DatabaseSession,
                  sequence_read_set: SequenceReadSetRecord,
                  organism: OrganismRecord,
                  document: dict) -> GenomeRecord:
    """
    Upsert consensus genomes with the given *sequence_read_set*, *organism*,
    and consensus genome *document*.
    """
    LOG.debug(dedent(f"""
        Upserting genome with sequence read set {sequence_read_set.id},
        organism {organism.id} «{organism.lineage}»"""))

    data = {
        "sample_id": sequence_read_set.sample_id,
        "organism_id": organism.id,
        "sequence_read_set_id": sequence_read_set.id,
        "additional_details": Json(document['summary_stats']),
    }

    genome: GenomeRecord = db.fetch_row("""
        insert into warehouse.consensus_genome (sample_id, organism_id, sequence_read_set_id, details)
            values (%(sample_id)s, %(organism_id)s, %(sequence_read_set_id)s, %(additional_details)s)

        on conflict (sample_id, organism_id, sequence_read_set_id) do update
            set details = %(additional_details)s

        returning consensus_genome_id as id, sample_id, organism_id, sequence_read_set_id
        """, data)

    assert genome.id, "Upsert affected no rows!"

    LOG.info(dedent(f"""
        Upserted genome {genome.id} with sample ID «{genome.sample_id}»,
        organism ID «{genome.organism_id}», and sequence read set ID «{genome.sequence_read_set_id}»
        """))

    return genome
def mark_enrollment_processed(db, enrollment_id: int) -> None:
    LOG.debug(f"Marking enrollment {enrollment_id} as processed")

    data = {
        "enrollment_id": enrollment_id,
        "log_entry": Json({
            "revision": ENROLLMENTS_REVISION,
            "etl": ETL_NAME,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.enrollment
               set processing_log = processing_log || %(log_entry)s
             where enrollment_id = %(enrollment_id)s
            """, data)
def mark_manifest_processed(db, manifest_id: int, entry = {}) -> None:
    LOG.debug(f"Marking manifest {manifest_id} as processed")

    data = {
        "manifest_id": manifest_id,
        "log_entry": Json({
            **entry,
            "revision": MANIFEST_REVISION,
            "etl": ETL_NAME,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.manifest
               set processing_log = processing_log || %(log_entry)s
             where manifest_id = %(manifest_id)s
            """, data)
def find_or_create_individual(db: DatabaseSession, identifier: str, sex: str, details: dict = None) -> Any:
    """
    Select individual by *identifier*, or insert it if it doesn't exist.
    """
    LOG.debug(f"Looking up individual «{identifier}»")

    individual = db.fetch_row("""
        select individual_id as id, identifier
          from warehouse.individual
         where identifier = %s
        """, (identifier,))

    if individual:
        LOG.info(f"Found individual {individual.id} «{individual.identifier}»")
    else:
        LOG.debug(f"Individual «{identifier}» not found, adding")

        data = {
            "identifier": identifier,
            "sex": sex,
            "details": Json(details),
        }

        individual = db.fetch_row("""
            insert into warehouse.individual (identifier, sex, details)
                values (%(identifier)s, %(sex)s, %(details)s)
            returning individual_id as id, identifier
            """, data)

        LOG.info(f"Created individual {individual.id} «{individual.identifier}»")

    return individual
def etl_presence_absence(*, db: DatabaseSession):
    LOG.debug(f"Starting the presence_absence ETL routine, revision {REVISION}")

    # Fetch and iterate over presence-absence tests that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same presence-absence tests.
    LOG.debug("Fetching unprocessed presence-absence tests")

    presence_absence = db.cursor("presence_absence")
    presence_absence.itersize = 1
    presence_absence.execute("""
        select presence_absence_id as id, document,
               received::date as received_date
          from receiving.presence_absence
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for group in presence_absence:
        with db.savepoint(f"presence_absence group {group.id}"):
            LOG.info(f"Processing presence_absence group {group.id}")

            # Samplify will now send documents with a top level key "samples".
            # The new format also includes a "chip" key for each sample which
            # is then included in the unique identifier for each
            # presence/absence result.
            #   -Jover, 14 Nov 2019
            try:
                received_samples = group.document["samples"]
            except KeyError as error:
                # Skip documents in the old format because they do not include
                # the "chip" key which is needed for the unique identifier for
                # each result.
                #   -Jover, 14 Nov 2019
                #
                # Also skip the old format to avoid ingesting wrong data from
                # plate swapped data!  This will lead to 188 samples with the
                # wrong nwgc_id associated with them.
                #   -Jover, 06 Dec 2019
                if (group.document.get("store") is not None or
                        group.document.get("Update") is not None):
                    LOG.info("Skipping presence_absence record that is in old format")
                    mark_processed(db, group.id)
                    continue
                else:
                    raise error from None

            for received_sample in received_samples:
                received_sample_barcode = received_sample.get("investigatorId")
                if not received_sample_barcode:
                    LOG.info(f"Skipping sample «{received_sample['sampleId']}» without SFS barcode")
                    continue

                # Don't go any further if the sample is marked as Failed
                sample_failed = received_sample.get("sampleFailed")
                if sample_failed is True:
                    LOG.info(f"Skipping sample «{received_sample_barcode}» that has been failed")
                    continue

                # Don't go any further if there are no results to import.
                test_results = received_sample["targetResults"]
                if not test_results:
                    LOG.warning(f"Skipping sample «{received_sample_barcode}» without any results")
                    continue

                received_sample_id = str(received_sample["sampleId"])
                chip = received_sample.get("chip")
                extraction_date = received_sample.get("extractionDate")
                assay_name = received_sample.get("assayName")
                assay_date = received_sample.get("assayDate")

                # The assayType field will be removed after Samplify starts
                # sending us OpenArray results with target.clinicalStatus.
                #   -kfay, 28 Dec 2020
                assay_type = received_sample.get("assayType")

                # Guard against empty chip values
                assert chip or "chip" not in received_sample, "Received bogus chip id"

                # Must be current results
                LOG.info(f"Processing sample «{received_sample_barcode}»")

                if not received_sample.get("isCurrentExpressionResult"):
                    LOG.warning(f"Skipping out-of-date results for sample «{received_sample_barcode}»")
                    continue

                # Barcode must match a known identifier
                received_sample_identifier = sample_identifier(db, received_sample_barcode)

                if not received_sample_identifier:
                    LOG.warning(f"Skipping results for sample without a known identifier «{received_sample_barcode}»")
                    continue

                # Track Samplify's internal ids for our samples, which is
                # unfortunately necessary for linking genomic data NWGC also
                # sends.
                sample = update_sample(db,
                    identifier = received_sample_identifier,
                    additional_details = sample_details(received_sample))

                # Finally, process all results.
                for test_result in test_results:
                    test_result_target_id = test_result["geneTarget"]
                    LOG.debug(f"Processing target «{test_result_target_id}» for sample «{received_sample_barcode}»")

                    # Skip this result if it's actually a non-result
                    present = target_present(test_result)

                    if present is ...:
                        LOG.debug(f"No test result for «{test_result_target_id}», skipping")
                        continue

                    # Most of the time we expect to see existing targets so a
                    # select-first approach makes the most sense to avoid
                    # useless updates.
                    target = find_or_create_target(db,
                        identifier = test_result_target_id,
                        control = target_control(test_result["controlStatus"]))

                    # The unique identifier for each result.  If chip is
                    # applicable, then it's included to differentiate the same
                    # sample being run on multiple chips (uncommon, but it
                    # happens).
                    if chip:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}/{chip}"
                    else:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}"

                    # Most of the time we expect to see new samples and new
                    # presence_absence tests, so an insert-first approach makes
                    # more sense.  Presence-absence tests we see more than once
                    # are presumed to be corrections.
                    upsert_presence_absence(db,
                        identifier = identifier,
                        sample_id = sample.id,
                        target_id = target.id,
                        present = present,
                        details = presence_absence_details(
                            test_result,
                            group.received_date,
                            chip,
                            extraction_date,
                            assay_name,
                            assay_date,
                            assay_type))

            mark_processed(db, group.id)

            LOG.info(f"Finished processing presence_absence group {group.id}")
def etl_longitudinal(*, db: DatabaseSession):
    LOG.debug(f"Starting the longitudinal ETL routine, revision {REVISION}")

    # Fetch and iterate over longitudinal records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same longitudinal records.
    LOG.debug("Fetching unprocessed longitudinal records")

    longitudinal = db.cursor("longitudinal")
    longitudinal.execute("""
        select longitudinal_id as id, document
          from receiving.longitudinal
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for record in longitudinal:
        with db.savepoint(f"longitudinal record {record.id}"):
            LOG.info(f"Processing longitudinal record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document)

            # Check sample exists in database
            if received_sample_identifier:
                sample = find_sample(db, identifier = received_sample_identifier)
            else:
                sample = None

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = site_identifier(record.document),
                details    = {"type": record.document['type']})

            # Most of the time we expect to see existing individuals and new
            # encounters.  Encounters we see more than once are presumed to be
            # corrections.
            individual = find_or_create_individual(db,
                identifier = record.document["individual"],
                sex        = sex(record.document))

            encounter = upsert_encounter(db,
                identifier    = record.document["identifier"],
                encountered   = record.document["encountered"],
                individual_id = individual.id,
                site_id       = site.id,
                age           = age(record.document),
                details       = encounter_details(record.document))

            if sample:
                sample = update_sample(db,
                    sample       = sample,
                    encounter_id = encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                tract = find_location(db, "tract", str(tract_identifier))
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id = encounter.id,
                    relation     = "residence",
                    location_id  = tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing longitudinal record {record.id}")
def decorated(*args,
              db: DatabaseSession,
              log_output: bool,
              det_limit: int = None,
              redcap_api_batch_size: int,
              geocoding_cache: str = None,
              **kwargs):
    LOG.debug(f"Starting the REDCap DET ETL routine {name}, revision {revision}")

    project = Project(redcap_url, project_id)

    if det_limit:
        LOG.debug(f"Processing up to {det_limit:,} pending DETs")
        limit = sql.Literal(det_limit)
    else:
        LOG.debug("Processing all pending DETs")
        limit = sql.SQL("all")

    redcap_det = db.cursor(f"redcap-det {name}")
    redcap_det.execute(sql.SQL("""
        select redcap_det_id as id, document
          from receiving.redcap_det
         where not processing_log @> %s
           and document::jsonb @> %s
         order by id
         limit {}
           for update
        """).format(limit), (Json([etl_id]), Json(det_contains)))

    # First loop over the DETs to determine how to process each one.
    # Uses `first_complete_dets` to keep track of which DET to use to process a
    # unique REDCap record.
    # Uses `all_dets` to keep track of the status for each DET record so that
    # they can be processed in order of `redcap_det_id` later.
    #   --Jover, 21 May 2020
    first_complete_dets: Dict[str, Any] = {}
    all_dets: List[Dict[str, str]] = []

    for det in redcap_det:
        instrument = det.document['instrument']
        record_id = det.document['record']

        # Assume we are loading all DETs.
        # Status will be updated to "skip" if the DET does not need to be processed.
        det_record = {"id": det.id, "status": "load"}

        # Only pull the REDCap record if the `include_incomplete` flag was not
        # included and the current instrument is complete.
        if not include_incomplete and not is_complete(instrument, det.document):
            det_record.update({
                "status": "skip",
                "reason": "incomplete/unverified DET"
            })

        # Check if this record has an older DET.
        # Skip the latest DET in favor of the first DET.  This is done to
        # continue the first-in-first-out semantics of our receiving tables.
        elif first_complete_dets.get(record_id):
            det_record.update({
                "status": "skip",
                "reason": "repeat REDCap record"
            })

        else:
            first_complete_dets[record_id] = det
            det_record["record_id"] = record_id

        all_dets.append(det_record)

    if not first_complete_dets:
        LOG.info("No new complete DETs found.")
    else:
        # Batch request records from REDCap
        LOG.info(f"Fetching REDCap project {project_id}")
        record_ids = list(first_complete_dets.keys())

        LOG.info(f"Fetching {len(record_ids):,} REDCap records from project {project.id}")

        # Convert the list of REDCap records to a dict so that records can be
        # looked up by record id.  Records with repeating instruments or
        # longitudinal events will have multiple entries in the list.
        redcap_records: DefaultDict[str, List[dict]] = defaultdict(list)

        batches = list(chunked(record_ids, redcap_api_batch_size))

        for i, batch in enumerate(batches, 1):
            LOG.info(f"Fetching REDCap record batch {i:,}/{len(batches):,} of size {len(batch):,}")
            for record in project.records(ids = batch, raw = raw_coded_values):
                redcap_records[record.id].append(record)

    # Process all DETs in order of redcap_det_id
    with pickled_cache(geocoding_cache) as cache:
        for det in all_dets:
            with db.savepoint(f"redcap_det {det['id']}"):
                LOG.info(f"Processing REDCap DET {det['id']}")

                if det["status"] == "skip":
                    LOG.debug(f"Skipping REDCap DET {det['id']} due to {det['reason']}")
                    mark_skipped(db, det["id"], etl_id, det["reason"])
                    continue

                received_det = first_complete_dets.pop(det["record_id"])

                redcap_record_instances = redcap_records.get(received_det.document["record"])

                if not redcap_record_instances:
                    LOG.debug(f"REDCap record is missing or invalid. Skipping REDCap DET {received_det.id}")
                    mark_skipped(db, received_det.id, etl_id, "invalid REDCap record")
                    continue

                bundle = routine(
                    db = db,
                    cache = cache,
                    det = received_det,
                    redcap_record_instances = redcap_record_instances)

                if not bundle:
                    LOG.debug(f"Skipping REDCap DET {received_det.id} due to insufficient data in REDCap record.")
                    mark_skipped(db, received_det.id, etl_id, "insufficient data in record")
                    continue

                if log_output:
                    print(as_json(bundle))

                insert_fhir_bundle(db, bundle)

                mark_loaded(db, received_det.id, etl_id, bundle['id'])
def upsert_sample(db: DatabaseSession,
                  identifier: str,
                  collection_identifier: Optional[str],
                  collection_date: Optional[str],
                  additional_details: dict) -> Tuple[Any, str]:
    """
    Upsert sample by its *identifier* and/or *collection_identifier*.

    An existing sample has its *identifier*, *collection_identifier*, and
    *collection_date* updated, and the provided *additional_details* are merged
    (at the top-level only) into the existing sample details, if any.

    Raises an exception if there is more than one matching sample.
    """
    data = {
        "identifier": identifier,
        "collection_identifier": collection_identifier,
        "collection_date": collection_date,
        "additional_details": Json(additional_details),
    }

    # Look for existing sample(s)
    with db.cursor() as cursor:
        cursor.execute("""
            select sample_id as id, identifier, collection_identifier, encounter_id
              from warehouse.sample
             where identifier = %(identifier)s
                or collection_identifier = %(collection_identifier)s
               for update
            """, data)

        samples = list(cursor)

    # Nothing found → create
    if not samples:
        LOG.info("Creating new sample")
        status = 'created'
        sample = db.fetch_row("""
            insert into warehouse.sample (identifier, collection_identifier, collected, details)
                values (%(identifier)s,
                        %(collection_identifier)s,
                        date_or_null(%(collection_date)s),
                        %(additional_details)s)
            returning sample_id as id, identifier, collection_identifier, encounter_id
            """, data)

    # One found → update
    elif len(samples) == 1:
        status = 'updated'
        sample = samples[0]

        LOG.info(f"Updating existing sample {sample.id}")

        sample = db.fetch_row("""
            update warehouse.sample
               set identifier = %(identifier)s,
                   collection_identifier = %(collection_identifier)s,
                   collected = date_or_null(%(collection_date)s),
                   details = coalesce(details, '{}') || %(additional_details)s
             where sample_id = %(sample_id)s
            returning sample_id as id, identifier, collection_identifier, encounter_id
            """, { **data, "sample_id": sample.id })

        assert sample.id, "Update affected no rows!"

    # More than one found → error
    else:
        raise Exception(f"More than one sample matching sample and/or collection barcodes: {samples}")

    return sample, status
def import_(features_path,
            scale,
            scale_from,
            hierarchy,
            hierarchy_by_feature,
            hierarchy_from,
            identifier_from,
            point_from,
            simplified_polygons_path,
            if_exists_action):
    """
    Import locations from GeoJSON, Shapefile, or tabular delimited text
    (CSV/TSV).

    Run `id3c location --help` for general information about locations.

    Many common geospatial formats are supported via Fiona
    <https://fiona.readthedocs.io/en/latest/README.html>, including:

    \b
    * GeoJSON (top-level type must be ``FeatureCollection``)
    * Shapefiles (*.shp with sidecar *.dbf file; may be zipped)
    * tabular (CSV or TSV, delimiter auto-detected, no geometry construction)

    If a tabular data file is provided, at least one of --point-from or
    --simplified-polygons should be used to provide a geometry.

    Each location's (scale, identifier) pair is automatically added to its
    hierarchy during import.  This is a non-configurable convention that could
    be made optional in the future.
    """
    # First, a handful of accessor and transformation functions to make the
    # code that follows more readable.  These functions make use of the command
    # options.
    def identifier(feature):
        if identifier_from:
            return feature["properties"].pop(identifier_from)
        else:
            return feature["id"]

    def geometry_type(feature):
        return feature["geometry"]["type"] if feature["geometry"] else None

    def point(feature):
        if point_from:
            return {
                "type": "Point",
                "coordinates": (
                    float(feature["properties"].pop(point_from[0])),
                    float(feature["properties"].pop(point_from[1])),
                )
            }
        elif geometry_type(feature) == "Point":
            return feature["geometry"]
        else:
            return None

    def polygon(feature):
        if geometry_type(feature) in {"Polygon", "MultiPolygon"}:
            return feature["geometry"]
        else:
            return None

    def get_hierarchy(feature, feature_identifier):
        if hierarchy_df is None:
            return hierarchy or feature["properties"].pop(hierarchy_from, None)

        hierarchy_list = hierarchy_df.loc[hierarchy_df['feature_identifier'] == feature_identifier].to_dict('record')

        hierarchies = ''
        if hierarchy_list:
            # This assumes only one row has a matching feature_identifier
            hierarchy_map = hierarchy_list[0]
            hierarchy_map.pop('feature_identifier')
            for key, val in hierarchy_map.items():
                if pd.notna(val):
                    hierarchies += (key + '=>' + val + ',')

        return hierarchies + hierarchy

    # Technically PostGIS' SRIDs don't have to match the EPSG id, but in my
    # experience, they always do in practice.  If that doesn't hold true in the
    # future, then a lookup of (auth_name, auth_id) in the spatial_ref_sys table
    # will be needed to map to srid.
    #   -trs, 2 Dec 2019
    def as_location(feature):
        feature_identifier = identifier(feature)
        return {
            "scale": scale or feature["properties"].pop(scale_from),
            "identifier": feature_identifier,
            "hierarchy": get_hierarchy(feature, feature_identifier),
            "point": point(feature),
            "polygon": polygon(feature),
            "srid": feature["crs"]["EPSG"],
            "details": feature["properties"],
        }

    def as_simplified(feature):
        return {
            "identifier": identifier(feature),
            "polygon": polygon(feature),
            "srid": feature["crs"]["EPSG"],
        }

    # Now, read in the data files and convert to our internal structure.
    LOG.info(f"Reading features from «{features_path}»")

    hierarchy_df = None

    if hierarchy_by_feature:
        hierarchy_df = pd.read_csv(hierarchy_by_feature, dtype=str)

        if "feature_identifier" not in hierarchy_df.columns:
            raise Exception("hierarchy_by_feature CSV must include 'feature_identifier' column")

        duplicated_feature_identifier = hierarchy_df["feature_identifier"].duplicated(keep=False)
        duplicates = hierarchy_df["feature_identifier"][duplicated_feature_identifier]
        dup_identifiers = list(duplicates.unique())
        assert len(dup_identifiers) == 0, f"Found duplicate feature_identifiers: {dup_identifiers}"

    locations = list(map(as_location, parse_features(features_path)))

    if simplified_polygons_path:
        LOG.info(f"Reading simplified polygons from «{simplified_polygons_path}»")
        simplified_polygons = list(map(as_simplified, parse_features(simplified_polygons_path)))
    else:
        simplified_polygons = []

    # Finally, do the updates in the database
    db = DatabaseSession()

    try:
        LOG.info("Importing locations")

        insert = SQL("""
            with new_location as (
                select scale,
                       identifier,
                       coalesce(lower(hierarchy)::hstore, '') || hstore(lower(scale), lower(identifier)) as hierarchy,
                       st_transform(st_setsrid(st_geomfromgeojson(point), location.srid), 4326) as point,
                       st_transform(st_setsrid(st_multi(st_geomfromgeojson(location.polygon)), location.srid), 4326) as polygon,
                       st_transform(st_setsrid(st_multi(st_geomfromgeojson(simplified.polygon)), simplified.srid), 4326) as simplified_polygon,
                       details
                  from jsonb_to_recordset(%s)
                    as location ( scale text
                                , identifier text
                                , hierarchy text
                                , point text
                                , polygon text
                                , srid integer
                                , details jsonb
                                )
                  left join jsonb_to_recordset(%s)
                    as simplified ( identifier text
                                  , polygon text
                                  , srid integer
                                  )
                    using (identifier)
            ),
            inserted as (
                insert into warehouse.location (scale, identifier, hierarchy, point, polygon, simplified_polygon, details)
                table new_location
                {on_conflict}
                returning scale, identifier
            )
            select count(*) filter (where inserted is not null),
                   count(*) filter (where inserted is not null and point is not null) as with_point,
                   count(*) filter (where inserted is not null and polygon is not null) as with_polygon,
                   count(*) filter (where inserted is not null and simplified_polygon is not null) as with_simplified_polygon,
                   count(*) - count(*) filter (where inserted is not null) as skipped
              from new_location
              left join inserted using (scale, identifier)
            """)

        on_conflict = {
            "error": SQL(""),

            "update": SQL("""
                on conflict (scale, identifier) do update
                    set hierarchy          = EXCLUDED.hierarchy,
                        point              = EXCLUDED.point,
                        polygon            = EXCLUDED.polygon,
                        simplified_polygon = EXCLUDED.simplified_polygon,
                        details            = EXCLUDED.details
                """),

            "skip": SQL("on conflict (scale, identifier) do nothing"),
        }

        imported = db.fetch_row(
            insert.format(on_conflict = on_conflict[if_exists_action]),
            (Json(locations), Json(simplified_polygons)))

        LOG.info(dedent(f"""\
            Imported {imported.count:,} locations
              {imported.with_point:,} with a point
              {imported.with_polygon:,} with a polygon
              {imported.with_simplified_polygon:,} with a simplified polygon
              {imported.skipped:,} skipped
            """))

        LOG.info("Committing all changes")
        db.commit()

    except:
        LOG.info("Rolling back all changes; the database will not be modified")
        db.rollback()
        raise
def kit_manifests(*, db: DatabaseSession):
    LOG.debug(f"Starting the kits manifests ETL routine, revision {MANIFEST_REVISION}")

    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": MANIFEST_REVISION}]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Mark record as skipped if it does not contain a kit related sample
            if "kit" not in manifest_record.document:
                LOG.info(f"Skipping manifest record {manifest_record.id} without kit data")
                mark_skipped(db, manifest_record.id)
                continue

            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            # Mark record as skipped if it has an unknown sample barcode
            if not sample_identifier:
                LOG.warning(f"Skipping manifest record with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if the sample identifier set is unexpected
            if sample_identifier.set_name not in expected_identifier_sets["samples"]:
                LOG.warning(
                    f"Skipping manifest record with sample identifier found in " +
                    f"set «{sample_identifier.set_name}», not {expected_identifier_sets['samples']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Find the sample that should have been created from this manifest
            # record via the manifest ETL
            sample = find_sample(db, sample_identifier.uuid)

            # Error out the kit ETL process if no sample found.  The kit ETL
            # process can try again starting with this record next time, with
            # the idea that the sample will be created by then.
            if not sample:
                raise SampleNotFoundError(f"No sample with «{sample_identifier.uuid}» found")

            # Mark record as skipped if the sample does not have a
            # sample type (utm or rdt)
            if sample.type not in {"utm", "rdt"}:
                LOG.info(f"Skipping manifest record {manifest_record.id} " +
                         f"with unknown sample type {sample.type}")
                mark_skipped(db, manifest_record.id)
                continue

            kit_barcode = manifest_record.document.pop("kit")
            kit_identifier = find_identifier(db, kit_barcode)

            # Mark record as skipped if it has an unknown kit barcode
            if not kit_identifier:
                LOG.warning(f"Skipping kit with unknown barcode «{kit_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if the kit identifier set is unexpected
            if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                LOG.warning(
                    f"Skipping kit with identifier found in " +
                    f"set «{kit_identifier.set_name}» not {expected_identifier_sets['kits']}")
                mark_skipped(db, manifest_record.id)
                continue

            # List of extra data not needed for the kit record that can be
            # removed before adding the manifest document to kit details
            extra_data = ["collection", "sample_type", "aliquot_date", "aliquots", "racks"]
            for key in extra_data:
                manifest_record.document.pop(key, None)

            # Try to find an identifier for the test-strip barcode for rdt samples
            if sample.type == "rdt":
                update_test_strip(db, manifest_record.document)

            kit, status = upsert_kit_with_sample(db,
                identifier = kit_identifier.uuid,
                sample = sample,
                additional_details = manifest_record.document)

            if status == "updated":
                update_sample(db, sample, kit.encounter_id)

            mark_loaded(db, manifest_record.id, status, kit.id)
def upsert_kit_with_sample(db: DatabaseSession,
                           identifier: str,
                           sample: SampleRecord,
                           additional_details: dict) -> Tuple[KitRecord, str]:
    """
    Upsert kit by its *identifier* to include a link to a sample.

    An existing kit has its *sample_id* updated, and the provided
    *additional_details* are merged (at the top level only) into the existing
    kit details, if any.
    """
    LOG.debug(f"Upserting kit «{identifier}»")

    data = {
        "identifier": identifier,
        "sample_id": sample.id,
        "additional_details": Json(additional_details),
    }

    if sample.type == 'utm':
        sample_type = "utm_sample_id"
    elif sample.type == 'rdt':
        sample_type = "rdt_sample_id"

    # Look for existing kit
    kit = find_kit(db, identifier)

    # Nothing found → create
    if not kit:
        LOG.info("Creating new kit")
        status = "created"
        kit = db.fetch_row(sql.SQL("""
            insert into warehouse.kit (identifier, {}, details)
                values (%(identifier)s, %(sample_id)s, %(additional_details)s)
            returning kit_id as id, identifier, encounter_id, {}
            """).format(sql.Identifier(sample_type), sql.Identifier(sample_type)), data)

    # Found kit → update
    else:
        status = "updated"
        kit_sample_id = getattr(kit, sample_type)

        # Warn if kit is already linked to a different sample!
        if kit_sample_id and (sample.id != kit_sample_id):
            LOG.warning(
                f"Kit «{kit.id}» already linked to another " +
                f"{sample_type} «{kit_sample_id}», linking with «{sample.id}» instead")

        kit = db.fetch_row(sql.SQL("""
            update warehouse.kit
               set {} = %(sample_id)s,
                   details = coalesce(details, {}) || %(additional_details)s
             where kit_id = %(kit_id)s
            returning kit_id as id, identifier, encounter_id, {}
            """).format(sql.Identifier(sample_type),
                        sql.Literal(Json({})),
                        sql.Identifier(sample_type)),
            { **data, "kit_id": kit.id })

    assert kit.id, "Upsert affected no rows!"

    LOG.info(f"Upserted kit {kit.id} with identifier «{kit.identifier}» " +
             f"linked to {sample_type} «{getattr(kit, sample_type)}»")

    return kit, status
def upsert_kit_with_encounter(db: DatabaseSession,
                              identifier: str,
                              encounter_id: int,
                              additional_details: dict) -> Tuple[KitRecord, str]:
    """
    Upsert kit by its *identifier* to include a link to an encounter.

    An existing kit has its *encounter_id* updated, and the provided
    *additional_details* are merged (at the top level only) into the existing
    kit details, if any.
    """
    LOG.debug(f"Upserting kit «{identifier}»")

    data = {
        "identifier": identifier,
        "encounter_id": encounter_id,
        "additional_details": Json(additional_details),
    }

    # Look for existing kit
    kit = find_kit(db, identifier)

    # Nothing found → create
    if not kit:
        LOG.info("Creating new kit")
        status = "created"
        kit = db.fetch_row("""
            insert into warehouse.kit (identifier, encounter_id, details)
                values (%(identifier)s, %(encounter_id)s, %(additional_details)s)
            returning kit_id as id, identifier, encounter_id,
                      null rdt_sample_id, null utm_sample_id
            """, data)

    # Found kit → update
    else:
        status = "updated"

        # Warn if kit is already linked to a different encounter!
        if kit.encounter_id and kit.encounter_id != encounter_id:
            LOG.warning(
                f"Kit «{kit.id}» already linked to another encounter «{kit.encounter_id}», " +
                f"linking with «{encounter_id}» instead")

        kit = db.fetch_row("""
            update warehouse.kit
               set encounter_id = %(encounter_id)s,
                   details = coalesce(details, '{}') || %(additional_details)s
             where kit_id = %(kit_id)s
            returning kit_id as id, identifier, encounter_id, rdt_sample_id, utm_sample_id
            """, { **data, "kit_id": kit.id })

    assert kit.id, "Upsert affected no rows!"

    LOG.info(f"Upserted kit {kit.id} with identifier «{kit.identifier}» linked to encounter «{kit.encounter_id}»")

    return kit, status
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them as
    # an option or config (and validate option choices against what's actually
    # in the database).  We won't want to validate using click.option(),
    # because that would necessitate a database connection simply to run
    # bin/id3c at all.
    #   -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-haarvi",
            "samples-haarvi",
        },
        "rdt": {"collections-fluathome.org"},
    }

    # Fetch and iterate over samples that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": REVISION}]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Convert sample barcode to full identifier, ensuring it's
            # known and from the correct identifier set.
            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            if not sample_identifier:
                LOG.warning(f"Skipping sample with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            if (manifest_record.document.get("sample_type") and
                    manifest_record.document["sample_type"] == "rdt"):
                assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                     f"not {expected_identifier_sets['rdt']}")
            else:
                assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                     f"not {expected_identifier_sets['samples']}")

            # Optionally, convert the collection barcode to full identifier,
            # ensuring it's known and from the correct identifier set.
            collection_barcode = manifest_record.document.pop("collection", None)
            collection_identifier = find_identifier(db, collection_barcode) if collection_barcode else None

            if collection_barcode and not collection_identifier:
                LOG.warning(f"Skipping sample with unknown collection barcode «{collection_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            assert not collection_identifier \
                or collection_identifier.set_name in expected_identifier_sets["collections"], \
                f"Collection identifier found in set «{collection_identifier.set_name}», not {expected_identifier_sets['collections']}"  # type: ignore

            # Sample collection date
            collection_date = manifest_record.document.get("date")

            # Upsert sample cooperatively with enrollments ETL routine
            #
            # The details document was intentionally modified by two pop()s
            # earlier to remove barcodes that were looked up.  The rationale is
            # that we want just one clear place in the warehouse for each piece
            # of information.
            sample, status = upsert_sample(db,
                identifier = sample_identifier.uuid,
                collection_identifier = collection_identifier.uuid if collection_identifier else None,
                collection_date = collection_date,
                additional_details = manifest_record.document)

            mark_loaded(db, manifest_record.id,
                status = status,
                sample_id = sample.id)

            LOG.info(f"Finished processing manifest record {manifest_record.id}")
def etl_clinical(*, db: DatabaseSession):
    LOG.debug(f"Starting the clinical ETL routine, revision {REVISION}")

    # Fetch and iterate over clinical records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same clinical records.
    LOG.debug("Fetching unprocessed clinical records")

    clinical = db.cursor("clinical")
    clinical.execute("""
        select clinical_id as id, document
          from receiving.clinical
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"revision": REVISION}]),))

    for record in clinical:
        with db.savepoint(f"clinical record {record.id}"):
            LOG.info(f"Processing clinical record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document["barcode"])

            # Skip row if no matching identifier found
            if received_sample_identifier is None:
                LOG.info(f"Skipping due to unknown barcode {record.document['barcode']}")
                mark_skipped(db, record.id)
                continue

            # Check sample exists in database
            sample = find_sample(db, identifier = received_sample_identifier)

            # Skip row if sample does not exist
            if sample is None:
                LOG.info(f"Skipping due to missing sample with identifier {received_sample_identifier}")
                mark_skipped(db, record.id)
                continue

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = site_identifier(record.document["site"]),
                details    = {"type": "retrospective"})

            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be corrections.
            individual = upsert_individual(db,
                identifier = record.document["individual"],
                sex        = sex(record.document["AssignedSex"]))

            encounter = upsert_encounter(db,
                identifier    = record.document["identifier"],
                encountered   = record.document["encountered"],
                individual_id = individual.id,
                site_id       = site.id,
                age           = age(record.document),
                details       = encounter_details(record.document))

            sample = update_sample(db,
                sample       = sample,
                encounter_id = encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                # Special-case float-like identifiers in earlier data
                tract_identifier = re.sub(r'\.0$', '', str(tract_identifier))
                tract = find_location(db, "tract", tract_identifier)
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id = encounter.id,
                    relation     = "residence",
                    location_id  = tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing clinical record {record.id}")
def kit_enrollments(*, db: DatabaseSession):
    LOG.debug(f"Starting the kit enrollments ETL routine, revision {ENROLLMENTS_REVISION}")

    expected_barcode_types = {"ScannedSelfSwab", "ManualSelfSwab"}

    LOG.debug("Fetching unprocessed enrollments")

    enrollments = db.cursor("enrollments")
    enrollments.execute("""
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{"etl": ETL_NAME, "revision": ENROLLMENTS_REVISION}]),))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Find the encounter that should have been created from this
            # enrollment record through the enrollments ETL
            encounter = find_encounter(db, enrollment.document["id"])

            # Error out the kit ETL process if no encounter found.  The kit ETL
            # process can try again starting with this record next time, with
            # the idea that the encounter will be created by then.
            if not encounter:
                raise EncounterNotFoundError(f"No encounter with identifier «{enrollment.document['id']}» found")

            # Skip and mark the enrollment document as processed if the
            # encounter found is linked to a site that is not self-test
            if encounter.site != "self-test":
                LOG.debug(f"Found encounter {encounter.id} «{encounter.identifier}» " +
                          f"linked to site «{encounter.site}», not 'self-test'")
                mark_enrollment_processed(db, enrollment.id)
                continue

            for code in enrollment.document["sampleCodes"]:
                barcode = code.get("code")

                # Kit must have a barcode
                if not barcode:
                    LOG.warning(f"No barcode found in sampleCodes {code}")
                    continue

                # Barcode must be of expected barcode type
                if code["type"] not in expected_barcode_types:
                    LOG.debug(f"Skipping barcode with type {code['type']}")
                    continue

                # Convert kit barcode to full identifier
                kit_identifier = find_identifier(db, barcode)

                if not kit_identifier:
                    LOG.warning(f"Skipping kit with unknown barcode «{barcode}»")
                    continue

                if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                    LOG.warning(
                        f"Skipping kit with identifier found in " +
                        f"set «{kit_identifier.set_name}» not {expected_identifier_sets['kits']}")
                    continue

                details = {"type": code["type"]}

                kit, status = upsert_kit_with_encounter(db,
                    identifier = kit_identifier.uuid,
                    encounter_id = encounter.id,
                    additional_details = details)

                if status == "updated":
                    update_kit_samples(db, kit)

            mark_enrollment_processed(db, enrollment.id)

            LOG.info(f"Finished processing enrollment {enrollment.id}")