def upload(sequence_read_set_file, unknown_sample_output, db: DatabaseSession):
    """
    Upload sequence read sets into the database warehouse.

    <sequence-read-set.ndjson> must be a newline delimited JSON file produced
    by this command's sibling command.

    Sequence read sets whose NWGC sample IDs cannot be found within the
    database warehouse are written to the newline delimited JSON file
    <unknown-sample-output.ndjson>.
    """

    for sequence_read_set in sequence_read_set_file:
        sample_set = json.loads(sequence_read_set)
        nwgc_id = sample_set.get("sample")
        urls = sample_set.get("urls")
        with db.savepoint(f"sequence read set {nwgc_id}"):
            LOG.info(f"Processing sequence read set for sample {nwgc_id}")
            sample_id = find_sample(db, nwgc_id)
            if sample_id is None:
                LOG.warning(
                    f"Skipping sample with NWGC ID «{nwgc_id}» because it was not found within warehouse."
                )
                unknown_sample_output.write(sequence_read_set)
                continue
            sequence_read_set = insert_sequence_read_set(db, sample_id, urls)
            LOG.info(
                f"Finished uploading sequence read set for sample {nwgc_id}")
Example #2
def etl_fhir(*, db: DatabaseSession):
    LOG.debug(f"Starting the FHIR ETL routine, revision {REVISION}")

    # Fetch and iterate over FHIR documents that aren't processed
    #
    # Use a server-side cursor by providing a name, and limit it to one
    # fetched record at a time to keep the local process size down.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same FHIR documents.
    LOG.debug("Fetching unprocessed FHIR documents")

    fhir_documents = db.cursor("fhir")
    fhir_documents.itersize = 1
    fhir_documents.execute(
        """
        select fhir_id as id, document
          from receiving.fhir
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": REVISION
        }]), ))

    for record in fhir_documents:
        with db.savepoint(f"FHIR document {record.id}"):
            LOG.info(f"Processing FHIR document {record.id}")

            assert_bundle_collection(record.document)
            bundle = Bundle(record.document)
            resources = extract_resources(bundle)

            # Loop over every Resource in the Bundle's entries, processing
            # what is needed along the way.
            try:
                assert_required_resource_types_present(resources)
                process_bundle_entries(db, bundle)

            except SkipBundleError as error:
                LOG.warning(
                    f"Skipping bundle in FHIR document «{record.id}»: {error}")
                mark_skipped(db, record.id)
                continue

            mark_processed(db, record.id, {"status": "processed"})
            LOG.info(f"Finished processing FHIR document {record.id}")
Example #3
def etl_clinical(*, db: DatabaseSession):
    LOG.debug(f"Starting the clinical ETL routine, revision {REVISION}")

    # Fetch and iterate over clinical records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same clinical records.
    LOG.debug("Fetching unprocessed clinical records")

    clinical = db.cursor("clinical")
    clinical.execute("""
        select clinical_id as id, document
          from receiving.clinical
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{ "revision": REVISION }]),))

    for record in clinical:
        with db.savepoint(f"clinical record {record.id}"):
            LOG.info(f"Processing clinical record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db,
                record.document["barcode"])

            # Skip row if no matching identifier found
            if received_sample_identifier is None:
                LOG.info("Skipping due to unknown barcode " + \
                          f"{record.document['barcode']}")
                mark_skipped(db, record.id)
                continue

            # Check sample exists in database
            sample = find_sample(db,
                identifier = received_sample_identifier)

            # Skip row if sample does not exist
            if sample is None:
                LOG.info("Skipping due to missing sample with identifier " + \
                            f"{received_sample_identifier}")
                mark_skipped(db, record.id)
                continue

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = site_identifier(record.document["site"]),
                details    = {"type": "retrospective"})


            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(db,
                identifier  = record.document["individual"],
                sex         = sex(record.document["AssignedSex"]))

            encounter = upsert_encounter(db,
                identifier      = record.document["identifier"],
                encountered     = record.document["encountered"],
                individual_id   = individual.id,
                site_id         = site.id,
                age             = age(record.document),
                details         = encounter_details(record.document))

            sample = update_sample(db,
                sample = sample,
                encounter_id = encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                # Special-case float-like identifiers in earlier data
                tract_identifier = re.sub(r'\.0$', '', str(tract_identifier))

                tract = find_location(db, "tract", tract_identifier)
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id = encounter.id,
                    relation = "residence",
                    location_id = tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing clinical record {record.id}")
def etl_presence_absence(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the presence_absence ETL routine, revision {REVISION}")

    # Fetch and iterate over presence-absence tests that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same presence-absence tests.
    LOG.debug("Fetching unprocessed presence-absence tests")

    presence_absence = db.cursor("presence_absence")
    presence_absence.itersize = 1
    presence_absence.execute(
        """
        select presence_absence_id as id, document,
               received::date as received_date
          from receiving.presence_absence
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "revision": REVISION
        }]), ))

    for group in presence_absence:
        with db.savepoint(f"presence_absence group {group.id}"):
            LOG.info(f"Processing presence_absence group {group.id}")

            # Samplify will now send documents with a top level key
            # "samples". The new format also includes a "chip" key for each
            # sample which is then included in the unique identifier for
            # each presence/absence result
            #   -Jover, 14 Nov 2019
            try:
                received_samples = group.document["samples"]
            except KeyError as error:
                # Skip documents in the old format because they do not
                # include the "chip" key which is needed for the
                # unique identifier for each result.
                #   -Jover, 14 Nov 2019
                # Also skip the old format to avoid ingesting wrong data from
                # plate-swapped plates! Doing so would associate 188 samples
                # with the wrong nwgc_id.
                #   -Jover, 06 Dec 2019
                if (group.document.get("store") is not None
                        or group.document.get("Update") is not None):

                    LOG.info(
                        "Skipping presence_absence record that is in old format"
                    )
                    mark_processed(db, group.id)
                    continue

                else:
                    raise error from None

            for received_sample in received_samples:
                received_sample_barcode = received_sample.get("investigatorId")
                if not received_sample_barcode:
                    LOG.info(
                        f"Skipping sample «{received_sample['sampleId']}» without SFS barcode"
                    )
                    continue

                # Don't go any further if the sample is marked as Failed
                sample_failed = received_sample.get("sampleFailed")
                if sample_failed is True:
                    LOG.info(
                        f"Skipping sample «{received_sample_barcode}» that has been failed"
                    )
                    continue

                # Don't go any further if there are no results to import.
                test_results = received_sample["targetResults"]

                if not test_results:
                    LOG.warning(
                        f"Skipping sample «{received_sample_barcode}» without any results"
                    )
                    continue

                received_sample_id = str(received_sample["sampleId"])
                chip = received_sample.get("chip")
                extraction_date = received_sample.get("extractionDate")
                assay_name = received_sample.get("assayName")
                assay_date = received_sample.get("assayDate")
                # The assayType field will be removed after Samplify starts
                # sending us OpenArray results with target.clinicalStatus.
                #
                # kfay, 28 Dec 2020
                assay_type = received_sample.get("assayType")

                # Guard against empty chip values
                assert chip or "chip" not in received_sample, "Received bogus chip id"

                LOG.info(f"Processing sample «{received_sample_barcode}»")

                # Results must be current
                if not received_sample.get("isCurrentExpressionResult"):
                    LOG.warning(
                        f"Skipping out-of-date results for sample «{received_sample_barcode}»"
                    )
                    continue

                # Barcode must match a known identifier
                received_sample_identifier = sample_identifier(
                    db, received_sample_barcode)

                if not received_sample_identifier:
                    LOG.warning(
                        f"Skipping results for sample without a known identifier «{received_sample_barcode}»"
                    )
                    continue

                # Track Samplify's internal ids for our samples, which is
                # unfortunately necessary for linking genomic data NWGC also
                # sends.
                sample = update_sample(
                    db,
                    identifier=received_sample_identifier,
                    additional_details=sample_details(received_sample))

                # Finally, process all results.
                for test_result in test_results:
                    test_result_target_id = test_result["geneTarget"]
                    LOG.debug(
                        f"Processing target «{test_result_target_id}» "
                        f"for sample «{received_sample_barcode}»")

                    # Skip this result if it's actually a non-result
                    present = target_present(test_result)

                    if present is ...:
                        LOG.debug(
                            f"No test result for «{test_result_target_id}», skipping"
                        )
                        continue

                    # Most of the time we expect to see existing targets so a
                    # select-first approach makes the most sense to avoid useless
                    # updates.
                    target = find_or_create_target(
                        db,
                        identifier=test_result_target_id,
                        control=target_control(test_result["controlStatus"]))

                    # The unique identifier for each result.  If chip is
                    # applicable, then it's included to differentiate the same
                    # sample being run on multiple chips (uncommon, but it
                    # happens).
                    if chip:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}/{chip}"
                    else:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}"

                    # Most of the time we expect to see new samples and new
                    # presence_absence tests, so an insert-first approach makes more sense.
                    # Presence-absence tests we see more than once are presumed to be
                    # corrections.
                    upsert_presence_absence(db,
                                            identifier=identifier,
                                            sample_id=sample.id,
                                            target_id=target.id,
                                            present=present,
                                            details=presence_absence_details(
                                                test_result,
                                                group.received_date, chip,
                                                extraction_date, assay_name,
                                                assay_date, assay_type))

            mark_processed(db, group.id)

            LOG.info(f"Finished processing presence_absence group {group.id}")
def etl_longitudinal(*, db: DatabaseSession):
    LOG.debug(f"Starting the longitudinal ETL routine, revision {REVISION}")

    # Fetch and iterate over longitudinal records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same longitudinal records.
    LOG.debug("Fetching unprocessed longitudinal records")

    longitudinal = db.cursor("longitudinal")
    longitudinal.execute(
        """
        select longitudinal_id as id, document
          from receiving.longitudinal
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "revision": REVISION
        }]), ))

    for record in longitudinal:
        with db.savepoint(f"longitudinal record {record.id}"):
            LOG.info(f"Processing longitudinal record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document)

            # Check sample exists in database
            if received_sample_identifier:
                sample = find_sample(db, identifier=received_sample_identifier)
            else:
                sample = None

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(
                db,
                identifier=site_identifier(record.document),
                details={"type": record.document['type']})

            # Most of the time we expect to see existing individuals and new
            # encounters.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = find_or_create_individual(
                db,
                identifier=record.document["individual"],
                sex=sex(record.document))

            encounter = upsert_encounter(
                db,
                identifier=record.document["identifier"],
                encountered=record.document["encountered"],
                individual_id=individual.id,
                site_id=site.id,
                age=age(record.document),
                details=encounter_details(record.document))

            if sample:
                sample = update_sample(db,
                                       sample=sample,
                                       encounter_id=encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                tract = find_location(db, "tract", str(tract_identifier))
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                                          encounter_id=encounter.id,
                                          relation="residence",
                                          location_id=tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing longitudinal record {record.id}")
Example #6
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them as
    # an option or config (and validate option choices against what's actually
    # in the database).  We won't want to validate using click.option(),
    # because that would necessitate a database connection simply to run
    # bin/id3c at all.
    #   -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-haarvi",
            "samples-haarvi",
        },
        "rdt": {"collections-fluathome.org"}
    }

    # Fetch and iterate over samples that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{ "etl": ETL_NAME, "revision": REVISION }]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Convert sample barcode to full identifier, ensuring it's
            # known and from the correct identifier set.
            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            if not sample_identifier:
                LOG.warning(f"Skipping sample with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            if (manifest_record.document.get("sample_type") and
                manifest_record.document["sample_type"] == "rdt"):
                assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                    f"not {expected_identifier_sets['rdt']}")
            else:
                assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                    f"not {expected_identifier_sets['samples']}")

            # Optionally, convert the collection barcode to full
            # identifier, ensuring it's known and from the correct
            # identifier set.
            collection_barcode = manifest_record.document.pop("collection", None)
            collection_identifier = find_identifier(db, collection_barcode) if collection_barcode else None

            if collection_barcode and not collection_identifier:
                LOG.warning(f"Skipping sample with unknown collection barcode «{collection_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            assert not collection_identifier \
                or collection_identifier.set_name in expected_identifier_sets["collections"], \
                    f"Collection identifier found in set «{collection_identifier.set_name}», not {expected_identifier_sets['collections']}" # type: ignore

            # Sample collection date
            collection_date = manifest_record.document.get("date")

            # Upsert sample cooperatively with enrollments ETL routine
            #
            # The details document was intentionally modified by two pop()s
            # earlier to remove barcodes that were looked up.  The
            # rationale is that we want just one clear place in the
            # warehouse for each piece of information.
            sample, status = upsert_sample(db,
                identifier            = sample_identifier.uuid,
                collection_identifier = collection_identifier.uuid if collection_identifier else None,
                collection_date       = collection_date,
                additional_details    = manifest_record.document)

            mark_loaded(db, manifest_record.id,
                status = status,
                sample_id = sample.id)

            LOG.info(f"Finished processing manifest record {manifest_record.id}")
Example #7
def kit_enrollments(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the kit enrollments ETL routine, revision {ENROLLMENTS_REVISION}"
    )

    expected_barcode_types = {"ScannedSelfSwab", "ManualSelfSwab"}

    LOG.debug("Fetching unprocessed enrollments")
    enrollments = db.cursor("enrollments")
    enrollments.execute(
        """
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
          for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": ENROLLMENTS_REVISION
        }]), ))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Find encounter that should have been created
            # from this enrollment record through etl enrollments
            encounter = find_encounter(db, enrollment.document["id"])

            # Error out the kit etl process if no encounter found
            # The kit etl process can try again starting with this record
            # next time with the idea that the encounter will be
            # created by then.
            if not encounter:
                raise EncounterNotFoundError(
                    f"No encounter with identifier «{enrollment.document['id']}» found"
                )

            # Skip and mark the enrollment document as processed if the
            # encounter found is linked to a site that is not self-test
            if encounter.site != "self-test":
                LOG.debug(
                    f"Found encounter {encounter.id} «{encounter.identifier}»"
                    + f"linked to site «{encounter.site}», not 'self-test'")
                mark_enrollment_processed(db, enrollment.id)
                continue

            for code in enrollment.document["sampleCodes"]:
                barcode = code.get("code")

                # Kit must have a barcode
                if not barcode:
                    LOG.warning(f"No barcode found in sampleCodes {code}")
                    continue

                # Barcode must be of expected barcode type
                if code["type"] not in expected_barcode_types:
                    LOG.debug(f"Skipping barcode with type {code['type']}")
                    continue

                # Convert kit barcode to full identifier
                kit_identifier = find_identifier(db, barcode)

                if not kit_identifier:
                    LOG.warning(
                        f"Skipping kit with unknown barcode «{barcode}»")
                    continue

                if kit_identifier.set_name not in expected_identifier_sets[
                        "kits"]:
                    LOG.warning(
                        f"Skipping kit with identifier found in " +
                        f"set «{kit_identifier.set_name}» not {expected_identifier_sets['kits']}"
                    )
                    continue

                details = {"type": code["type"]}

                kit, status = upsert_kit_with_encounter(
                    db,
                    identifier=kit_identifier.uuid,
                    encounter_id=encounter.id,
                    additional_details=details)

                if status == "updated":
                    update_kit_samples(db, kit)

            mark_enrollment_processed(db, enrollment.id)

            LOG.info(f"Finished processing enrollment {enrollment.id}")
Example #8
def kit_manifests(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the kits manifests ETL routine, revision {MANIFEST_REVISION}"
    )

    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute(
        """
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": MANIFEST_REVISION
        }]), ))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Mark record as skipped
            # if it does not contain a kit-related sample
            if "kit" not in manifest_record.document:
                LOG.info(
                    f"Skipping manifest record {manifest_record.id} without kit data"
                )
                mark_skipped(db, manifest_record.id)
                continue

            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            # Mark record as skipped
            # if it has an unknown sample barcode
            if not sample_identifier:
                LOG.warning(
                    f"Skipping manifest record with unknown sample barcode «{sample_barcode}»"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if the sample identifier set is unexpected
            if sample_identifier.set_name not in expected_identifier_sets[
                    "samples"]:
                LOG.warning(
                    f"Skipping manifest record with sample identifier found in "
                    +
                    f"set «{sample_identifier.set_name}», not {expected_identifier_sets['samples']}"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Find sample that should have been created from this
            # manifest record via etl manifest
            sample = find_sample(db, sample_identifier.uuid)

            # Error out the kit etl process if no sample found
            # The kit etl process can try again starting with this record
            # next time with the idea that the sample will be
            # created by then.
            if not sample:
                raise SampleNotFoundError(
                    f"No sample with «{sample_identifier.uuid}» found")

            # Mark record as skipped if the sample does not have a
            # sample type (utm or rdt)
            if sample.type not in {"utm", "rdt"}:
                LOG.info(f"Skipping manifest record {manifest_record.id} " +
                         f"with unknown sample type {sample.type}")
                mark_skipped(db, manifest_record.id)
                continue

            kit_barcode = manifest_record.document.pop("kit")
            kit_identifier = find_identifier(db, kit_barcode)

            # Mark record as skipped if it has an unknown kit barcode
            if not kit_identifier:
                LOG.warning(
                    f"Skipping kit with unknown barcode «{kit_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if kit identifier set is unexpected
            if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                LOG.warning(
                    f"Skipping kit with identifier found in " +
                    f"set «{kit_identifier.set_name}» not {expected_identifier_sets['kits']}"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # List of extra data not needed for kit record that can
            # be removed before adding manifest document to kit details
            extra_data = [
                "collection", "sample_type", "aliquot_date", "aliquots",
                "racks"
            ]
            for key in extra_data:
                manifest_record.document.pop(key, None)

            # Try to find identifier for the test-strip barcode for rdt samples
            if sample.type == "rdt":
                update_test_strip(db, manifest_record.document)

            kit, status = upsert_kit_with_sample(
                db,
                identifier=kit_identifier.uuid,
                sample=sample,
                additional_details=manifest_record.document)

            if status == "updated":
                update_sample(db, sample, kit.encounter_id)

            mark_loaded(db, manifest_record.id, status, kit.id)
Example #9
        def decorated(*args,
                      db: DatabaseSession,
                      log_output: bool,
                      det_limit: int = None,
                      redcap_api_batch_size: int,
                      geocoding_cache: str = None,
                      **kwargs):
            LOG.debug(
                f"Starting the REDCap DET ETL routine {name}, revision {revision}"
            )

            project = Project(redcap_url, project_id)

            if det_limit:
                LOG.debug(f"Processing up to {det_limit:,} pending DETs")
                limit = sql.Literal(det_limit)
            else:
                LOG.debug(f"Processing all pending DETs")
                limit = sql.SQL("all")

            redcap_det = db.cursor(f"redcap-det {name}")
            redcap_det.execute(
                sql.SQL("""
                select redcap_det_id as id, document
                  from receiving.redcap_det
                 where not processing_log @> %s
                   and document::jsonb @> %s
                 order by id
                 limit {}
                   for update
                """).format(limit), (Json([etl_id]), Json(det_contains)))

            # First loop of the DETs to determine how to process each one.
            # Uses `first_complete_dets` to keep track of which DET to
            # use to process a unique REDCap record.
            # Uses `all_dets` to keep track of the status for each DET record
            # so that they can be processed in order of `redcap_det_id` later.
            #   --Jover, 21 May 2020
            first_complete_dets: Dict[str, Any] = {}
            all_dets: List[Dict[str, str]] = []
            for det in redcap_det:
                instrument = det.document['instrument']
                record_id = det.document['record']
                # Assume we are loading all DETs
                # Status will be updated to "skip" if DET does not need to be processed
                det_record = {"id": det.id, "status": "load"}

                # Skip this DET if the `include_incomplete` flag
                # was not given and the current instrument is
                # incomplete or unverified
                if not include_incomplete and not is_complete(
                        instrument, det.document):
                    det_record.update({
                        "status": "skip",
                        "reason": "incomplete/unverified DET"
                    })

                # Check if this record already has an earlier DET.
                # Skip the later DET in favor of the first one to preserve
                # the first-in-first-out semantics of our receiving tables.
                elif first_complete_dets.get(record_id):
                    det_record.update({
                        "status": "skip",
                        "reason": "repeat REDCap record"
                    })

                else:
                    first_complete_dets[record_id] = det
                    det_record["record_id"] = record_id

                all_dets.append(det_record)

            if not first_complete_dets:
                LOG.info("No new complete DETs found.")
            else:
                # Batch request records from REDCap
                LOG.info(f"Fetching REDCap project {project_id}")
                record_ids = list(first_complete_dets.keys())

                LOG.info(
                    f"Fetching {len(record_ids):,} REDCap records from project {project.id}"
                )

                # Convert list of REDCap records to a dict so that
                # records can be looked up by record id.
                # Records with repeating instruments or longitudinal
                # events will have multiple entries in the list.
                redcap_records: DefaultDict[str,
                                            List[dict]] = defaultdict(list)

                batches = list(chunked(record_ids, redcap_api_batch_size))

                for i, batch in enumerate(batches, 1):
                    LOG.info(
                        f"Fetching REDCap record batch {i:,}/{len(batches):,} of size {len(batch):,}"
                    )

                    for record in project.records(ids=batch,
                                                  raw=raw_coded_values):
                        redcap_records[record.id].append(record)

            # Process all DETs in order of redcap_det_id
            with pickled_cache(geocoding_cache) as cache:
                for det in all_dets:
                    with db.savepoint(f"redcap_det {det['id']}"):
                        LOG.info(f"Processing REDCap DET {det['id']}")

                        if det["status"] == "skip":
                            LOG.debug(
                                f"Skipping REDCap DET {det['id']} due to {det['reason']}"
                            )
                            mark_skipped(db, det["id"], etl_id, det["reason"])
                            continue

                        received_det = first_complete_dets.pop(
                            det["record_id"])
                        redcap_record_instances = redcap_records.get(
                            received_det.document["record"])

                        if not redcap_record_instances:
                            LOG.debug(
                                f"REDCap record is missing or invalid.  Skipping REDCap DET {received_det.id}"
                            )
                            mark_skipped(db, received_det.id, etl_id,
                                         "invalid REDCap record")
                            continue

                        bundle = routine(
                            db=db,
                            cache=cache,
                            det=received_det,
                            redcap_record_instances=redcap_record_instances)

                        if not bundle:
                            LOG.debug(
                                f"Skipping REDCap DET {received_det.id} due to insufficient data in REDCap record."
                            )
                            mark_skipped(db, received_det.id, etl_id,
                                         "insufficient data in record")
                            continue

                        if log_output:
                            print(as_json(bundle))

                        insert_fhir_bundle(db, bundle)
                        mark_loaded(db, received_det.id, etl_id, bundle['id'])
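
The batching above assumes chunked() is more_itertools.chunked, which splits the record id list into lists of at most redcap_api_batch_size elements; a quick usage example:

from more_itertools import chunked

record_ids = ["1", "2", "3", "4", "5"]
batches = list(chunked(record_ids, 2))
# batches == [["1", "2"], ["3", "4"], ["5"]]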
def notify(*, action: str):
    LOG.debug(f"Starting the reportable conditions notification routine, revision {REVISION}")

    db = DatabaseSession()

    def webhook(suffix):
        return os.environ.get("SLACK_WEBHOOK_ALERTS_TEST") \
            or os.environ[f"SLACK_WEBHOOK_REPORTING_{suffix}"]

    slack_webhooks = {
        "ncov-reporting": webhook("HCOV19"),
    }

    projects = [
        {
            "collection_sets": {"collections-childcare"},
            "slack_channel_name": "ncov-reporting-childcare",
            "slack_webhook": webhook("HCOV19_CHILDCARE"),
        },

        {
            "collection_sets": {"collections-apple-respiratory"},
            "slack_channel_name": "ncov-reporting-apple",
            "slack_webhook": webhook("HCOV19_APPLE"),
        },

        {
            "collection_sets": {"collections-school-testing-home",
              "collections-school-testing-observed",
              "collections-radxup-yakima-schools-home",
              "collections-radxup-yakima-schools-observed"},
            "slack_channel_name": "ncov-reporting-schools",
            "slack_webhook": webhook("HCOV19_SCHOOLS"),
        },

        {
            "collection_sets": {"collections-adult-family-home-outbreak"},
            "slack_channel_name": "ncov-reporting-afh",
            "slack_webhook": webhook("HCOV19_AFH"),
        },

        {
            "collection_sets": {"collections-workplace-outbreak"},
            "slack_channel_name": "ncov-reporting-workplace",
            "slack_webhook": webhook("HCOV19_WORKPLACE"),
        }
    ]

    # Fetch and iterate over reportable condition records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same reportable condition records.
    LOG.debug("Fetching unprocessed reportable conditions records")

    reportable_conditions = db.cursor("reportable_conditions")
    reportable_conditions.execute("""
        select reportable_condition_v1.*, presence_absence_id as id
            from shipping.reportable_condition_v1
            join warehouse.presence_absence using (presence_absence_id)
        where details @> %s is not true
        order by id
            for update of presence_absence;
        """, (Json({"reporting_log":[{ "revision": REVISION }]}),))

    processed_without_error = None

    try:
        for record in reportable_conditions:
            with db.savepoint(f"reportable condition presence_absence_id {record.id}"):
                LOG.info(f"Processing reportable condition, presence_absence_id «{record.id}»")

                if not record.site:
                    LOG.info(f"No site found for presence_absence_id «{record.id}». " +
                        "Inferring site from manifest data.")

                responses = {'ncov-reporting': send_slack_post_request(record, slack_webhooks['ncov-reporting'])}

                # Also send study-specific results to their respective channels
                for project in projects:
                    if (record.collection_set_name in project['collection_sets']):
                        responses[project['slack_channel_name']] = send_slack_post_request(
                            record, project['slack_webhook'])

                # Check all POSTs to Slack were successful to mark as processed
                # This does mean that if one fails but others succeed, there
                # will be duplicate POSTs to the already succeeded channels.
                # The chance of this happening is pretty small, but we can
                # revisit this if it becomes a common problem
                #   -Jover, 21 October 2020
                if all(response.status_code == 200 for response in responses.values()):
                    mark_processed(db, record.id, {"status": "sent Slack notification"})
                    LOG.info(f"Finished processing presence_absence_id «{record.id}»")

                else:
                    for channel, response in responses.items():
                        if response.status_code != 200:
                            LOG.error(("Error: A Slack notification could not " \
                            f"be sent to the channel «{channel}» for "
                            f"presence_absence_id «{record.id}».\n" \
                            f"Slack API returned status code {response.status_code}: "\
                            f"{response.text}"))

    except Exception as error:
        processed_without_error = False

        LOG.error(f"Aborting with error")
        raise error from None

    else:
        processed_without_error = True

    finally:
        if action == "prompt":
            ask_to_commit = \
                "Commit all changes?" if processed_without_error else \
                "Commit successfully processed reportable condition records up to this point?"

            commit = click.confirm(ask_to_commit)
        else:
            commit = action == "commit"

        if commit:
            LOG.info(
                "Committing all changes" if processed_without_error else \
                "Committing successfully processed reportable condition records up to this point")
            db.commit()

        else:
            LOG.info("Rolling back all changes; the database will not be modified")
            db.rollback()
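
send_slack_post_request() isn't shown here; the status handling above only assumes it POSTs a message to the given webhook URL and returns the HTTP response. A minimal sketch using requests (the message text is a stand-in for the real formatting):

import requests

def send_slack_post_request_sketch(record, slack_webhook: str) -> requests.Response:
    # Post a bare-bones Slack message for the reportable condition result;
    # the real routine builds a much richer notification.
    payload = {"text": f"Reportable condition result for presence_absence_id «{record.id}»"}
    return requests.post(slack_webhook, json=payload, timeout=10)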
Example #11
        def decorated(*args, db: DatabaseSession, log_output: bool, **kwargs):
            LOG.debug(
                f"Starting the REDCap DET ETL routine {name}, revision {revision}"
            )

            redcap_det = db.cursor(f"redcap-det {name}")
            redcap_det.execute(
                """
                select redcap_det_id as id, document
                  from receiving.redcap_det
                 where not processing_log @> %s
                   and document::jsonb @> %s
                 order by id
                   for update
                """, (Json([etl_id]), Json(det_contains)))

            with pickled_cache(CACHE_FILE) as cache:
                for det in redcap_det:
                    with db.savepoint(f"redcap_det {det.id}"):
                        LOG.info(f"Processing REDCap DET {det.id}")

                        instrument = det.document['instrument']

                        # Skip this DET if the `include_incomplete` flag
                        # was not given and the current instrument is
                        # incomplete or unverified
                        if not include_incomplete and not is_complete(
                                instrument, det.document):
                            LOG.debug(
                                f"Skipping incomplete or unverified REDCap DET {det.id}"
                            )
                            mark_skipped(db, det.id, etl_id)
                            continue

                        redcap_record = get_redcap_record_from_det(
                            det.document, raw_coded_values)

                        if not redcap_record:
                            LOG.debug(
                                f"REDCap record is missing or invalid.  Skipping REDCap DET {det.id}"
                            )
                            mark_skipped(db, det.id, etl_id)
                            continue

                        # Only process REDCap record if all required instruments are complete
                        incomplete_instruments = {
                            instrument
                            for instrument in required_instruments
                            if not is_complete(instrument, redcap_record)
                        }

                        if incomplete_instruments:
                            LOG.debug(f"The following required instruments «{incomplete_instruments}» are not yet marked complete. " + \
                                      f"Skipping REDCap DET {det.id}")
                            mark_skipped(db, det.id, etl_id)
                            continue

                        bundle = routine(db=db,
                                         cache=cache,
                                         det=det,
                                         redcap_record=redcap_record)

                        if not bundle:
                            mark_skipped(db, det.id, etl_id)
                            continue

                        if log_output:
                            print(as_json(bundle))

                        insert_fhir_bundle(db, bundle)
                        mark_loaded(db, det.id, etl_id, bundle['id'])
Example #12
def etl_enrollments(*, db: DatabaseSession):
    LOG.debug(f"Starting the enrollment ETL routine, revision {REVISION}")

    # Fetch and iterate over enrollments that aren't processed
    #
    # Use a server-side cursor by providing a name.  This ensures we limit how
    # much data we fetch at once, to limit local process size.  Each enrollment
    # document is ~10 KB and the default fetch size (cursor.itersize) is 2,000,
    # thus we'll get ~20 MB on each fetch of 2,000 enrollments.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same enrollments.
    LOG.debug("Fetching unprocessed enrollments")

    enrollments = db.cursor("enrollments")
    enrollments.execute(
        """
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": REVISION
        }]), ))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Out of an abundance of caution, fail when the schema version
            # of the enrollment document changes.  This ensures manual
            # intervention happens on document structure changes.  After
            # seeing how versions are handled over time, this restriction
            # may be toned down a bit.
            known_versions = {"1.1.0", "1.0.0"}

            assert enrollment.document["schemaVersion"] in known_versions, \
                f"Document schema version {enrollment.document['schemaVersion']} is not in {known_versions}"

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(
                db,
                identifier=enrollment.document["site"]["name"],
                details=site_details(enrollment.document["site"]))

            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(
                db,
                identifier=enrollment.document["participant"],
                sex=assigned_sex(enrollment.document))

            encounter = upsert_encounter(
                db,
                identifier=enrollment.document["id"],
                encountered=enrollment.document["startTimestamp"],
                individual_id=individual.id,
                site_id=site.id,
                age=age(enrollment.document),
                details=encounter_details(enrollment.document))

            process_samples(db, encounter.id, enrollment.document)
            process_locations(db, encounter.id, enrollment.document)

            mark_processed(db, enrollment.id)

            LOG.info(f"Finished processing enrollment {enrollment.id}")
def etl_consensus_genome(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the consensus genome ETL routine, revision {REVISION}")

    # Fetch and iterate over consensus genome records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same consensus genome records.
    LOG.debug("Fetching unprocessed consensus genome records")

    consensus_genome = db.cursor("consensus genome")
    consensus_genome.execute(
        """
        select consensus_genome_id as id, document, received
          from receiving.consensus_genome
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "revision": REVISION
        }]), ))

    for record in consensus_genome:
        with db.savepoint(f"consensus genome record {record.id}"):
            LOG.info(f"Processing consensus genome record {record.id}")

            # Verify sample identifier is in the database
            sample = find_sample(db,
                                 record.document["sample_identifier"],
                                 for_update=False)

            assert sample, f"No sample found with identifier «{record.document['sample_identifier']}»"

            # Most of the time we expect to see existing sequence read sets,
            # but we also want to update the details log. However, the
            # only unique constraint on the sequence_read_set table is
            # defined within a trigger function, as Postgres does not allow
            # unique constraints on array columns. Therefore, perform a
            # find-or-create followed by an update to the details column to
            # avoid conflict.
            sequence_read_set = find_or_create_sequence_read_set(
                db, record.document, sample)

            status = record.document.get("status")

            # Find the matching organism within the warehouse for the
            # reference organism
            organism_name = get_lineage(db, record.document)
            organism = find_organism(db, organism_name)

            assert organism, f"No organism found with name «{organism_name}»"

            # Only upsert genome and genomic sequences if the assembly job
            # was marked as complete.
            if status == 'complete':
                # Most of the time we expect to see new sequences, so an
                # insert-first approach makes the most sense to avoid useless
                # queries.
                genome = upsert_genome(db,
                                       sequence_read_set=sequence_read_set,
                                       organism=organism,
                                       document=record.document)

                for masked_consensus in record.document['masked_consensus']:
                    genomic_sequence = upsert_genomic_sequence(
                        db, genome=genome, masked_consensus=masked_consensus)

            update_sequence_read_set_details(db, sequence_read_set.id,
                                             organism, status)
            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(
                f"Finished processing consensus genome record {record.id}")
Example #14
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them as
    # an option or config (and validate option choices against what's actually
    # in the database).  We won't want to validate using click.option(),
    # because that would necessitate a database connection simply to run
    # bin/id3c at all.
    #   -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-scan-kiosks",
            "collections-haarvi",
            "samples-haarvi",
            "collections-validation",
            "collections-uw-home",
            "collections-uw-observed",
            "collections-household-general",
            "collections-childcare",
            "collections-school-testing-home",
            "collections-school-testing-observed",
            "collections-apple-respiratory",
            "collections-apple-respiratory-serial",
            "collections-adult-family-home-outbreak",
            "collections-workplace-outbreak",
        },
        "rdt": {"collections-fluathome.org"}
    }

    # Fetch and iterate over samples that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute(
        """
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": REVISION
        }]), ))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # When updating an existing row, update the identifiers
            # only if the record has both the 'sample' and
            # 'collection' keys.
            should_update_identifiers = "sample" in manifest_record.document \
                and "collection" in manifest_record.document

            # Sample collection date
            # Don't pop this entry off the document. For backwards
            # compatibility reasons, keep it in the document so that 'date'
            # also gets written to the 'details' column in warehouse.sample.
            collected_date = manifest_record.document.get("date", None)

            # Attempt to find barcodes and their related identifiers
            sample_barcode = manifest_record.document.pop("sample", None)
            sample_identifier = find_identifier(
                db, sample_barcode) if sample_barcode else None
            collection_barcode = manifest_record.document.pop(
                "collection", None)
            collection_identifier = find_identifier(
                db, collection_barcode) if collection_barcode else None

            # Skip a record if it has no associated barcodes
            if not sample_barcode and not collection_barcode:
                LOG.warning(
                    f"Skipping record «{manifest_record.id}» because it has neither a sample "
                    "barcode nor a collection barcode")
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if it has a sample barcode but the barcode doesn't match an identifier
            if sample_barcode and not sample_identifier:
                LOG.warning(
                    f"Skipping sample with unknown sample barcode «{sample_barcode}»"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if it has a collection barcode but the barcode doesn't match an identifier
            if collection_barcode and not collection_identifier:
                LOG.warning(
                    f"Skipping sample with unknown collection barcode «{collection_barcode}»"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Skip a record if the collection identifier is from an unexpected set
            if collection_identifier and collection_identifier.set_name not in expected_identifier_sets[
                    "collections"]:
                LOG.warning(
                    f"Skipping sample because collection identifier found in set «{collection_identifier.set_name}», not \
                    {expected_identifier_sets['collections']}")
                mark_skipped(db, manifest_record.id)
                continue

            # Validate the sample identifier, failing the record with an assertion error if it's from an unexpected set
            if sample_identifier:
                if (manifest_record.document.get("sample_type")
                        and manifest_record.document["sample_type"] == "rdt"):
                    assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                        (f"Sample identifier found in set «{sample_identifier.set_name}»," +
                        f"not {expected_identifier_sets['rdt']}")
                else:
                    assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                        (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                        f"not {expected_identifier_sets['samples']}")

            # Upsert sample cooperatively with enrollments ETL routine
            #
            # The details document was intentionally modified by two pop()s
            # earlier to remove barcodes that were looked up.
            # The rationale is that we want just one clear place in the
            # warehouse for each piece of information.
            sample, status = upsert_sample(
                db,
                update_identifiers=should_update_identifiers,
                identifier=sample_identifier.uuid
                if sample_identifier else None,
                collection_identifier=collection_identifier.uuid
                if collection_identifier else None,
                collection_date=collected_date,
                additional_details=manifest_record.document)

            mark_loaded(db,
                        manifest_record.id,
                        status=status,
                        sample_id=sample.id)

            LOG.info(
                f"Finished processing manifest record {manifest_record.id}")