Example #1
def find_or_create_site(db: DatabaseSession, identifier: str,
                        details: dict) -> Any:
    """
    Select encounter site by *identifier*, or insert it if it doesn't exist.
    """
    LOG.debug(f"Looking up site «{identifier}»")

    site = db.fetch_row(
        """
        select site_id as id, identifier
          from warehouse.site
         where identifier = %s
        """, (identifier, ))

    if site:
        LOG.info(f"Found site {site.id} «{site.identifier}»")
    else:
        LOG.debug(f"Site «{identifier}» not found, adding")

        data = {
            "identifier": identifier,
            "details": Json(details),
        }

        site = db.fetch_row(
            """
            insert into warehouse.site (identifier, details)
                values (%(identifier)s, %(details)s)
            returning site_id as id, identifier
            """, data)

        LOG.info(f"Created site {site.id} «{site.identifier}»")

    return site
Example #2
def update_sequence_read_set_details(db, sequence_read_set_id: int,
                                     organism: OrganismRecord,
                                     status: str) -> None:
    """
    This function works around the order-sensitive unique constraint for
    arrays in Postgres. It looks up an existing *sequence_read_set* by its
    ID and merges a log entry, built from *organism* and *status*, into the
    details column.
    """
    LOG.debug(
        f"Marking sequence read set data {sequence_read_set_id} as received")

    entry = {organism.lineage: {"status": status}}

    data = {
        "sequence_read_set_id": sequence_read_set_id,
        "log_entry": Json(entry),
    }

    # Postgres allows array columns to be defined as unique, but the ordering of
    # the arrays must be the same for the arrays to match. We are only
    # interested in matching on array content, not array ordering. This prevents
    # us from being able to use the built-in unique constraint on urls, thereby
    # preventing `ON CONFLICT` updates to a table when urls is the only unique
    # column in the table.
    with db.cursor() as cursor:
        cursor.execute(
            """
            update warehouse.sequence_read_set
              set details = coalesce(details, '{}') || %(log_entry)s
            where sequence_read_set_id = %(sequence_read_set_id)s
            """, data)
Example #3
def upsert_sample(db: DatabaseSession, collection_identifier: str,
                  encounter_id: int, details: dict) -> Any:
    """
    Upsert collected sample by its *collection_identifier*.

    The provided *details* are merged (at the top-level only) into
    the existing sample details, if any.
    """
    LOG.debug(f"Upserting sample collection «{collection_identifier}»")

    data = {
        "collection_identifier": collection_identifier,
        "encounter_id": encounter_id,
        "details": Json(details),
    }

    sample = db.fetch_row(
        """
        insert into warehouse.sample (collection_identifier, encounter_id, details)
            values (%(collection_identifier)s, %(encounter_id)s, %(details)s)

        on conflict (collection_identifier) do update
            set encounter_id = excluded.encounter_id,
                details = coalesce(sample.details, '{}') || %(details)s

        returning sample_id as id, identifier, collection_identifier, encounter_id
        """, data)

    assert sample.id, "Upsert affected no rows!"

    LOG.info(
        f"Upserted sample {sample.id} with collection identifier «{sample.collection_identifier}»"
    )

    return sample
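The "(at the top-level only)" caveat follows from how jsonb concatenation works: || replaces whole top-level keys rather than merging nested objects, as this illustrative query shows.

-- jsonb || merges at the top level only; nested objects are replaced outright:
select '{"a": {"x": 1}, "b": 1}'::jsonb || '{"a": {"y": 2}}'::jsonb;
--  → {"a": {"y": 2}, "b": 1}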
Example #4
def update_sample(db: DatabaseSession,
                  identifier: Optional[str] = None,
                  collection_identifier: Optional[str] = None,
                  additional_details: Optional[dict] = None) -> Any:
    """
    Find sample by *identifier* and update with any *additional_details*.

    The provided *additional_details* are merged (at the top-level only) into
    the existing sample details, if any.

    Raises a :class:`SampleNotFoundError` if there is no sample known by
    *identifier*.
    """
    sample = None

    if identifier:
        LOG.debug(f"Looking up sample with identifier «{identifier}»")
        sample = db.fetch_row("""
            select sample_id as id, identifier, details
              from warehouse.sample
            where identifier = %s
              for update
            """, (identifier,))
    elif collection_identifier:
        LOG.debug(f"Looking up sample with collection_identifier «{collection_identifier}»")
        sample = db.fetch_row("""
            select sample_id as id, collection_identifier as identifier, details
              from warehouse.sample
            where collection_identifier = %s
              for update
            """, (collection_identifier,))

    if not sample:
        LOG.error(f"No sample found with identifier «{identifier}» or collection identifier «{collection_identifier}»")
        raise SampleNotFoundError(identifier or collection_identifier)

    LOG.info(f"Found sample {sample.id} «{sample.identifier}»")

    if additional_details:
        LOG.info(f"Updating sample {sample.id} «{sample.identifier}» details")

        update_details_nwgc_id(sample, additional_details)

        sample = db.fetch_row("""
            update warehouse.sample
               set details = coalesce(details, '{}') || %s
             where sample_id = %s
            returning sample_id as id, identifier
            """, (Json(additional_details), sample.id))

        assert sample.id, "Updating details affected no rows!"

    return sample
Example #5
def upsert_presence_absence(db: DatabaseSession, identifier: str,
                            sample_id: int, target_id: int, present: bool,
                            details: dict) -> Any:
    """
    Upsert presence_absence by its *identifier*.

    Confirmed with Samplify that their numeric identifier for each test is stable
    and persistent.
    """
    LOG.debug(f"Upserting presence_absence «{identifier}»")

    data = {
        "identifier": identifier,
        "sample_id": sample_id,
        "target_id": target_id,
        "present": present,
        "details": Json(details)
    }

    presence_absence = db.fetch_row(
        """
        insert into warehouse.presence_absence (
                identifier,
                sample_id,
                target_id,
                present,
                details)
            values (
                %(identifier)s,
                %(sample_id)s,
                %(target_id)s,
                %(present)s,
                %(details)s)

        on conflict (identifier) do update
            set sample_id = excluded.sample_id,
                target_id = excluded.target_id,
                present   = excluded.present,
                details = coalesce(presence_absence.details, '{}') || excluded.details

        returning presence_absence_id as id, identifier
        """, data)

    assert presence_absence.id, "Upsert affected no rows!"

    LOG.info(f"Upserted presence_absence {presence_absence.id} \
        «{presence_absence.identifier}»")

    return presence_absence
Example #6
def upsert_encounter(db: DatabaseSession, identifier: str, encountered: str,
                     individual_id: int, site_id: int, age: Optional[str],
                     details: dict) -> Any:
    """
    Upsert encounter by its *identifier*.
    """
    LOG.debug(f"Upserting encounter «{identifier}»")

    data = {
        "identifier": identifier,
        "encountered": encountered,
        "individual_id": individual_id,
        "site_id": site_id,
        "age": age,
        "details": Json(details),
    }

    encounter = db.fetch_row(
        """
        insert into warehouse.encounter (
                identifier,
                individual_id,
                site_id,
                encountered,
                age,
                details)
            values (
                %(identifier)s,
                %(individual_id)s,
                %(site_id)s,
                %(encountered)s::timestamp with time zone,
                %(age)s,
                %(details)s)

        on conflict (identifier) do update
            set individual_id = excluded.individual_id,
                site_id       = excluded.site_id,
                encountered   = excluded.encountered,
                age           = excluded.age,
                details       = excluded.details

        returning encounter_id as id, identifier
        """, data)

    assert encounter.id, "Upsert affected no rows!"

    LOG.info(f"Upserted encounter {encounter.id} «{encounter.identifier}»")

    return encounter
Example #7
def insert_dets(db: DatabaseSession, project: Project, offers: List[dict]):
    """
    Inserts synthetic DETs into ``receiving.redcap_det`` for the REDCap record
    *offers* made for *project*.
    """
    dets = [(Json(det(project, offer, TESTING_INSTRUMENT)), )
            for offer in offers]

    LOG.info(f"Inserting {len(dets):,} synthetic REDCap DETs for {project}")

    with db.cursor() as cursor:
        execute_values(
            cursor, """
            insert into receiving.redcap_det (document) values %s
            """, dets)
Example #8
def etl_fhir(*, db: DatabaseSession):
    LOG.debug(f"Starting the FHIR ETL routine, revision {REVISION}")

    # Fetch and iterate over FHIR documents that aren't processed
    #
    # Use a server-side cursor by providing a name and limit to one fetched
    # record at a time, to limit local process size.
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same FHIR documents.
    LOG.debug("Fetching unprocessed FHIR documents")

    fhir_documents = db.cursor("fhir")
    fhir_documents.itersize = 1
    fhir_documents.execute(
        """
        select fhir_id as id, document
          from receiving.fhir
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": REVISION
        }]), ))

    for record in fhir_documents:
        with db.savepoint(f"FHIR document {record.id}"):
            LOG.info(f"Processing FHIR document {record.id}")

            assert_bundle_collection(record.document)
            bundle = Bundle(record.document)
            resources = extract_resources(bundle)

            # Loop over every Resource in the Bundle's entries, processing
            # what's needed along the way.
            try:
                assert_required_resource_types_present(resources)
                process_bundle_entries(db, bundle)

            except SkipBundleError as error:
                LOG.warning(
                    f"Skipping bundle in FHIR document «{record.id}»: {error}")
                mark_skipped(db, record.id)
                continue

            mark_processed(db, record.id, {"status": "processed"})
            LOG.info(f"Finished processing FHIR document {record.id}")
Example #9
def insert_fhir_bundle(db: DatabaseSession, bundle: dict) -> None:
    """
    Insert FHIR bundles into the receiving area of the database.
    """
    LOG.debug(f"Inserting FHIR bundle «{bundle['id']}»")

    fhir = db.fetch_row("""
        insert into receiving.fhir(document)
            values (%s)

        returning fhir_id as id
        """, (Json(bundle),))

    assert fhir.id, "Insert affected no rows!"

    LOG.info(f"Inserted FHIR document {fhir.id} «{bundle['id']}»")
Example #10
def mark_processed(db, group_id: int) -> None:
    LOG.debug(f"Marking presence_absence group {group_id} as processed")

    data = {
        "group_id": group_id,
        "log_entry": Json({
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.presence_absence
               set processing_log = processing_log || %(log_entry)s
             where presence_absence_id = %(group_id)s
            """, data)
Example #11
def mark_processed(db: DatabaseSession, det_id: int, entry = {}) -> None:
    LOG.debug(f"Appending to processing log of REDCap DET record {det_id}")

    data = {
        "det_id": det_id,
        "log_entry": Json({
            **entry,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.redcap_det
               set processing_log = processing_log || %(log_entry)s
             where redcap_det_id = %(det_id)s
            """, data)
Example #12
def mark_processed(db, clinical_id: int, entry: Mapping) -> None:
    LOG.debug(f"Marking clinical document {clinical_id} as processed")

    data = {
        "clinical_id": clinical_id,
        "log_entry": Json({
            **entry,
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.clinical
               set processing_log = processing_log || %(log_entry)s
             where clinical_id = %(clinical_id)s
            """, data)
Example #13
def mark_processed(db, fhir_id: int, entry = {}) -> None:
    LOG.debug(f"Marking FHIR document {fhir_id} as processed")

    data = {
        "fhir_id": fhir_id,
        "log_entry": Json({
            **entry,
            "etl": ETL_NAME,
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.fhir
               set processing_log = processing_log || %(log_entry)s
             where fhir_id = %(fhir_id)s
            """, data)
Example #14
def mark_processed(db, manifest_id: int, entry = {}) -> None:
    LOG.debug(f"Appending to processing log of sample manifest record {manifest_id}")

    data = {
        "manifest_id": manifest_id,
        "log_entry": Json({
            **entry,
            "etl": ETL_NAME,
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update receiving.manifest
               set processing_log = processing_log || %(log_entry)s
             where manifest_id = %(manifest_id)s
            """, data)
Example #15
def mark_processed(db, presence_absence_id: int, entry: Mapping) -> None:
    LOG.debug(dedent(f"""
    Marking reportable condition «{presence_absence_id}» as processed in the
    presence_absence table"""))

    data = {
        "presence_absence_id": presence_absence_id,
        "log_entry": Json({
            **entry,
            "revision": REVISION,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute("""
            update warehouse.presence_absence
               set details = jsonb_insert(
                       '{"reporting_log":[]}' || coalesce(details, '{}'),
                       '{reporting_log, -1}',
                       %(log_entry)s,
                       insert_after => true)
             where presence_absence_id = %(presence_absence_id)s
            """, data)
Example #16
def upsert_genome(db: DatabaseSession,
                  sequence_read_set: SequenceReadSetRecord,
                  organism: OrganismRecord, document: dict) -> GenomeRecord:
    """
    Upsert consensus genomes with the given *sequence_read_set*, *organism*,
    and consensus genome *document*.
    """
    LOG.debug(
        dedent(f"""
    Upserting genome with sequence read set {sequence_read_set.id},
    organism {organism.id} «{organism.lineage}»"""))

    data = {
        "sample_id": sequence_read_set.sample_id,
        "organism_id": organism.id,
        "sequence_read_set_id": sequence_read_set.id,
        "additional_details": Json(document['summary_stats'])
    }

    genome: GenomeRecord = db.fetch_row(
        """
        insert into warehouse.consensus_genome (sample_id, organism_id,
            sequence_read_set_id, details)
          values (%(sample_id)s, %(organism_id)s, %(sequence_read_set_id)s,
                %(additional_details)s)

        on conflict (sample_id, organism_id, sequence_read_set_id) do update
            set details = %(additional_details)s

        returning consensus_genome_id as id, sample_id, organism_id, sequence_read_set_id
        """, data)

    assert genome.id, "Upsert affected no rows!"

    LOG.info(
        dedent(f"""
    Upserted genome {genome.id} with sample ID «{genome.sample_id}»,
    organism ID «{genome.organism_id}», and sequence read set ID «{genome.sequence_read_set_id}»
    """))

    return genome
Example #17
def mark_enrollment_processed(db, enrollment_id: int) -> None:
    LOG.debug(f"Marking enrollment {enrollment_id} as processed")

    data = {
        "enrollment_id": enrollment_id,
        "log_entry": Json({
            "revision": ENROLLMENTS_REVISION,
            "etl": ETL_NAME,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute(
            """
            update receiving.enrollment
               set processing_log = processing_log || %(log_entry)s
             where enrollment_id = %(enrollment_id)s
            """, data)
Example #18
def mark_manifest_processed(db, manifest_id: int, entry={}) -> None:
    LOG.debug(f"Marking manifest {manifest_id} as processed")

    data = {
        "manifest_id": manifest_id,
        "log_entry": Json({
            **entry,
            "revision": MANIFEST_REVISION,
            "etl": ETL_NAME,
            "timestamp": datetime.now(timezone.utc),
        }),
    }

    with db.cursor() as cursor:
        cursor.execute(
            """
            update receiving.manifest
               set processing_log = processing_log || %(log_entry)s
             where manifest_id = %(manifest_id)s
            """, data)
Example #19
def find_or_create_individual(db: DatabaseSession,
                              identifier: str,
                              sex: str,
                              details: dict = None) -> Any:
    """
    Select individual by *identifier*, or insert it if it doesn't exist.
    """
    LOG.debug(f"Looking up individual «{identifier}»")

    individual = db.fetch_row(
        """
        select individual_id as id, identifier
          from warehouse.individual
         where identifier = %s
        """, (identifier, ))

    if individual:
        LOG.info(f"Found individual {individual.id} «{individual.identifier}»")
    else:
        LOG.debug(f"individual «{identifier}» not found, adding")

        data = {
            "identifier": identifier,
            "sex": sex,
            "details": Json(details),
        }

        individual = db.fetch_row(
            """
            insert into warehouse.individual (identifier, sex, details)
                values (%(identifier)s, %(sex)s, %(details)s)
            returning individual_id as id, identifier
            """, data)

        LOG.info(
            f"Created individual {individual.id} «{individual.identifier}»")

    return individual
Example #20
def etl_presence_absence(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the presence_absence ETL routine, revision {REVISION}")

    # Fetch and iterate over presence-absence tests that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same presence-absence tests.
    LOG.debug("Fetching unprocessed presence-absence tests")

    presence_absence = db.cursor("presence_absence")
    presence_absence.itersize = 1
    presence_absence.execute(
        """
        select presence_absence_id as id, document,
               received::date as received_date
          from receiving.presence_absence
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "revision": REVISION
        }]), ))

    for group in presence_absence:
        with db.savepoint(f"presence_absence group {group.id}"):
            LOG.info(f"Processing presence_absence group {group.id}")

            # Samplify will now send documents with a top level key
            # "samples". The new format also includes a "chip" key for each
            # sample which is then included in the unique identifier for
            # each presence/absence result
            #   -Jover, 14 Nov 2019
            try:
                received_samples = group.document["samples"]
            except KeyError as error:
                # Skip documents in the old format because they do not
                # include the "chip" key which is needed for the
                # unique identifier for each result.
                #   -Jover, 14 Nov 2019
                # Also skip old format to avoid ingesting wrong data from
                # plate swapped data! This will lead to 188 samples with the
                # wrong nwgc_id associated with them.
                #   -Jover, 06 Dec 2019
                if (group.document.get("store") is not None
                        or group.document.get("Update") is not None):

                    LOG.info(
                        "Skipping presence_absence record that is in old format"
                    )
                    mark_processed(db, group.id)
                    continue

                else:
                    raise error from None

            for received_sample in received_samples:
                received_sample_barcode = received_sample.get("investigatorId")
                if not received_sample_barcode:
                    LOG.info(
                        f"Skipping sample «{received_sample['sampleId']}» without SFS barcode"
                    )
                    continue

                # Don't go any further if the sample is marked as Failed
                sample_failed = received_sample.get("sampleFailed")
                if sample_failed is True:
                    LOG.info(
                        f"Skipping sample «{received_sample_barcode}» that has been failed"
                    )
                    continue

                # Don't go any further if there are no results to import.
                test_results = received_sample["targetResults"]

                if not test_results:
                    LOG.warning(
                        f"Skipping sample «{received_sample_barcode}» without any results"
                    )
                    continue

                received_sample_id = str(received_sample["sampleId"])
                chip = received_sample.get("chip")
                extraction_date = received_sample.get("extractionDate")
                assay_name = received_sample.get("assayName")
                assay_date = received_sample.get("assayDate")
                # The assayType field will be removed after Samplify starts
                # sending us OpenArray results with target.clinicalStatus.
                #
                # kfay, 28 Dec 2020
                assay_type = received_sample.get("assayType")

                # Guard against empty chip values
                assert chip or "chip" not in received_sample, "Received bogus chip id"

                # Must be current results
                LOG.info(f"Processing sample «{received_sample_barcode}»")

                if not received_sample.get("isCurrentExpressionResult"):
                    LOG.warning(
                        f"Skipping out-of-date results for sample «{received_sample_barcode}»"
                    )
                    continue

                # Barcode must match a known identifier
                received_sample_identifier = sample_identifier(
                    db, received_sample_barcode)

                if not received_sample_identifier:
                    LOG.warning(
                        f"Skipping results for sample without a known identifier «{received_sample_barcode}»"
                    )
                    continue

                # Track Samplify's internal ids for our samples, which is
                # unfortunately necessary for linking genomic data NWGC also
                # sends.
                sample = update_sample(
                    db,
                    identifier=received_sample_identifier,
                    additional_details=sample_details(received_sample))

                # Finally, process all results.
                for test_result in test_results:
                    test_result_target_id = test_result["geneTarget"]
                    LOG.debug(
                        f"Processing target «{test_result_target_id}» for "
                        f"sample «{received_sample_barcode}»")

                    # Skip this result if it's actually a non-result
                    present = target_present(test_result)

                    if present is ...:
                        LOG.debug(
                            f"No test result for «{test_result_target_id}», skipping"
                        )
                        continue

                    # Most of the time we expect to see existing targets so a
                    # select-first approach makes the most sense to avoid useless
                    # updates.
                    target = find_or_create_target(
                        db,
                        identifier=test_result_target_id,
                        control=target_control(test_result["controlStatus"]))

                    # The unique identifier for each result.  If chip is
                    # applicable, then it's included to differentiate the same
                    # sample being run on multiple chips (uncommon, but it
                    # happens).
                    if chip:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}/{chip}"
                    else:
                        identifier = f"NWGC/{received_sample_id}/{target.identifier}"

                    # Most of the time we expect to see new samples and new
                    # presence_absence tests, so an insert-first approach makes more sense.
                    # Presence-absence tests we see more than once are presumed to be
                    # corrections.
                    upsert_presence_absence(db,
                                            identifier=identifier,
                                            sample_id=sample.id,
                                            target_id=target.id,
                                            present=present,
                                            details=presence_absence_details(
                                                test_result,
                                                group.received_date, chip,
                                                extraction_date, assay_name,
                                                assay_date, assay_type))

            mark_processed(db, group.id)

            LOG.info(f"Finished processing presence_absence group {group.id}")
Example #21
def etl_longitudinal(*, db: DatabaseSession):
    LOG.debug(f"Starting the longitudinal ETL routine, revision {REVISION}")

    # Fetch and iterate over longitudinal records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same longitudinal records.
    LOG.debug("Fetching unprocessed longitudinal records")

    longitudinal = db.cursor("longitudinal")
    longitudinal.execute(
        """
        select longitudinal_id as id, document
          from receiving.longitudinal
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "revision": REVISION
        }]), ))

    for record in longitudinal:
        with db.savepoint(f"longitudinal record {record.id}"):
            LOG.info(f"Processing longitudinal record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db, record.document)

            # Check sample exists in database
            if received_sample_identifier:
                sample = find_sample(db, identifier=received_sample_identifier)
            else:
                sample = None

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(
                db,
                identifier=site_identifier(record.document),
                details={"type": record.document['type']})

            # Most of the time we expect to see existing individuals and new
            # encounters.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = find_or_create_individual(
                db,
                identifier=record.document["individual"],
                sex=sex(record.document))

            encounter = upsert_encounter(
                db,
                identifier=record.document["identifier"],
                encountered=record.document["encountered"],
                individual_id=individual.id,
                site_id=site.id,
                age=age(record.document),
                details=encounter_details(record.document))

            if sample:
                sample = update_sample(db,
                                       sample=sample,
                                       encounter_id=encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                tract = find_location(db, "tract", str(tract_identifier))
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                                          encounter_id=encounter.id,
                                          relation="residence",
                                          location_id=tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing longitudinal record {record.id}")
Example #22
        def decorated(*args,
                      db: DatabaseSession,
                      log_output: bool,
                      det_limit: int = None,
                      redcap_api_batch_size: int,
                      geocoding_cache: str = None,
                      **kwargs):
            LOG.debug(
                f"Starting the REDCap DET ETL routine {name}, revision {revision}"
            )

            project = Project(redcap_url, project_id)

            if det_limit:
                LOG.debug(f"Processing up to {det_limit:,} pending DETs")
                limit = sql.Literal(det_limit)
            else:
                LOG.debug(f"Processing all pending DETs")
                limit = sql.SQL("all")

            redcap_det = db.cursor(f"redcap-det {name}")
            redcap_det.execute(
                sql.SQL("""
                select redcap_det_id as id, document
                  from receiving.redcap_det
                 where not processing_log @> %s
                   and document::jsonb @> %s
                 order by id
                 limit {}
                   for update
                """).format(limit), (Json([etl_id]), Json(det_contains)))

            # First loop of the DETs to determine how to process each one.
            # Uses `first_complete_dets` to keep track of which DET to
            # use to process a unique REDCap record.
            # Uses `all_dets` to keep track of the status for each DET record
            # so that they can be processed in order of `redcap_det_id` later.
            #   --Jover, 21 May 2020
            first_complete_dets: Dict[str, Any] = {}
            all_dets: List[Dict[str, str]] = []
            for det in redcap_det:
                instrument = det.document['instrument']
                record_id = det.document['record']
                # Assume we are loading all DETs
                # Status will be updated to "skip" if DET does not need to be processed
                det_record = {"id": det.id, "status": "load"}

                # Only pull REDCap record if
                # `include_incomplete` flag was not included and
                # the current instrument is complete
                if not include_incomplete and not is_complete(
                        instrument, det.document):
                    det_record.update({
                        "status": "skip",
                        "reason": "incomplete/unverified DET"
                    })

                # Check if this record already has an earlier DET.
                # Skip the latest DET in favor of the first one to preserve
                # the first-in-first-out semantics of our receiving tables.
                elif first_complete_dets.get(record_id):
                    det_record.update({
                        "status": "skip",
                        "reason": "repeat REDCap record"
                    })

                else:
                    first_complete_dets[record_id] = det
                    det_record["record_id"] = record_id

                all_dets.append(det_record)

            if not first_complete_dets:
                LOG.info("No new complete DETs found.")
            else:
                # Batch request records from REDCap
                LOG.info(f"Fetching REDCap project {project_id}")
                record_ids = list(first_complete_dets.keys())

                LOG.info(
                    f"Fetching {len(record_ids):,} REDCap records from project {project.id}"
                )

                # Convert list of REDCap records to a dict so that
                # records can be looked up by record id.
                # Records with repeating instruments or longitudinal
                # events will have multiple entries in the list.
                redcap_records: DefaultDict[str, List[dict]] = defaultdict(list)

                batches = list(chunked(record_ids, redcap_api_batch_size))

                for i, batch in enumerate(batches, 1):
                    LOG.info(
                        f"Fetching REDCap record batch {i:,}/{len(batches):,} of size {len(batch):,}"
                    )

                    for record in project.records(ids=batch,
                                                  raw=raw_coded_values):
                        redcap_records[record.id].append(record)

            # Process all DETs in order of redcap_det_id
            with pickled_cache(geocoding_cache) as cache:
                for det in all_dets:
                    with db.savepoint(f"redcap_det {det['id']}"):
                        LOG.info(f"Processing REDCap DET {det['id']}")

                        if det["status"] == "skip":
                            LOG.debug(
                                f"Skipping REDCap DET {det['id']} due to {det['reason']}"
                            )
                            mark_skipped(db, det["id"], etl_id, det["reason"])
                            continue

                        received_det = first_complete_dets.pop(
                            det["record_id"])
                        redcap_record_instances = redcap_records.get(
                            received_det.document["record"])

                        if not redcap_record_instances:
                            LOG.debug(
                                f"REDCap record is missing or invalid.  Skipping REDCap DET {received_det.id}"
                            )
                            mark_skipped(db, received_det.id, etl_id,
                                         "invalid REDCap record")
                            continue

                        bundle = routine(
                            db=db,
                            cache=cache,
                            det=received_det,
                            redcap_record_instances=redcap_record_instances)

                        if not bundle:
                            LOG.debug(
                                f"Skipping REDCap DET {received_det.id} due to insufficient data in REDCap record."
                            )
                            mark_skipped(db, received_det.id, etl_id,
                                         "insufficient data in record")
                            continue

                        if log_output:
                            print(as_json(bundle))

                        insert_fhir_bundle(db, bundle)
                        mark_loaded(db, received_det.id, etl_id, bundle['id'])
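The batch loop above appears to rely on more_itertools.chunked, which splits the record IDs into fixed-size lists (the last batch may be shorter):

from more_itertools import chunked

list(chunked(["r1", "r2", "r3", "r4", "r5"], 2))
#  → [['r1', 'r2'], ['r3', 'r4'], ['r5']]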
Example #23
def upsert_sample(db: DatabaseSession,
                  identifier: str,
                  collection_identifier: Optional[str],
                  collection_date: Optional[str],
                  additional_details: dict) -> Tuple[Any, str]:
    """
    Upsert sample by its *identifier* and/or *collection_identifier*.

    An existing sample has its *identifier*, *collection_identifier*, and
    *collection_date* updated, and the provided *additional_details* are
    merged (at the top-level only) into the existing sample details, if any.

    Raises an exception if there is more than one matching sample.
    """
    data = {
        "identifier": identifier,
        "collection_identifier": collection_identifier,
        "collection_date": collection_date,
        "additional_details": Json(additional_details),
    }

    # Look for existing sample(s)
    with db.cursor() as cursor:
        cursor.execute("""
            select sample_id as id, identifier, collection_identifier, encounter_id
              from warehouse.sample
             where identifier = %(identifier)s
                or collection_identifier = %(collection_identifier)s
               for update
            """, data)

        samples = list(cursor)

    # Nothing found → create
    if not samples:
        LOG.info("Creating new sample")
        status = 'created'
        sample = db.fetch_row("""
            insert into warehouse.sample (identifier, collection_identifier, collected, details)
                values (%(identifier)s,
                        %(collection_identifier)s,
                        date_or_null(%(collection_date)s),
                        %(additional_details)s)
            returning sample_id as id, identifier, collection_identifier, encounter_id
            """, data)

    # One found → update
    elif len(samples) == 1:
        status = 'updated'
        sample = samples[0]

        LOG.info(f"Updating existing sample {sample.id}")
        sample = db.fetch_row("""
            update warehouse.sample
               set identifier = %(identifier)s,
                   collection_identifier = %(collection_identifier)s,
                   collected = date_or_null(%(collection_date)s),
                   details = coalesce(details, '{}') || %(additional_details)s

             where sample_id = %(sample_id)s

            returning sample_id as id, identifier, collection_identifier, encounter_id
            """,
            { **data, "sample_id": sample.id })

        assert sample.id, "Update affected no rows!"

    # More than one found → error
    else:
        raise Exception(f"More than one sample matching sample and/or collection barcodes: {samples}")

    return sample, status
Example #24
def import_(features_path, scale, scale_from, hierarchy, hierarchy_by_feature,
            hierarchy_from, identifier_from, point_from,
            simplified_polygons_path, if_exists_action):
    """
    Import locations from GeoJSON, Shapefile, or tabular delimited text (CSV/TSV).

    Run `id3c location --help` for general information about locations.

    Many common geospatial formats are supported via Fiona
    <https://fiona.readthedocs.io/en/latest/README.html>, including:

    \b
    * GeoJSON (top-level type must be ``FeatureCollection``)
    * Shapefiles (*.shp with sidecar *.dbf file; may be zipped)
    * tabular (CSV or TSV, delimiter auto-detected, no geometry construction)

    If a tabular data file is provided, at least one of --point-from or
    --simplified-polygons should be used to provide a geometry.

    Each location's (scale, identifier) pair is automatically added to its
    hierarchy during import.  This is a non-configurable convention that could
    be made optional in the future.
    """

    # First, a handful of accessor and transformation functions to make the
    # code that follows more readable.  These functions make use of the command
    # options.
    def identifier(feature):
        if identifier_from:
            return feature["properties"].pop(identifier_from)
        else:
            return feature["id"]

    def geometry_type(feature):
        return feature["geometry"]["type"] if feature["geometry"] else None

    def point(feature):
        if point_from:
            return {
                "type": "Point",
                "coordinates": (
                    float(feature["properties"].pop(point_from[0])),
                    float(feature["properties"].pop(point_from[1])),
                )
            }
        elif geometry_type(feature) == "Point":
            return feature["geometry"]
        else:
            return None

    def polygon(feature):
        if geometry_type(feature) in {"Polygon", "MultiPolygon"}:
            return feature["geometry"]
        else:
            return None

    def get_hierarchy(feature, feature_identifier):
        if hierarchy_df is None:
            return hierarchy or feature["properties"].pop(hierarchy_from, None)

        hierarchy_list = hierarchy_df.loc[hierarchy_df['feature_identifier'] ==
                                          feature_identifier].to_dict('records')
        hierarchies = ''
        if hierarchy_list:
            # This assumes only one row has a matching feature_identifier
            hierarchy_map = hierarchy_list[0]
            hierarchy_map.pop('feature_identifier')
            for key, val in hierarchy_map.items():
                if pd.notna(val):
                    hierarchies += (key + '=>' + val + ',')

        return hierarchies + hierarchy

    # Technically PostGIS' SRIDs don't have to match the EPSG id, but in my
    # experience, they always do in practice.  If that doesn't hold true in the
    # future, then a lookup of (auth_name, auth_id) in spatial_ref_sys table
    # will be needed to map to srid.
    #   -trs, 2 Dec 2019

    def as_location(feature):
        feature_identifier = identifier(feature)
        return {
            "scale": scale or feature["properties"].pop(scale_from),
            "identifier": feature_identifier,
            "hierarchy": get_hierarchy(feature, feature_identifier),
            "point": point(feature),
            "polygon": polygon(feature),
            "srid": feature["crs"]["EPSG"],
            "details": feature["properties"],
        }

    def as_simplified(feature):
        return {
            "identifier": identifier(feature),
            "polygon": polygon(feature),
            "srid": feature["crs"]["EPSG"],
        }

    # Now, read in the data files and convert to our internal structure.
    LOG.info(f"Reading features from «{features_path}»")

    hierarchy_df = None
    if hierarchy_by_feature:
        hierarchy_df = pd.read_csv(hierarchy_by_feature, dtype=str)
        if "feature_identifier" not in hierarchy_df.columns:
            raise Exception(
                "hierarchy_by_feature CSV must include 'feature_identifier' column"
            )
        duplicated = hierarchy_df["feature_identifier"].duplicated(keep=False)
        dup_identifiers = list(
            hierarchy_df["feature_identifier"][duplicated].unique())
        assert len(dup_identifiers) == 0, \
            f"Found duplicate feature_identifiers: {dup_identifiers}"

    locations = list(map(as_location, parse_features(features_path)))

    if simplified_polygons_path:
        LOG.info(
            f"Reading simplified polygons from «{simplified_polygons_path}»")
        simplified_polygons = list(
            map(as_simplified, parse_features(simplified_polygons_path)))
    else:
        simplified_polygons = []

    # Finally, do the updates in the database
    db = DatabaseSession()

    try:
        LOG.info(f"Importing locations")

        insert = SQL("""
            with new_location as (
                select
                    scale,
                    identifier,
                    coalesce(lower(hierarchy)::hstore, '') || hstore(lower(scale), lower(identifier)) as hierarchy,
                    st_transform(st_setsrid(st_geomfromgeojson(point), location.srid), 4326) as point,
                    st_transform(st_setsrid(st_multi(st_geomfromgeojson(location.polygon)), location.srid), 4326) as polygon,
                    st_transform(st_setsrid(st_multi(st_geomfromgeojson(simplified.polygon)), simplified.srid), 4326) as simplified_polygon,
                    details
                from jsonb_to_recordset(%s)
                    as location ( scale text
                                , identifier text
                                , hierarchy text
                                , point text
                                , polygon text
                                , srid integer
                                , details jsonb
                                )
                left join jsonb_to_recordset(%s)
                    as simplified ( identifier text
                                  , polygon text
                                  , srid integer
                                  )
                    using (identifier)
            ),
            inserted as (
                insert into warehouse.location (scale, identifier, hierarchy, point, polygon, simplified_polygon, details)
                table new_location
                {on_conflict}
                returning scale, identifier
            )
            select
                count(*) filter (where inserted is not null),
                count(*) filter (where inserted is not null and point is not null) as with_point,
                count(*) filter (where inserted is not null and polygon is not null) as with_polygon,
                count(*) filter (where inserted is not null and simplified_polygon is not null) as with_simplified_polygon,
                count(*) - count(*) filter (where inserted is not null) as skipped
            from new_location
            left join inserted using (scale, identifier)
        """)

        on_conflict = {
            "error": SQL(""),
            "update": SQL("""
                on conflict (scale, identifier) do update
                    set hierarchy           = EXCLUDED.hierarchy,
                        point               = EXCLUDED.point,
                        polygon             = EXCLUDED.polygon,
                        simplified_polygon  = EXCLUDED.simplified_polygon,
                        details             = EXCLUDED.details
            """),
            "skip": SQL("on conflict (scale, identifier) do nothing"),
        }

        imported = db.fetch_row(
            insert.format(on_conflict=on_conflict[if_exists_action]),
            (Json(locations), Json(simplified_polygons)))

        LOG.info(
            dedent(f"""\
            Imported {imported.count:,} locations
              {imported.with_point:,} with a point
              {imported.with_polygon:,} with a polygon
              {imported.with_simplified_polygon:,} with a simplified polygon
              {imported.skipped:,} skipped
            """))
        LOG.info("Committing all changes")
        db.commit()

    except:
        LOG.info("Rolling back all changes; the database will not be modified")
        db.rollback()
        raise
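The hierarchy strings assembled in get_hierarchy() ('key=>value,' pairs) follow Postgres' hstore text format, which the insert casts with ::hstore and extends via || and hstore(). An illustrative query (assumes the hstore extension is installed):

select ('country=>us,state=>wa'::hstore || hstore('tract', '123')) -> 'state';
--  → 'wa'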
Example #25
def kit_manifests(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the kits manifests ETL routine, revision {MANIFEST_REVISION}"
    )

    LOG.debug("Fetching unprocessed manifest records")

    manifest = db.cursor("manifest")
    manifest.execute(
        """
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": MANIFEST_REVISION
        }]), ))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Mark record as skipped
            # if it does not contain a kit related sample
            if "kit" not in manifest_record.document:
                LOG.info(
                    f"Skipping manifest record {manifest_record.id} without kit data"
                )
                mark_skipped(db, manifest_record.id)
                continue

            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            # Mark record as skipped
            # if it has an unknown sample barcode
            if not sample_identifier:
                LOG.warning(
                    f"Skipping manifest record with unknown sample barcode «{sample_barcode}»"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if the sample identifier set is unexpected
            if sample_identifier.set_name not in expected_identifier_sets["samples"]:
                LOG.warning(
                    f"Skipping manifest record with sample identifier found in "
                    f"set «{sample_identifier.set_name}», not {expected_identifier_sets['samples']}"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # Find sample that should have been created from this
            # manifest record via etl manifest
            sample = find_sample(db, sample_identifier.uuid)

            # Error out the kit etl process if no sample found
            # The kit etl process can try again starting with this record
            # next time with the idea that the sample will be
            # created by then.
            if not sample:
                raise SampleNotFoundError(
                    f"No sample with «{sample_identifier.uuid}» found")

            # Mark record as skipped if the sample does not have a
            # sample type (utm or rdt)
            if sample.type not in {"utm", "rdt"}:
                LOG.info(f"Skipping manifest record {manifest_record.id} " +
                         f"with unknown sample type {sample.type}")
                mark_skipped(db, manifest_record.id)
                continue

            kit_barcode = manifest_record.document.pop("kit")
            kit_identifier = find_identifier(db, kit_barcode)

            # Mark record as skipped if it has an unknown kit barcode
            if not kit_identifier:
                LOG.warning(
                    f"Skipping kit with unknown barcode «{kit_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            # Mark record as skipped if kit identifier set is unexpected
            if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                LOG.warning(
                    f"Skipping kit with identifier found in "
                    f"set «{kit_identifier.set_name}», not {expected_identifier_sets['kits']}"
                )
                mark_skipped(db, manifest_record.id)
                continue

            # List of extra data not needed for kit record that can
            # be removed before adding manifest document to kit details
            extra_data = [
                "collection", "sample_type", "aliquot_date", "aliquots",
                "racks"
            ]
            for key in extra_data:
                manifest_record.document.pop(key, None)

            # Try to find identifier for the test-strip barcode for rdt samples
            if sample.type == "rdt":
                update_test_strip(db, manifest_record.document)

            kit, status = upsert_kit_with_sample(
                db,
                identifier=kit_identifier.uuid,
                sample=sample,
                additional_details=manifest_record.document)

            if status == "updated":
                update_sample(db, sample, kit.encounter_id)

            mark_loaded(db, manifest_record.id, status, kit.id)
Example #26
def upsert_kit_with_sample(db: DatabaseSession, identifier: str,
                           sample: SampleRecord,
                           additional_details: dict) -> Tuple[KitRecord, str]:
    """
    Upsert kit by its *identifier* to include link to a sample.

    An existing kit has its *sample_id* updated, and the provided
    *additional_details* are merged (at the top level only) into
    the existing kit details, if any.
    """
    LOG.debug(f"Upserting kit «{identifier}»")

    data = {
        "identifier": identifier,
        "sample_id": sample.id,
        "additional_details": Json(additional_details)
    }

    if sample.type == 'utm':
        sample_type = "utm_sample_id"
    elif sample.type == 'rdt':
        sample_type = "rdt_sample_id"

    # Look for existing kit
    kit = find_kit(db, identifier)

    # Nothing found → create
    if not kit:
        LOG.info("Creating new kit")
        status = "created"
        kit = db.fetch_row(
            sql.SQL("""
            insert into warehouse.kit (identifier, {}, details)
                values(%(identifier)s,
                       %(sample_id)s,
                       %(additional_details)s)
            returning kit_id as id,
                      identifier,
                      encounter_id,
                      {}
            """).format(sql.Identifier(sample_type),
                        sql.Identifier(sample_type)), data)

    # Found kit → update
    else:
        status = "updated"
        kit_sample_id = getattr(kit, sample_type)
        # Warn if kit is already linked to a different sample!
        if (kit_sample_id and (sample.id != kit_sample_id)):
            LOG.warning(
                f"Kit «{kit.id}» already linked to another " +
                f"{sample_type} «{kit_sample_id}», linking with «{sample.id}» instead"
            )

        kit = db.fetch_row(
            sql.SQL("""
            update warehouse.kit
               set {} = %(sample_id)s,
                   details = coalesce(details, {}) || %(additional_details)s

             where kit_id = %(kit_id)s

            returning kit_id as id,
                      identifier,
                      encounter_id,
                      {}
            """).format(sql.Identifier(sample_type), sql.Literal(Json({})),
                        sql.Identifier(sample_type)), {
                            **data, "kit_id": kit.id
                        })

    assert kit.id, "Upsert affected no rows!"

    LOG.info(f"Upserted kit {kit.id} with identifier «{kit.identifier}» " +
             f"linked to {sample_type} «{getattr(kit, sample_type)}»")

    return kit, status
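Because column names can't travel as ordinary query parameters, the function composes them with psycopg2's sql module. A minimal standalone sketch of that mechanism:

from psycopg2 import sql

# sql.Identifier safely quotes a dynamic column name; values still bind
# as normal parameters at execute() time.
query = sql.SQL("update warehouse.kit set {} = %(sample_id)s").format(
    sql.Identifier("utm_sample_id"))
# cursor.execute(query, {"sample_id": 1}) renders:
#   update warehouse.kit set "utm_sample_id" = %(sample_id)s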
Example #27
def upsert_kit_with_encounter(
        db: DatabaseSession, identifier: str, encounter_id: int,
        additional_details: dict) -> Tuple[KitRecord, str]:
    """
    Upsert kit by its *identifier* to include link to encounter.

    An existing kit has its *encounter_id* updated, and the provided
    *additional_details* are merged (at the top level only) into
    the existing kit details, if any.
    """
    LOG.debug(f"Upserting kit «{identifier}»")

    data = {
        "identifier": identifier,
        "encounter_id": encounter_id,
        "additional_details": Json(additional_details)
    }

    # Look for existing kit
    kit = find_kit(db, identifier)

    # Nothing found → create
    if not kit:
        LOG.info("Creating new kit")
        status = "created"
        kit = db.fetch_row(
            """
            insert into warehouse.kit (identifier, encounter_id, details)
                values(%(identifier)s,
                       %(encounter_id)s,
                       %(additional_details)s)
            returning kit_id as id, identifier, encounter_id, null as rdt_sample_id, null as utm_sample_id
            """, data)

    # Found kit → update
    else:
        status = "updated"
        # Warn if kit is already linked to a different encounter!
        if kit.encounter_id and kit.encounter_id != encounter_id:
            LOG.warning(
                f"Kit «{kit.id}» already linked to another encounter «{kit.encounter_id}», linking with «{encounter_id}» instead"
            )

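        # jsonb's || operator merges at the top level only: nested objects are
        # replaced wholesale, e.g. {"a": {"x": 1}} || {"a": {"y": 2}} yields
        # {"a": {"y": 2}}.  The coalesce guards against a null details column,
        # since null || x is null.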
        kit = db.fetch_row(
            """
            update warehouse.kit
               set encounter_id = %(encounter_id)s,
                   details = coalesce(details, '{}') || %(additional_details)s

             where kit_id = %(kit_id)s

            returning kit_id as id, identifier, encounter_id, rdt_sample_id, utm_sample_id
            """, {
                **data, "kit_id": kit.id
            })

    assert kit.id, "Upsert affected no rows!"

    LOG.info(
        f"Upserted kit {kit.id} with identifier «{kit.identifier}» linked to encounter «{kit.encounter_id}»"
    )

    return kit, status
Example #28
0
def etl_manifest(*, db: DatabaseSession):
    LOG.debug(f"Starting the manifest ETL routine, revision {REVISION}")

    # XXX TODO: Stop hardcoding valid identifier sets.  Instead, accept them as
    # an option or config (and validate option choices against what's actually
    # in the database).  We won't want to validate using click.option(),
    # because that would necessitate a database connection simply to run
    # bin/id3c at all.
    #   -trs, 13 May 2019
    expected_identifier_sets = {
        "samples": {"samples"},
        "collections": {
            "collections-environmental",
            "collections-fluathome.org",
            "collections-household-intervention",
            "collections-household-intervention-asymptomatic",
            "collections-household-observation",
            "collections-household-observation-asymptomatic",
            "collections-kiosks",
            "collections-kiosks-asymptomatic",
            "collections-seattleflu.org",
            "collections-swab&send",
            "collections-swab&send-asymptomatic",
            "collections-self-test",
            "collections-scan",
            "collections-haarvi",
            "samples-haarvi",
        },
        "rdt": {"collections-fluathome.org"}
    }

    # Fetch and iterate over samples that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same samples.
    LOG.debug("Fetching unprocessed manifest records")

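    # processing_log is a jsonb array, and the containment operator @> tests
    # whether it already holds an entry for this ETL name and revision; only
    # rows without such an entry are fetched.  The named cursor is server-side,
    # so rows stream to the client instead of being fetched all at once.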
    manifest = db.cursor("manifest")
    manifest.execute("""
        select manifest_id as id, document
          from receiving.manifest
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{ "etl": ETL_NAME, "revision": REVISION }]),))

    for manifest_record in manifest:
        with db.savepoint(f"manifest record {manifest_record.id}"):
            LOG.info(f"Processing record {manifest_record.id}")

            # Convert sample barcode to full identifier, ensuring it's
            # known and from the correct identifier set.
            sample_barcode = manifest_record.document.pop("sample")
            sample_identifier = find_identifier(db, sample_barcode)

            if not sample_identifier:
                LOG.warning(f"Skipping sample with unknown sample barcode «{sample_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            if manifest_record.document.get("sample_type") == "rdt":
                assert sample_identifier.set_name in expected_identifier_sets["rdt"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                    f"not {expected_identifier_sets['rdt']}")
            else:
                assert sample_identifier.set_name in expected_identifier_sets["samples"], \
                    (f"Sample identifier found in set «{sample_identifier.set_name}», " +
                    f"not {expected_identifier_sets['samples']}")

            # Optionally, convert the collection barcode to full
            # identifier, ensuring it's known and from the correct
            # identifier set.
            collection_barcode = manifest_record.document.pop("collection", None)
            collection_identifier = find_identifier(db, collection_barcode) if collection_barcode else None

            if collection_barcode and not collection_identifier:
                LOG.warning(f"Skipping sample with unknown collection barcode «{collection_barcode}»")
                mark_skipped(db, manifest_record.id)
                continue

            assert not collection_identifier \
                or collection_identifier.set_name in expected_identifier_sets["collections"], \
                    f"Collection identifier found in set «{collection_identifier.set_name}», not {expected_identifier_sets['collections']}" # type: ignore

            # Sample collection date
            collection_date = manifest_record.document.get("date")

            # Upsert sample cooperatively with enrollments ETL routine
            #
            # The details document was intentionally modified by two pop()s
            # earlier to remove barcodes that were looked up.  The
            # rationale is that we want just one clear place in the
            # warehouse for each piece of information.
            sample, status = upsert_sample(db,
                identifier            = sample_identifier.uuid,
                collection_identifier = collection_identifier.uuid if collection_identifier else None,
                collection_date       = collection_date,
                additional_details    = manifest_record.document)

            mark_loaded(db, manifest_record.id,
                status = status,
                sample_id = sample.id)

            LOG.info(f"Finished processing manifest record {manifest_record.id}")
Example #29
0
def etl_clinical(*, db: DatabaseSession):
    LOG.debug(f"Starting the clinical ETL routine, revision {REVISION}")

    # Fetch and iterate over clinical records that aren't processed
    #
    # Rows we fetch are locked for update so that two instances of this
    # command don't try to process the same clinical records.
    LOG.debug("Fetching unprocessed clinical records")

    clinical = db.cursor("clinical")
    clinical.execute("""
        select clinical_id as id, document
          from receiving.clinical
         where not processing_log @> %s
         order by id
           for update
        """, (Json([{ "revision": REVISION }]),))

    for record in clinical:
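        # Each record gets its own savepoint so that an error while processing
        # one record can be rolled back without abandoning the whole batch.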
        with db.savepoint(f"clinical record {record.id}"):
            LOG.info(f"Processing clinical record {record.id}")

            # Check validity of barcode
            received_sample_identifier = sample_identifier(db,
                record.document["barcode"])

            # Skip row if no matching identifier found
            if received_sample_identifier is None:
                LOG.info("Skipping due to unknown barcode " + \
                          f"{record.document['barcode']}")
                mark_skipped(db, record.id)
                continue

            # Check sample exists in database
            sample = find_sample(db,
                identifier = received_sample_identifier)

            # Skip row if sample does not exist
            if sample is None:
                LOG.info("Skipping due to missing sample with identifier " + \
                            f"{received_sample_identifier}")
                mark_skipped(db, record.id)
                continue

            # Most of the time we expect to see existing sites so a
            # select-first approach makes the most sense to avoid useless
            # updates.
            site = find_or_create_site(db,
                identifier = site_identifier(record.document["site"]),
                details    = {"type": "retrospective"})

            # Most of the time we expect to see new individuals and new
            # encounters, so an insert-first approach makes more sense.
            # Encounters we see more than once are presumed to be
            # corrections.
            individual = upsert_individual(db,
                identifier  = record.document["individual"],
                sex         = sex(record.document["AssignedSex"]))

            encounter = upsert_encounter(db,
                identifier      = record.document["identifier"],
                encountered     = record.document["encountered"],
                individual_id   = individual.id,
                site_id         = site.id,
                age             = age(record.document),
                details         = encounter_details(record.document))

            sample = update_sample(db,
                sample = sample,
                encounter_id = encounter.id)

            # Link encounter to a Census tract, if we have it
            tract_identifier = record.document.get("census_tract")

            if tract_identifier:
                # Special-case float-like identifiers in earlier data,
                # e.g. "53033005600.0" → "53033005600"
                tract_identifier = re.sub(r'\.0$', '', str(tract_identifier))

                tract = find_location(db, "tract", tract_identifier)
                assert tract, f"Tract «{tract_identifier}» is unknown"

                upsert_encounter_location(db,
                    encounter_id = encounter.id,
                    relation = "residence",
                    location_id = tract.id)

            mark_processed(db, record.id, {"status": "processed"})

            LOG.info(f"Finished processing clinical record {record.id}")
Example #30
0
def kit_enrollments(*, db: DatabaseSession):
    LOG.debug(
        f"Starting the kit enrollments ETL routine, revision {ENROLLMENTS_REVISION}"
    )

    expected_barcode_types = {"ScannedSelfSwab", "ManualSelfSwab"}

    LOG.debug("Fetching unprocessed enrollments")
    enrollments = db.cursor("enrollments")
    enrollments.execute(
        """
        select enrollment_id as id, document
          from receiving.enrollment
         where not processing_log @> %s
         order by id
          for update
        """, (Json([{
            "etl": ETL_NAME,
            "revision": ENROLLMENTS_REVISION
        }]), ))

    for enrollment in enrollments:
        with db.savepoint(f"enrollment {enrollment.id}"):
            LOG.info(f"Processing enrollment {enrollment.id}")

            # Find encounter that should have been created
            # from this enrollment record through etl enrollments
            encounter = find_encounter(db, enrollment.document["id"])

            # Error out the kit etl process if no encounter found
            # The kit etl process can try again starting with this record
            # next time with the idea that the encounter will be
            # created by then.
            if not encounter:
                raise EncounterNotFoundError(
                    f"No encounter with identifier «{enrollment.document['id']}» found"
                )

            # Skip and mark the enrollment document as processed if the
            # encounter found is linked to a site that is not self-test
            if encounter.site != "self-test":
                LOG.debug(
                    f"Found encounter {encounter.id} «{encounter.identifier}» "
                    + f"linked to site «{encounter.site}», not 'self-test'")
                mark_enrollment_processed(db, enrollment.id)
                continue

            for code in enrollment.document["sampleCodes"]:
                barcode = code.get("code")

                # Kit must have a barcode
                if not barcode:
                    LOG.warning(f"No barcode found in sampleCodes {code}")
                    continue

                # Barcode must be of expected barcode type
                if code["type"] not in expected_barcode_types:
                    LOG.debug(f"Skipping barcode with type {code['type']}")
                    continue

                # Convert kit barcode to full identifier
                kit_identifier = find_identifier(db, barcode)

                if not kit_identifier:
                    LOG.warning(
                        f"Skipping kit with unknown barcode «{barcode}»")
                    continue

                if kit_identifier.set_name not in expected_identifier_sets["kits"]:
                    LOG.warning(
                        "Skipping kit with identifier found in set "
                        f"«{kit_identifier.set_name}», not {expected_identifier_sets['kits']}"
                    )
                    continue

                details = {"type": code["type"]}

                kit, status = upsert_kit_with_encounter(
                    db,
                    identifier=kit_identifier.uuid,
                    encounter_id=encounter.id,
                    additional_details=details)

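                # Presumably refreshes the kit's sample links now that its
                # encounter changed; update_kit_samples isn't shown in this
                # excerpt.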
                if status == "updated":
                    update_kit_samples(db, kit)

            mark_enrollment_processed(db, enrollment.id)

            LOG.info(f"Finished processing enrollment {enrollment.id}")