Ejemplo n.º 1
0
    def parse_nbhd(self, nbhd_object, date):
        """Convert one GeoJSON neighborhood feature into Sheepdog records.

        Args:
            nbhd_object (dict): GeoJSON-style feature whose "properties"
                mapping contains "community" (neighborhood name),
                "value" (death count) and "population".
            date (str): date the values were reported for.

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records.
        """
        properties = nbhd_object["properties"]
        nbhd = properties["community"]
        deaths = properties["value"]
        population = properties["population"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "nbhd": nbhd},
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "community_area": nbhd,
            "projects": [{"code": self.project_code}],
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            # Guard against ZeroDivisionError for neighborhoods reported
            # with a zero (or missing/falsy) population; the rate is then
            # unknown rather than a crash.
            "deaths_per_10000": (
                round(10000 * deaths / population, 2) if population else None
            ),
            "deaths": deaths,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        return summary_location, summary_clinical
Ejemplo n.º 2
0
    def __init__(self, base_url, access_token, s3_bucket):
        """Set up the ChestX-ray8 ETL: helpers, the core_metadata_collection
        seed record, and the buffer for imaging_file records."""
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "ChestX-ray8"

        # Both helpers target the same program/project with the same token.
        helper_kwargs = dict(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.metadata_helper = MetadataHelper(**helper_kwargs)
        self.file_helper = FileHelper(**helper_kwargs)

        self.cmc_submitter_id = format_submitter_id("cmc_chestxray8", {})
        self.core_metadata_collection = [
            {
                "submitter_id": self.cmc_submitter_id,
                "projects": [{"code": self.project_code}],
            }
        ]
        self.imaging_file = []
Ejemplo n.º 3
0
    def parse_row(self, row):
        """Map one provider (NPI) row onto Sheepdog records.

        Args:
            row (dict): one record with NPI registry columns.

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records.
        """
        # source column -> (target node, target field)
        fields_mapping = {
            "NPI": ("summary_location", "npi"),
            "Provider_First_Line_Business_Pra": (
                "summary_location",
                "first_line_address",
            ),
            "Provider_Second_Line_Business_Pr": (
                "summary_location",
                "second_line_address",
            ),
            "Provider_Business_Practice_City": ("summary_location", "city"),
            "Provider_Business_Practice_ST": ("summary_location", "province_state"),
            "TaxonomyCode": ("summary_clinical", "taxonomy_code"),
            "ProviderType": ("summary_clinical", "provider_type"),
            "ProviderSubtype": ("summary_clinical", "provider_subtype"),
            "DetailedSpecialty": ("summary_clinical", "detailed_specialty"),
        }

        npi = row["NPI"]
        state = row["Provider_Business_Practice_ST"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": state, "npi": npi},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {},
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
        }
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [
                {"submitter_id": summary_location_submitter_id}
            ],
        }
        nodes = {
            "summary_location": summary_location,
            "summary_clinical": summary_clinical,
        }

        for src_field, (node, dst_field) in fields_mapping.items():
            value = row[src_field]
            # NPI is forced to a string; all other fields pass through as-is.
            nodes[node][dst_field] = str(value) if dst_field == "npi" else value

        return summary_location, summary_clinical
Ejemplo n.º 4
0
    def parse_historical(self, utilization,
                         summary_clinical_statewide_current):
        """Build a statewide "summary_clinical" record from one historical
        hospital-utilization entry.

        If the derived submitter_id matches the current statewide record's
        submitter_id, the current record's values overwrite the historical
        ones.
        """
        # source key in the utilization feed -> target field name
        utilization_mapping = {
            "reportDate": "date",
            "TotalBeds": "state_total_beds",
            "TotalOpenBeds": "total_open_beds",
            "TotalInUseBedsNonCOVID": "total_in_use_beds_non_covid",
            "TotalInUseBedsCOVID": "total_in_use_beds_covid",
            "ICUBeds": "icu_beds",
            "ICUOpenBeds": "icu_open_beds",
            "ICUInUseBedsNonCOVID": "icu_in_use_beds_non_covid",
            "ICUInUseBedsCOVID": "icu_in_use_beds_covid",
            "VentilatorCapacity": "ventilator_capacity",
            "VentilatorAvailable": "ventilator_available",
            "VentilatorInUseNonCOVID": "ventilator_in_use_non_covid",
            "VentilatorInUseCOVID": "ventilator_in_use_covid",
        }
        date = utilization["reportDate"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state,
            },
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"project": "idph_hospital", "date": date},
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [
                {"submitter_id": summary_location_submitter_id}
            ],
        }
        # Copy every utilization metric under its mapped field name.
        summary_clinical.update(
            (utilization_mapping[src], value)
            for src, value in utilization.items()
        )

        if (summary_clinical_submitter_id
                == summary_clinical_statewide_current["submitter_id"]):
            summary_clinical.update(summary_clinical_statewide_current)

        return summary_clinical
Ejemplo n.º 5
0
    def __init__(self, base_url, access_token, s3_bucket):
        """Set up the NCBI COVID-19 ETL: helpers, file clients, per-node
        submission buffers, and the SRA manifest map."""
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.accession_number_filename_map = {}

        # Both helpers target the same program/project with the same token.
        helper_kwargs = dict(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.metadata_helper = MetadataHelper(**helper_kwargs)
        self.file_helper = AsyncFileHelper(**helper_kwargs)

        self.data_file = NCBI_FILE(
            base_url=self.base_url,
            s3_bucket=self.project_code,
            access_token=access_token,
        )

        # One submission buffer per node type, in submission order.
        node_names = (
            "sample",
            "virus_sequence",
            "core_metadata_collection",
            "virus_sequence_run_taxonomy",
            "virus_sequence_contig",
            "virus_sequence_blastn",
            "virus_sequence_contig_taxonomy",
            "virus_sequence_peptide",
            "virus_sequence_hmm_search",
        )
        self.submitting_data = {name: [] for name in node_names}

        self.submitting_data["core_metadata_collection"].append(
            {
                "submitter_id": format_submitter_id("cmc_ncbi_covid19", {}),
                "projects": [{"code": self.project_code}],
            }
        )

        read_ncbi_manifest(
            self.manifest_bucket,
            self.sra_src_manifest,
            self.accession_number_filename_map,
        )
Ejemplo n.º 6
0
    def parse_facility(self, date, facility):
        """Generate Sheepdog records for one reporting facility.

        Args:
            date (str): reporting date.
            facility (dict): facility entry with "County", "FacilityName",
                "confirmed_cases", "deaths" and an optional "status".

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records.
        """
        county_name = facility["County"]
        org_name = facility["FacilityName"]
        confirmed = facility["confirmed_cases"]
        death_count = facility["deaths"]
        org_status = facility.get("status", None)

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "country": self.country,
                "state": self.state,
                "facility_name": org_name,
                "reporting_org_status": org_status,
            },
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        summary_location = {
            "country_region": self.country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "province_state": self.state,
            "county": county_name,
            "reporting_org": org_name,
            "reporting_org_status": org_status,
        }
        summary_clinical = {
            "confirmed": confirmed,
            "deaths": death_count,
            "submitter_id": summary_clinical_submitter_id,
            "lastUpdateEt": date,
            "date": date,
            "summary_locations": [
                {"submitter_id": summary_location_submitter_id}
            ],
        }

        return summary_location, summary_clinical
Ejemplo n.º 7
0
    def parse_region(self, date, hospital_region):
        """Generate Sheepdog records for one hospital region.

        Args:
            date (str): reporting date.
            hospital_region (dict): region entry with "region",
                "region_description" and ICU/ventilator capacity fields.

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records.
        """
        region_id = hospital_region["region"]
        description = hospital_region["region_description"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state,
                "region": region_id,
            },
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"project": "idph_hospital", "date": date},
        )

        summary_location = {
            "country_region": self.country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "province_state": self.state,
            "state_hospital_region": region_id,
            "state_region_description": strip_prefix(description),
        }
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [
                {"submitter_id": summary_location_submitter_id}
            ],
            "region_icu_avail": hospital_region["ICUAvail"],
            "region_icu_capacity": hospital_region["ICUCapacity"],
            "region_vents_available": hospital_region["VentsAvailable"],
            "region_vents_capacity": hospital_region["VentsCapacity"],
        }

        return summary_location, summary_clinical
Ejemplo n.º 8
0
 def get_location_and_clinical_submitter_id(self, county, date):
     """Return the (summary_location, summary_clinical) submitter IDs.

     When `county` is None the location ID is built at state level only.
     """
     location_fields = {"country": self.country, "state": self.state}
     if county is not None:
         location_fields["county"] = county
     summary_location_submitter_id = format_submitter_id(
         "summary_location", location_fields
     )
     summary_clinical_submitter_id = derived_submitter_id(
         summary_location_submitter_id,
         "summary_location",
         "summary_clinical",
         {"date": date},
     )
     return summary_location_submitter_id, summary_clinical_submitter_id
Ejemplo n.º 9
0
    def files_to_submissions(self):
        """Upload ChestX-ray8 images (if needed) and build imaging_file records.

        Walks the "No_findings" and "Pneumonia" image directories, uploads
        any file not already present in indexd, and appends one
        "imaging_file_chestxray8" record per image to `self.imaging_file`.
        """
        dataset_root = (
            Path(CHESTXRAY8_DATA_PATH)
            .joinpath("COVID-19")
            .joinpath("X-Ray Image DataSet")
        )
        for image_type in ("No_findings", "Pneumonia"):
            for image_filepath in dataset_root.joinpath(image_type).iterdir():
                did, rev, md5, size = self.file_helper.find_by_name(
                    image_filepath.name)
                if not did:
                    guid = self.file_helper.upload_file(image_filepath)
                    print(
                        f"file {image_filepath.name} uploaded with guid: {guid}"
                    )
                    # BUGFIX: the record built below previously kept the
                    # falsy `did` as "object_id" for freshly uploaded files;
                    # use the GUID returned by the upload instead.
                    # NOTE(review): md5/size are still unset for fresh
                    # uploads (find_by_name returned nothing) — confirm
                    # whether upload_file indexes them server-side.
                    did = guid
                else:
                    print(
                        f"file {image_filepath.name} exists in indexd... skipping..."
                    )

                imaging_file_submitter_id = format_submitter_id(
                    "imaging_file_chestxray8",
                    {"filename": image_filepath.name})
                uploaded_imaging_file = {
                    "submitter_id": imaging_file_submitter_id,
                    "core_metadata_collections": [
                        {"submitter_id": self.cmc_submitter_id}
                    ],
                    "data_type": "PNG",
                    "data_format": "Image File",
                    "data_category": "X-Ray Image",
                    "file_name": image_filepath.name,
                    "file_size": size,
                    "md5sum": md5,
                    "object_id": did,
                    # The directory name doubles as the clinical label.
                    "clinical_notes": image_type,
                }

                self.imaging_file.append(uploaded_imaging_file)
Ejemplo n.º 10
0
    def parse_historical_data(self, illinois_data):
        """
        Parses historical state-level data. "summary_location" node is created
        from "characteristics_by_county" data.

        Args:
            illinois_data (dict): data JSON with "testDate", "total_tested",
                "confirmed_cases" and "deaths"

        Returns:
            dict: "summary_clinical" node for Sheepdog
        """
        county = "Illinois"

        # Normalize the feed's MM/DD/YYYY date to ISO YYYY-MM-DD.
        parsed_date = datetime.datetime.strptime(
            illinois_data["testDate"], "%m/%d/%Y"
        )
        date = parsed_date.strftime("%Y-%m-%d")

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "county": county},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        return {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": illinois_data["confirmed_cases"],
            "testing": illinois_data["total_tested"],
            "deaths": illinois_data["deaths"],
            "summary_locations": [
                {"submitter_id": summary_location_submitter_id}
            ],
        }
Ejemplo n.º 11
0
    def parse_zipcode(self, date, zipcode_values):
        """Generate Sheepdog records for one ZIP-code entry.

        Args:
            date (str): reporting date.
            zipcode_values (dict): entry with "zip", "confirmed_cases" and
                an optional "demographics" breakdown.

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records.
        """
        zipcode = zipcode_values["zip"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "zipcode": zipcode},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "country_region": self.country,
            "province_state": self.state,
            "zipcode": zipcode,
            "projects": [{"code": self.project_code}],
        }
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": zipcode_values["confirmed_cases"],
            "summary_locations": [
                {"submitter_id": summary_location_submitter_id}
            ],
        }

        if "demographics" in zipcode_values:
            demographic = zipcode_values["demographics"]

            # Flatten each demographic group into "<field>_count" /
            # "<field>_tested" entries on the clinical record; groups whose
            # mapping resolves to a falsy destination are skipped.
            for group_key, (field, mapping) in fields_mapping.items():
                for item in demographic[group_key]:
                    dst_field = mapping[item[field]]
                    if not dst_field:
                        continue
                    for metric in ("count", "tested"):
                        if metric in item:
                            summary_clinical[
                                "{}_{}".format(dst_field, metric)
                            ] = item[metric]

        return summary_location, summary_clinical
Ejemplo n.º 12
0
    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Skips processing entirely when the data's "LastUpdateDate" matches
        the latest already-submitted date.

        Args:
            latest_submitted_date (date): the date of latest available "summary_clinical" for project
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            # Nothing new since the last run: bail out before parsing.
            if latest_submitted_date and date == latest_submitted_date.strftime(
                    "%Y-%m-%d"):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            # Statewide long-term-care aggregate, stored under the
            # state-level summary_location (no facility/county in the ID).
            if "LTC_Reported_Cases" in data:
                summary_location_submitter_id = format_submitter_id(
                    "summary_location", {
                        "country": self.country,
                        "state": self.state
                    })

                summary_location = {
                    "country_region": self.country,
                    "submitter_id": summary_location_submitter_id,
                    "projects": [{
                        "code": self.project_code
                    }],
                    "province_state": self.state,
                }

                summary_clinical_submitter_id = derived_submitter_id(
                    summary_location_submitter_id,
                    "summary_location",
                    "summary_clinical",
                    {"date": date},
                )
                summary_clinical = {
                    "confirmed":
                    data["LTC_Reported_Cases"]["confirmed_cases"],
                    "deaths":
                    data["LTC_Reported_Cases"]["deaths"],
                    "submitter_id":
                    summary_clinical_submitter_id,
                    "lastUpdateEt":
                    date,
                    "date":
                    date,
                    "summary_locations": [{
                        "submitter_id":
                        summary_location_submitter_id
                    }],
                }
                self.summary_locations[
                    summary_location_submitter_id] = summary_location
                self.summary_clinicals[
                    summary_clinical_submitter_id] = summary_clinical

            # One record pair per facility; duplicates (same derived
            # submitter_id) are merged by keeping the max counts.
            for facility in data["FacilityValues"]:
                (summary_location,
                 summary_clinical) = self.parse_facility(date, facility)
                summary_location_submitter_id = summary_location[
                    "submitter_id"]
                summary_clinical_submitter_id = summary_clinical[
                    "submitter_id"]

                self.summary_locations[
                    summary_location_submitter_id] = summary_location

                # Several facility entries can collapse onto one clinical
                # submitter_id; keep the larger confirmed/death counts.
                if summary_clinical_submitter_id in self.summary_clinicals:
                    existed = self.summary_clinicals[
                        summary_clinical_submitter_id]
                    summary_clinical["confirmed"] = max(
                        summary_clinical["confirmed"], existed["confirmed"])
                    summary_clinical["deaths"] = max(
                        summary_clinical["deaths"], existed["deaths"])

                self.summary_clinicals[
                    summary_clinical_submitter_id] = summary_clinical
Ejemplo n.º 13
0
    def parse_county(self, date, county_json, demographic):
        """
        From county-level data, generate the data we can submit via Sheepdog

        Args:
            date (date): date
            county_json (dict): JSON for county statistics
            demographic (dict): statewide demographic breakdown; only used
                for the aggregated "Illinois" row

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records
        """
        county = county_json["County"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "county": county},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "country_region": self.country,
            "province_state": self.state,
            "projects": [{"code": self.project_code}],
        }

        # the IDPH data use Illinois in "County" field for aggregated data;
        # in Gen3 that equals a location with "province_state" == "IL" and
        # no "County" field
        if county != "Illinois":
            summary_location["county"] = county

        # Prefer curated coordinates; fall back to the feed's (skipping
        # the 0/0 "unknown" sentinel values).
        if county in self.county_dict:
            known = self.county_dict[county]
            summary_location["latitude"] = known["lat"]
            summary_location["longitude"] = known["lon"]
        else:
            if county_json["lat"] != 0:
                summary_location["latitude"] = str(county_json["lat"])
            if county_json["lon"] != 0:
                summary_location["longitude"] = str(county_json["lon"])

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": county_json["confirmed_cases"],
            "testing": county_json["total_tested"],
            "deaths": county_json["deaths"],
            "summary_locations": [
                {"submitter_id": summary_location_submitter_id}
            ],
        }

        if "negative" in county_json:
            summary_clinical["negative"] = county_json["negative"]

        # Statewide demographic breakdowns are attached only to the
        # aggregated "Illinois" row.
        if county == "Illinois" and demographic:
            for group_key, (field, mapping) in fields_mapping.items():
                for item in demographic[group_key]:
                    dst_field = mapping[item[field]]
                    if not dst_field:
                        continue
                    for metric in ("count", "tested"):
                        if metric in item:
                            summary_clinical[
                                "{}_{}".format(dst_field, metric)
                            ] = item[metric]

        return summary_location, summary_clinical
Ejemplo n.º 14
0
    async def files_to_node_submissions(self, node_name):
        """Get submitting data for the node.

        Queries peregrine for the accession numbers still to submit for
        `node_name`, builds one submission record per accession number
        (shape depends on the node type), attaches the matching indexd file
        metadata, and appends each record to
        `self.submitting_data[node_name]`.

        Returns:
            list: the accession numbers that were processed.

        Raises:
            Exception: if `node_name` is not one of the handled node types.
        """

        # NOTE(review): this retry loop has no sleep/backoff and no attempt
        # cap — on a persistent failure it busy-loops forever.
        retrying = True
        while retrying:
            try:
                submitting_accession_numbers = (
                    await self.get_submitting_accession_number_list(node_name))
                retrying = False
            except Exception as e:
                print(
                    f"Can not query peregine with {node_name}. Detail {e}. Retrying ..."
                )

        for accession_number in submitting_accession_numbers:
            # submitter_id for this node's own record.
            submitter_id = format_submitter_id(
                node_name, {"accession_number": accession_number})

            cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {})

            # submitter_ids of the records this node may link to.
            contig_submitter_id = format_submitter_id(
                "virus_sequence_contig",
                {"accession_number": accession_number})
            peptide_submitter_id = format_submitter_id(
                "virus_sequence_peptide",
                {"accession_number": accession_number})
            run_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_run_taxonomy",
                {"accession_number": accession_number})

            contig_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_contig_taxonomy",
                {"accession_number": accession_number})

            # Each node type gets its own link target and data_* metadata.
            if node_name == "virus_sequence_contig":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequences_run_taxonomies": [{
                        "submitter_id":
                        run_taxonomy_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence Contig",
                    "data_format":
                    "json",
                    "data_category":
                    "Nucleotide Contig",
                }
            elif node_name == "virus_sequence_blastn":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence Blastn",
                    "data_format":
                    "tsv",
                    "data_category":
                    "Nucleotide Blast",
                }
            elif node_name == "virus_sequence_peptide":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Peptides Annotation Using VIGOR3",
                    "data_format":
                    "json",
                    "data_category":
                    "Peptides Annotation",
                }
            elif node_name == "virus_sequence_hmm_search":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_peptides": [{
                        "submitter_id":
                        peptide_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence HMM Search",
                    "data_format":
                    "json",
                    "data_category":
                    "HMMER Scab of Contigs",
                }
            elif node_name == "virus_sequence_contig_taxonomy":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Contig Taxonomy",
                    "data_format":
                    "json",
                    "data_category":
                    "Kmer-based Taxonomy Analysis of Contigs",
                }

            else:
                raise Exception(f"ERROR: {node_name} does not exist")

            # Derive the file extension from the node's configured filename.
            # NOTE(review): the pattern should be a raw string (r"\.(.*)$")
            # — "\." in a plain string is an invalid escape sequence and
            # raises a warning on recent Python versions.
            ext = re.search("\.(.*)$",
                            self.data_file.nodes[node_name][0]).group(1)
            filename = f"{node_name}_{accession_number}.{ext}"

            # NOTE(review): message looks truncated — presumably it was
            # meant to include the filename; verify against history.
            print(f"Get indexd record of (unknown)")

            retrying = True
            while retrying:
                try:
                    (
                        did,
                        rev,
                        md5sum,
                        filesize,
                        file_name,
                        authz,
                    ) = await self.file_helper.async_find_by_name(
                        filename=filename)
                    retrying = False
                except Exception as e:
                    print(
                        f"ERROR: Fail to query indexd for (unknown). Detail {e}. Retrying ..."
                    )
                    await asyncio.sleep(5)

            assert (
                did
            ), f"file (unknown) does not exist in the index, rerun NCBI_FILE ETL"

            # Backfill the authz field when the indexd record lacks it,
            # giving up after MAX_RETRIES attempts.
            if not authz:
                tries = 0
                while tries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_update_authz(did=did,
                                                                  rev=rev)
                        break
                    except Exception as e:
                        tries += 1
                        print(
                            f"ERROR: Fail to update indexd for (unknown). Detail {e}. Retrying ..."
                        )
                        await asyncio.sleep(5)

            # Attach the indexd file metadata to the submission record.
            submitted_json["file_size"] = filesize
            submitted_json["md5sum"] = md5sum
            submitted_json["object_id"] = did
            submitted_json["file_name"] = file_name

            self.submitting_data[node_name].append(submitted_json)
        return submitting_accession_numbers
Ejemplo n.º 15
0
    async def files_to_virus_sequence_run_taxonomy_submission(
            self, submitting_accession_numbers):
        """Get submitting data for the virus_sequence_run_taxonomy node.

        For each accession number: pull taxonomy results from BigQuery,
        link the record to its virus_sequence node when the BigQuery row was
        parsed successfully, look the generated CSV up in indexd (retrying
        until it appears), ensure the indexd record carries authz, and append
        the assembled metadata to
        ``self.submitting_data["virus_sequence_run_taxonomy"]``.

        Args:
            submitting_accession_numbers (list): accession numbers to submit.
                The method returns immediately when this is empty.
        """
        if not submitting_accession_numbers:
            return

        records = self._get_response_from_big_query(
            submitting_accession_numbers)

        # Keep track of accession_numbers having a link to virus_sequence nodes
        accession_number_set = set()
        for record in records:
            if record["acc"] in self.accession_number_filename_map:
                accession_number = record["acc"]
                print(f"Get from bigquery response {accession_number}")
                success = await self._parse_big_query_response(record)
                if success:
                    accession_number_set.add(accession_number)

        cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {})
        for accession_number in submitting_accession_numbers:
            virus_sequence_run_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_run_taxonomy",
                {"accession_number": accession_number})
            submitted_json = {
                "submitter_id": virus_sequence_run_taxonomy_submitter_id,
                "core_metadata_collections": [{
                    "submitter_id": cmc_submitter_id
                }],
                "accession_number": accession_number,
                "data_type": "Virus Sequence Run Taxonomy Analysis",
                "data_format": "json",
                "data_category": "Kmer-based Taxonomy Analysis",
            }

            # Add link to virus sequence node
            if accession_number in accession_number_set:
                submitted_json["virus_sequences"] = [{
                    "submitter_id":
                    f"virus_sequence_{accession_number}"
                }]

            filename = f"virus_sequence_run_taxonomy_{accession_number}.csv"
            print(f"Get indexd info of (unknown)")
            trying = True
            while trying:
                try:
                    (
                        did,
                        rev,
                        md5sum,
                        filesize,
                        file_name,
                        authz,
                    ) = await self.file_helper.async_find_by_name(
                        filename=filename)
                    trying = False
                except Exception as e:
                    print(
                        f"Can not get indexd record of (unknown). Detail {e}. Retrying..."
                    )
                    # Back off before retrying, matching the sibling
                    # submission method above; without this the loop
                    # busy-spins while indexd is unavailable.
                    await asyncio.sleep(5)

            assert (
                did
            ), f"file (unknown) does not exist in the index, rerun NCBI_FILE ETL"

            if not authz:
                tries = 0
                while tries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_update_authz(did=did,
                                                                  rev=rev)
                        break
                    except Exception as e:
                        tries += 1
                        print(
                            f"Can not update indexd for {did}. Detail {e}. Retrying..."
                        )
                        # Same back-off between the bounded authz retries.
                        await asyncio.sleep(5)

            submitted_json["file_size"] = filesize
            submitted_json["md5sum"] = md5sum
            submitted_json["object_id"] = did
            submitted_json["file_name"] = file_name

            self.submitting_data["virus_sequence_run_taxonomy"].append(
                submitted_json)
Ejemplo n.º 16
0
    def parse_statewide_values(self, date, statewide_values):
        """Build summary_location / summary_clinical records for statewide
        IDPH hospital capacity values reported on the given date.

        Args:
            date: report date, folded into the derived submitter_id and
                stored on the clinical record.
            statewide_values (dict): raw statewide metrics keyed by source
                field name; every key must appear in the field map below
                (an unknown key raises KeyError).

        Returns:
            tuple: (summary_location, summary_clinical) record dicts.
        """
        # Source field name -> Gen3 summary_clinical property name.
        field_map = {
            "ICUCapacity": "state_icu_capacity",
            "ICUCovidPatients": "state_icu_covid_patients",
            "VentCapacity": "state_vent_capacity",
            "VentCovidPatients": "state_vent_covid_patients",
            "ICUAvailable": "state_icu_available",
            "VentsAvailable": "state_vents_available",
            "TotalBeds": "state_total_beds",
            "TotalBedsAvailable": "state_total_beds_available",
            "TotalBedsUsed": "state_total_beds_used",
            "PctHospitalBedsAvailable": "state_pct_hospital_beds_available",
            "AdultICUCapacity": "state_adult_icu_capacity",
            "ICUOpenBeds": "state_icu_open_beds",
            "ICUBedsUsed": "state_icu_beds_used",
            "ICUOpenBedsPct": "state_icu_open_beds_pct",
            "COVIDPUIPatients": "state_covid_pui_patients",
            "COVIDPUIPatientsPct": "state_covid_pui_patients_pct",
            "COVIDPUIPatientsBedsInUsePct": "state_covid_pui_patients_beds_in_use_pct",
            "VentilatorCapacity": "state_ventilator_capacity",
            "VentilatorsOpen": "state_ventilators_open",
            # NOTE(review): capital "V" below looks odd but is kept verbatim;
            # presumably it matches the Gen3 dictionary property — confirm
            # before "fixing".
            "VentilatorsOpenPct": "state_Ventilators_open_pct",
            "VentilatorsInUse": "state_ventilators_in_use",
            "VentilatorsInUseCOVID": "state_ventilators_in_use_covid",
            "VentilatorsCOVIDPatientsPct": "state_ventilators_covid_patients_pct",
            "VentilatorsCOVIDPatientsInUsePct": "state_ventilators_covid_patients_in_use_pct",
            "CovidPatientsNonICU": "state_covid_patients_non_icu",
            "TotalCOVIDPUIInICU": "state_total_covid_pui_in_icu",
            "TotalCOVIDPUIInHospital": "state_total_covid_pui_in_hospital",
            "PctBedsCOVIDPUI": "state_pct_beds_covid_pui",
            "MedSurgBeds": "state_med_surg_beds",
            "MedSurgBedsOpen": "state_med_surg_beds_open",
            "MedSurgBedsOpenPct": "state_med_surg_beds_open_pct",
            "MedSurgBedsInUse": "state_med_surg_beds_in_use",
        }

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"project": "idph_hospital", "country": self.country, "state": self.state},
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "country_region": self.country,
            "province_state": self.state,
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"project": "idph_hospital", "date": date},
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        # Translate every statewide metric onto its Gen3 property name.
        summary_clinical.update(
            {field_map[src]: value for src, value in statewide_values.items()}
        )

        return summary_location, summary_clinical
Ejemplo n.º 17
0
    def parse_row(self, headers, row):
        """Convert one COXRAY metadata CSV row into Gen3 node records.

        Builds core_metadata_collection, study, subject, observation,
        follow_up and demographic records, adds an imaging_file record when
        the referenced image exists on disk (looking it up in indexd), then
        copies the remaining mapped CSV fields onto their target nodes.

        Args:
            headers (list): CSV header names, used to locate columns.
            row (list): one CSV data row.

        Returns:
            dict: node records keyed by node type.
        """

        def cell(column):
            # Value of the named CSV column in this row; raises ValueError
            # if the column is absent (same as the inline index lookups).
            return row[headers.index(column)]

        cmc_submitter_id = format_submitter_id("cmc_coxray", {})
        subject_submitter_id = format_submitter_id(
            "subject_coxray", {"patientid": cell("patientid")}
        )
        observation_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "observation_coxray", {}
        )
        follow_up_submitter_id = derived_submitter_id(
            subject_submitter_id,
            "subject_coxray",
            "follow_up_coxray",
            {"offset": cell("offset")},
        )
        demographic_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "demographic_coxray", {}
        )
        imaging_file_submitter_id = format_submitter_id(
            "imaging_file_coxray", {"filename": cell("filename")}
        )
        study_submitter_id = format_submitter_id(
            "study_coxray", {"doi": cell("doi")}
        )

        filename = Path(cell("filename"))
        filepath = Path(COXRAY_DATA_PATH).joinpath("images", filename)

        nodes = {
            "core_metadata_collection": {
                "submitter_id": cmc_submitter_id,
                "projects": [{"code": self.project_code}],
            },
            "study": {
                "submitter_id": study_submitter_id,
                "projects": [{"code": self.project_code}],
            },
            "subject": {
                "submitter_id": subject_submitter_id,
                "projects": [{"code": self.project_code}],
                "studies": [{"submitter_id": study_submitter_id}],
            },
            "observation": {
                "submitter_id": observation_submitter_id,
                "subjects": [{"submitter_id": subject_submitter_id}],
            },
            "follow_up": {
                "submitter_id": follow_up_submitter_id,
                "subjects": [{"submitter_id": subject_submitter_id}],
            },
            "demographic": {
                "submitter_id": demographic_submitter_id,
                "subjects": [{"submitter_id": subject_submitter_id}],
            },
        }

        if filepath.exists():
            data_type = "".join(filename.suffixes)
            did, rev, md5sum, filesize = self.file_helper.find_by_name(
                filename=filename
            )
            assert (
                did
            ), f"file (unknown) does not exist in the index, rerun COXRAY_FILE ETL"
            self.file_helper.update_authz(did=did, rev=rev)

            nodes["imaging_file"] = {
                "submitter_id": imaging_file_submitter_id,
                "subjects": [{"submitter_id": subject_submitter_id}],
                "follow_ups": [{"submitter_id": follow_up_submitter_id}],
                "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                "data_type": data_type,
                "data_format": "Image File",
                "data_category": "X-Ray Image",
                "file_size": filesize,
                "md5sum": md5sum,
                "object_id": did,
            }
        else:
            print(
                f"subject references the file that doesn't exist as a file: {filepath}"
            )

        # Copy the remaining mapped CSV fields onto their target nodes,
        # skipping empty values and nodes that were not created above.
        for column, (node, field, converter) in fields_mapping.items():
            value = cell(column)
            if node in nodes and value:
                nodes[node][field] = converter(value) if converter else value

        return nodes
Ejemplo n.º 18
0
    def parse_input(self, row_data, date_mode=None):
        """Parse one input row into summary_location and
        statistical_summary_report records and store them in
        ``self.records``.

        Args:
            row_data (dict): raw row keyed by the original property names
                listed in ``mapping`` below.
            date_mode: passed through to ``format_value`` for date parsing.
        """
        # (original property, (gen3 node, gen3 property, property type))
        mapping = [
            ("reportingOrg", ("summary_location", "reporting_org", str)),
            ("reportDate", ("statistical_summary_report", "report_date", str)),
            ("num_COVID", ("statistical_summary_report", "num_COVID", int)),
            (
                "num_COVID_deaths",
                ("statistical_summary_report", "num_COVID_deaths", int),
            ),
            ("num_outpatient", ("statistical_summary_report", "num_outpatient",
                                int)),
            ("num_admitted", ("statistical_summary_report", "num_admitted",
                              int)),
            ("num_icu", ("statistical_summary_report", "num_icu", int)),
            ("num_vent", ("statistical_summary_report", "num_vent", int)),
            ("num_resp", ("statistical_summary_report", "num_resp", int)),
            ("num_pneu", ("statistical_summary_report", "num_pneu", int)),
            ("num_diab", ("statistical_summary_report", "num_diab", int)),
            ("num_asth", ("statistical_summary_report", "num_asth", int)),
            ("num_obes", ("statistical_summary_report", "num_obes", int)),
            ("num_card", ("statistical_summary_report", "num_card", int)),
            ("num_chf", ("statistical_summary_report", "num_chf", int)),
        ]

        # row_records = { <node ID>: { <record data> } }
        # (there is only 1 record of each node type per row)
        row_records = defaultdict(dict)

        # Copy each present (truthy) source value onto its target node,
        # converting via format_value; falsy/missing values are skipped.
        for orig_prop_name, (node_type, prop_name, _type) in mapping:
            if row_data[orig_prop_name]:
                row_records[node_type][prop_name] = format_value(
                    prop_name, row_data[orig_prop_name], _type, date_mode)

        # add missing summary_location props
        # NOTE(review): this raises KeyError if the row had no truthy
        # "reportingOrg" — presumably every row has one; confirm upstream.
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "reporting_org":
                row_records["summary_location"]["reporting_org"]
            },
        )
        row_records["summary_location"].update({
            "type": "summary_location",
            "submitter_id": summary_location_submitter_id,
            "projects": {
                "code": self.project_code
            },
            "country_region": self.country,
            "province_state": self.state,
        })

        # add missing statistical_summary_report props
        ssr_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "statistical_summary_report",
            "ssr",
            {
                "report_date":
                row_records["statistical_summary_report"]["report_date"]
            },
        )
        row_records["statistical_summary_report"].update({
            "type": "statistical_summary_report",
            "submitter_id": ssr_submitter_id,
            "summary_locations": {
                "submitter_id": summary_location_submitter_id
            },
        })

        # Index every record by submitter_id under its node type.
        for node_type in row_records:
            rec = row_records[node_type]
            self.records[node_type][rec["submitter_id"]] = rec