Example #1
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "NPI-PRO"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"

        self.summary_locations = []
        self.summary_clinicals = []
Example #2
class JHU_COUNTRY_CODES(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        codes_dict = get_codes_dictionary()
        locations = self.get_existing_locations()
        for location in locations:
            codes = get_codes_for_country_name(codes_dict,
                                               location["country_region"])

            # do not update the record if it already has the codes
            if (
                location["iso2"] == codes["iso2"]
                and location["iso3"] == codes["iso3"]
            ):
                continue

            record = {k: v for k, v in location.items() if v is not None}
            record.update({
                "type": "summary_location",
                "projects": [{
                    "code": self.project_code
                }],
                "iso2": codes["iso2"],
                "iso3": codes["iso3"],
            })
            self.metadata_helper.add_record_to_submit(record)

    def submit_metadata(self):
        self.metadata_helper.batch_submit_records()

    def get_existing_locations(self):
        print("Getting summary_location data from Peregrine")
        query_string = ('{ summary_location (first: 0, project_id: "' +
                        self.program_name + "-" + self.project_code +
                        '") { submitter_id, country_region, iso2, iso3 } }')
        query_res = self.metadata_helper.query_peregrine(query_string)
        return [location for location in query_res["data"]["summary_location"]]
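These ETL classes share the same BaseETL shape: build records in files_to_submissions(), then push them in submit_metadata(). A minimal driver sketch, assuming placeholder values for the URL and bucket (the real runner in the source project may differ):

# Hypothetical driver for the class above; URL and bucket are placeholders.
etl = JHU_COUNTRY_CODES(
    base_url="https://example-commons.org",
    access_token=access_token,  # assumed to be obtained elsewhere
    s3_bucket="example-bucket",
)
etl.files_to_submissions()  # build the records to submit
etl.submit_metadata()       # batch-submit them via MetadataHelper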
Example #3
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Vaccine"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.country = "US"
        self.state = "IL"
        self.date = ""
        self.counties_inventory = {}

        self.summary_locations = {}
        self.summary_clinicals = {}
        self.summary_group_demographic = {}
Example #4
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        # Get all input strings from YAML
        script = path.splitext(path.basename(__file__))[0].strip("/")
        script = path.join(CURRENT_DIR, script + ".yaml")
        with open(script) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        self.verbose = config["verbose"]
        self.program_name = config["program_name"]
        self.project_code = config["project_code"]
        self.virus_genome_data_category = config["virus_genome_data_category"]
        self.virus_genome_data_type = config["virus_genome_data_type"]
        self.virus_genome_data_format = config["virus_genome_data_format"]
        self.virus_genome_source = config["virus_genome_source"]
        self.virus_genome_type = config["virus_genome_type"]
        self.virus_sequence_type = config["virus_sequence_type"]
        self.virus_sequence_data_type = config["virus_sequence_data_type"]
        self.virus_sequence_data_format = config["virus_sequence_data_format"]
        self.virus_sequence_alignment_type = config["virus_sequence_alignment_type"]
        self.virus_sequence_alignment_data_type = config[
            "virus_sequence_alignment_data_type"
        ]
        self.virus_sequence_alignment_data_format = config[
            "virus_sequence_alignment_data_format"
        ]
        self.virus_sequence_alignment_tool = config["virus_sequence_alignment_tool"]
        self.virus_sequence_hmm_type = config["virus_sequence_hmm_type"]
        self.virus_sequence_hmm_data_type = config["virus_sequence_hmm_data_type"]
        self.virus_sequence_hmm_data_format = config["virus_sequence_hmm_data_format"]
        self.virus_genomes = []
        self.virus_sequences = []
        self.virus_sequence_alignments = []
        self.virus_sequence_hmms = []

        self.metadata_helper = MetadataHelper(
            base_url=base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
Example #5
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.token = access_token
        self.last_submission_identifier = None

        self.file_helper = AsyncFileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
Example #6
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "ATLAS"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.nodes = {
            "summary_location": [],
            "summary_socio_demographic": [],
        }
Example #7
def main():
    headers = {"Authorization": f"bearer {access_token}"}
    records = get_existing_data(base_url, program, project, old_node, headers)

    metadata_helper = MetadataHelper(
        base_url=base_url,
        program_name=program,
        project_code=project,
        access_token=access_token,
    )
    print(f"Submitting {new_node} data")
    for old_rec in records:
        new_rec = {"type": new_node, "project_id": f"{program}-{project}"}
        for key, value in old_rec.items():
            if value:
                new_rec[key] = value
        metadata_helper.add_record_to_submit(new_rec)
    metadata_helper.batch_submit_records()
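get_existing_data is not shown in this example. A minimal sketch of what it might look like, assuming the usual Gen3 Peregrine GraphQL endpoint and modeled on get_existing_locations() from Example #2; the real helper presumably also selects the node's other fields, since main() copies every non-empty field:

import requests

def get_existing_data(base_url, program, project, node, headers):
    # Hypothetical sketch: fetch all records of `node` for the given project
    # via Peregrine GraphQL. The field list here is illustrative; the real
    # query would request every field that should be copied to the new node.
    query = '{ %s (first: 0, project_id: "%s-%s") { submitter_id } }' % (
        node,
        program,
        project,
    )
    res = requests.post(
        f"{base_url}/api/v0/submission/graphql",
        json={"query": query},
        headers=headers,
    )
    res.raise_for_status()
    return res.json()["data"][node]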
Example #8
class CHESTXRAY8(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "ChestX-ray8"

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.file_helper = FileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.cmc_submitter_id = format_submitter_id("cmc_chestxray8", {})
        self.core_metadata_collection = [
            {
                "submitter_id": self.cmc_submitter_id,
                "projects": [{"code": self.project_code}],
            }
        ]
        self.imaging_file = []

    def files_to_submissions(self):
        for image_type in ("No_findings", "Pneumonia"):
            for image_filepath in (
                    Path(CHESTXRAY8_DATA_PATH).joinpath("COVID-19").joinpath(
                        "X-Ray Image DataSet").joinpath(image_type).iterdir()):
                did, rev, md5, size = self.file_helper.find_by_name(
                    image_filepath.name)
                if not did:
                    guid = self.file_helper.upload_file(image_filepath)
                    print(
                        f"file {image_filepath.name} uploaded with guid: {guid}"
                    )
                else:
                    print(
                        f"file {image_filepath.name} exists in indexd... skipping..."
                    )

                imaging_file_submitter_id = format_submitter_id(
                    "imaging_file_chestxray8",
                    {"filename": image_filepath.name})
                uploaded_imaging_file = {
                    "submitter_id": imaging_file_submitter_id,
                    "core_metadata_collections": [
                        {"submitter_id": self.cmc_submitter_id}
                    ],
                    "data_type": "PNG",
                    "data_format": "Image File",
                    "data_category": "X-Ray Image",
                    "file_name": image_filepath.name,
                    "file_size": size,
                    "md5sum": md5,
                    "object_id": did,
                    "clinical_notes": image_type,
                }

                self.imaging_file.append(uploaded_imaging_file)

    def submit_metadata(self):
        print("Submitting data...")

        print("Submitting core_metadata_collection data")
        for cmc in self.core_metadata_collection:
            cmc_record = {"type": "core_metadata_collection"}
            cmc_record.update(cmc)
            self.metadata_helper.add_record_to_submit(cmc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting imaging_file data")
        for ifile in self.imaging_file:
            if_record = {"type": "imaging_file"}
            if_record.update(ifile)
            self.metadata_helper.add_record_to_submit(if_record)
        self.metadata_helper.batch_submit_records()
Example #9
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.header_to_column = {}

        self.program_name = "open"
        self.project_code = "CTP"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.expected_file_headers = set([
            "date",
            "state",
            "positive",
            "negative",
            "pending",
            "totalTestResults",
            "hospitalizedCurrently",
            "hospitalizedCumulative",
            "inIcuCurrently",
            "inIcuCumulative",
            "onVentilatorCurrently",
            "onVentilatorCumulative",
            "recovered",
            "dataQualityGrade",
            "lastUpdateEt",
            "dateModified",
            "checkTimeEt",
            "death",
            "hospitalized",
            "dateChecked",
            "totalTestsViral",
            "positiveTestsViral",
            "negativeTestsViral",
            "positiveCasesViral",
            "deathConfirmed",
            "deathProbable",
            "totalTestEncountersViral",
            "totalTestsPeopleViral",
            "totalTestsAntibody",
            "positiveTestsAntibody",
            "negativeTestsAntibody",
            "totalTestsPeopleAntibody",
            "positiveTestsPeopleAntibody",
            "negativeTestsPeopleAntibody",
            "totalTestsPeopleAntigen",
            "positiveTestsPeopleAntigen",
            "totalTestsAntigen",
            "positiveTestsAntigen",
            "fips",
            "positiveIncrease",
            "negativeIncrease",
            "total",
            "totalTestResultsSource",
            "totalTestResultsIncrease",
            "posNeg",
            "deathIncrease",
            "hospitalizedIncrease",
            "hash",
            "commercialScore",
            "negativeRegularScore",
            "negativeScore",
            "positiveScore",
            "score",
            "grade",
        ])

        self.expected_race_headers = set([
            "Date",
            "State",
            "Cases_Total",
            "Cases_White",
            "Cases_Black",
            "Cases_Latinx",
            "Cases_Asian",
            "Cases_AIAN",
            "Cases_NHPI",
            "Cases_Multiracial",
            "Cases_Other",
            "Cases_Unknown",
            "Cases_Ethnicity_Hispanic",
            "Cases_Ethnicity_NonHispanic",
            "Cases_Ethnicity_Unknown",
            "Deaths_Total",
            "Deaths_White",
            "Deaths_Black",
            "Deaths_Latinx",
            "Deaths_Asian",
            "Deaths_AIAN",
            "Deaths_NHPI",
            "Deaths_Multiracial",
            "Deaths_Other",
            "Deaths_Unknown",
            "Deaths_Ethnicity_Hispanic",
            "Deaths_Ethnicity_NonHispanic",
            "Deaths_Ethnicity_Unknown",
        ])
Example #10
class NCBI_MANIFEST(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.token = access_token
        self.last_submission_identifier = None

        self.file_helper = AsyncFileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def read_ncbi_manifest(self, key):
        """read the manifest"""
        tries = 0
        last_row_num = 0
        while tries < MAX_RETRIES:
            try:
                s3 = boto3.resource("s3",
                                    config=Config(signature_version=UNSIGNED))
                s3_object = s3.Object(self.manifest_bucket, key)
                line_stream = codecs.getreader("utf-8")
                row_num = 0
                for line in line_stream(s3_object.get()["Body"]):
                    row_num = row_num + 1
                    if row_num < last_row_num:
                        continue
                    if row_num % 1000 == 0:
                        print(f"Processed {row_num} rows of {key}")
                    words = line.split("\t")
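                    # Tab-separated manifest columns used below (other columns are skipped):
                    # 0 = GUID, 2 = file size, 3 = md5, 5 = URL, 6 = release timestamp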
                    guid = conform_data_format(words[0].strip(), "guid")
                    size = int(conform_data_format(words[2].strip(), "size"))
                    md5 = conform_data_format(words[3].strip(), "md5")
                    authz = f"/programs/{self.program_name}/project/{self.project_code}"
                    url = conform_data_format(words[5].strip(), "url")
                    release_date = parse(
                        re.sub(r":[0-9]{3}", "", words[6].strip()))
                    yield guid, size, md5, authz, url, release_date
                break
            except Exception as e:
                print(f"Can not stream {key}. Retrying...")
                time.sleep(30)
                tries += 1
                last_row_num = row_num

    def submit_metadata(self):
        start = time.strftime("%X")

        loop = asyncio.get_event_loop()
        try:
            loop.run_until_complete(
                asyncio.gather(self.index_manifest(self.sra_src_manifest)))
            future = AsyncFileHelper.close_session()
            if future:
                loop.run_until_complete(asyncio.gather(future))

        finally:
            loop.close()
        end = time.strftime("%X")
        print(f"Running time: From {start} to {end}")

    async def index_manifest(self, manifest):
        query_string = ('{ project (first: 0, dbgap_accession_number: "' +
                        self.project_code +
                        '") { last_submission_identifier } }')
        try:
            response = self.metadata_helper.query_peregrine(query_string)
            self.last_submission_identifier = parse(
                response["data"]["project"][0]["last_submission_identifier"])
        except Exception as ex:
            self.last_submission_identifier = None

        now = datetime.datetime.now()
        last_submission_date_time = now.strftime("%m/%d/%Y, %H:%M:%S")

        for (guid, size, md5, authz, url,
             release_date) in self.read_ncbi_manifest(manifest):
            if (not self.last_submission_identifier
                    or release_date > self.last_submission_identifier):
                filename = url.split("/")[-1]
                retrying = True

                while retrying:
                    try:
                        did, _, _, _, _, _ = await self.file_helper.async_find_by_name(
                            filename)
                        retrying = False
                    except Exception as e:
                        print(
                            f"ERROR: Failed to query indexd for {filename}. Detail: {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

                if did:
                    print(f"{filename} was already indexed")
                    continue

                print(f"start to index {filename}")
                retries = 0
                while retries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_index_record(
                            guid, size, filename, url, authz, md5)
                        break
                    except Exception as e:
                        retries += 1
                        print(
                            f"ERROR: Failed to create new indexd record for {guid}. Detail: {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

        headers = {
            "content-type": "application/json",
            "Authorization": f"Bearer {self.access_token}",
        }
        record = {
            "code": self.project_code,
            "dbgap_accession_number": self.project_code,
            "last_submission_identifier": last_submission_date_time,
        }
        res = requests.put(
            "{}/api/v0/submission/{}".format(self.base_url, self.program_name),
            headers=headers,
            data=json.dumps(record),
        )
Example #11
class STOPLIGHT(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_clinicals = []
        self.summary_locations = []
        self.program_name = "open"
        self.project_code = "covidstoplight"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads json files and converts the data to Sheepdog records
        """
        url = "https://covidstoplight.org/api/v0/location/US"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a json file to data we can submit via Sheepdog. Stores the
        records to submit in `self.location_data` and `self.time_series_data`.
        Ignores any records that are already in Sheepdog (relies on unique
        `submitter_id` to check)

        Args:
            url (str): URL at which the file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            timestamp_created = data["data"]["generated"]
            country = data["country_code"]
            summary_location_list = []
            try:
                for zipcode, feelings in data["data"]["submissions"].items():
                    node = {
                        "zipcode": zipcode,
                        "feelings": feelings,
                        "timestamp_created": timestamp_created,
                        "country": country,
                    }
                    summary_location, summary_clinical = self.parse_node(node)
                    summary_location_submitter_id = summary_location[
                        "submitter_id"]
                    if summary_location_submitter_id not in summary_location_list:
                        self.summary_locations.append(summary_location)
                        summary_location_list.append(
                            summary_location_submitter_id)
                    self.summary_clinicals.append(summary_clinical)
            except ValueError as e:
                print(f"ERROR: value error. Detail {e}")

    def parse_node(self, node):
        """
        Converts an element of an JSON file to data we can submit via Sheepdog

        Args:
            node (dict): node data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
        """
        zipcode = node["zipcode"]
        feelings = node["feelings"]
        timestamp_created = node["timestamp_created"]
        country = node["country"]
        summary_location_submitter_id = format_location_submitter_id(
            country, zipcode)
        summary_location = {
            "country_region": country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
            "zipcode": zipcode,
        }

        date = datetime.strptime(timestamp_created, "%Y-%m-%dT%H:%M:%S").date()
        date = date.strftime("%Y-%m-%d")
        summary_clinical_submitter_id = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date)

        summary_clinical = {
            "date": date,
            "timestamp_created": timestamp_created,
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        map_fields = {
            1: "feeling_healthy_count",
            2: "feeling_not_so_good_count",
            3: "feeling_sick_count",
        }

        for element in feelings:
            summary_clinical[map_fields[element["feeling"]]] = element["count"]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Converts the data in `self.time_series_data` to Sheepdog records.
        `self.location_data already contains Sheepdog records. Batch submits
        all records in `self.location_data` and `self.time_series_data`
        """

        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for rep in self.summary_clinicals:
            rep_record = {"type": "summary_clinical"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
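For reference, parse_file() above relies only on a handful of fields from the covidstoplight response; the payload shape it expects looks roughly like this (values are illustrative, inferred from the fields accessed in the code):

# Illustrative payload shape consumed by parse_file()/parse_node() above:
example_payload = {
    "country_code": "US",
    "data": {
        "generated": "2020-05-01T12:00:00",  # parsed with "%Y-%m-%dT%H:%M:%S"
        "submissions": {
            "60637": [
                {"feeling": 1, "count": 12},  # -> feeling_healthy_count
                {"feeling": 2, "count": 3},   # -> feeling_not_so_good_count
                {"feeling": 3, "count": 1},   # -> feeling_sick_count
            ],
        },
    },
}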
Example #12
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []

        self.program_name = "open"
        self.project_code = "OWID"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        testing_fields = [
            ("ISO code", ("summary_location", "iso3", str)),
            ("Entity", (None, None, split_entity)),
            ("Date", ("summary_clinical", "date", str)),
            ("Source URL", ("summary_clinical", "source_url", str)),
            ("Source label", ("summary_clinical", "source_label", str)),
            ("Notes", ("summary_clinical", "notes", str)),
            ("Number of observations", ("summary_clinical", "num_observations",
                                        int)),
            ("Cumulative total", ("summary_clinical", "testing", int)),
            (
                "Cumulative total per thousand",
                ("summary_clinical", "cumulative_total_per_thousand", int),
            ),
            (
                "Daily change in cumulative total",
                ("summary_clinical", "daily_change_in_cumulative_total", int),
            ),
            (
                "Daily change in cumulative total per thousand",
                (
                    "summary_clinical",
                    "daily_change_in_cumulative_total_per_thousand",
                    int,
                ),
            ),
            (
                "7-day smoothed daily change",
                ("summary_clinical", "seven_day_smoothed_daily_change", int),
            ),
            (
                "7-day smoothed daily change per thousand",
                (
                    "summary_clinical",
                    "seven_day_smoothed_daily_change_per_thousand",
                    float,
                ),
            ),
            ("Short-term positive rate", (None, None, None)),
            ("Short-term tests per case", (None, None, None)),
            ("General source label", ("summary_clinical",
                                      "general_source_label", str)),
            ("General source URL", ("summary_clinical", "general_source_url",
                                    str)),
            ("Short description", ("summary_clinical", "short_description",
                                   str)),
            ("Detailed description", ("summary_clinical",
                                      "detailed_description", str)),
        ]

        self.headers_mapping = {
            field: (k, mapping)
            for k, (field, mapping) in enumerate(testing_fields)
        }
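The comprehension above keys the mapping by CSV column name and pairs each entry with its column index; written out, the first few entries of self.headers_mapping are:

# Equivalent literal for the first entries of self.headers_mapping
# (the remaining entries follow the same pattern):
headers_mapping_head = {
    "ISO code": (0, ("summary_location", "iso3", str)),
    "Entity": (1, (None, None, split_entity)),
    "Date": (2, ("summary_clinical", "date", str)),
}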
Example #13
class IDPH_FACILITY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Facility"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.summary_locations = {}
        self.summary_clinicals = {}

    def files_to_submissions(self):
        """
        Reads JSON file and convert the data to Sheepdog records
        """

        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print(
                "Nothing to submit: today and latest submitted date are the same."
            )
            return
        today_str = today.strftime("%Y%m%d")

        print(f"Getting data for date: {today_str}")
        url = "https://dph.illinois.gov/sitefiles/COVIDLTC.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON files to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the date of latest available "summary_clinical" for project
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                    "%Y-%m-%d"):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            if "LTC_Reported_Cases" in data:
                summary_location_submitter_id = format_submitter_id(
                    "summary_location", {
                        "country": self.country,
                        "state": self.state
                    })

                summary_location = {
                    "country_region": self.country,
                    "submitter_id": summary_location_submitter_id,
                    "projects": [{
                        "code": self.project_code
                    }],
                    "province_state": self.state,
                }

                summary_clinical_submitter_id = derived_submitter_id(
                    summary_location_submitter_id,
                    "summary_location",
                    "summary_clinical",
                    {"date": date},
                )
                summary_clinical = {
                    "confirmed": data["LTC_Reported_Cases"]["confirmed_cases"],
                    "deaths": data["LTC_Reported_Cases"]["deaths"],
                    "submitter_id": summary_clinical_submitter_id,
                    "lastUpdateEt": date,
                    "date": date,
                    "summary_locations": [
                        {"submitter_id": summary_location_submitter_id}
                    ],
                }
                self.summary_locations[
                    summary_location_submitter_id] = summary_location
                self.summary_clinicals[
                    summary_clinical_submitter_id] = summary_clinical

            for facility in data["FacilityValues"]:
                (summary_location,
                 summary_clinical) = self.parse_facility(date, facility)
                summary_location_submitter_id = summary_location[
                    "submitter_id"]
                summary_clinical_submitter_id = summary_clinical[
                    "submitter_id"]

                self.summary_locations[
                    summary_location_submitter_id] = summary_location

                if summary_clinical_submitter_id in self.summary_clinicals:
                    existed = self.summary_clinicals[
                        summary_clinical_submitter_id]
                    summary_clinical["confirmed"] = max(
                        summary_clinical["confirmed"], existed["confirmed"])
                    summary_clinical["deaths"] = max(
                        summary_clinical["deaths"], existed["deaths"])

                self.summary_clinicals[
                    summary_clinical_submitter_id] = summary_clinical

    def parse_facility(self, date, facility):
        """
        From county-level data, generate the data we can submit via Sheepdog
        """
        county = facility["County"]
        facility_name = facility["FacilityName"]
        confirmed_cases = facility["confirmed_cases"]
        deaths = facility["deaths"]
        status = facility.get("status", None)

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "country": self.country,
                "state": self.state,
                "facility_name": facility_name,
                "reporting_org_status": status,
            },
        )

        summary_location = {
            "country_region": self.country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
            "province_state": self.state,
            "county": county,
            "reporting_org": facility_name,
            "reporting_org_status": status,
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        summary_clinical = {
            "confirmed": confirmed_cases,
            "deaths": deaths,
            "submitter_id": summary_clinical_submitter_id,
            "lastUpdateEt": date,
            "date": date,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations.values():
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals.values():
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
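parse_file() and parse_facility() above touch only a few fields of the IDPH response; its expected shape is roughly the following (values are illustrative, and the LastUpdateDate format is whatever idph_get_date() accepts):

# Illustrative payload shape consumed by parse_file() above:
example_payload = {
    "LastUpdateDate": ...,  # parsed by idph_get_date(); format not shown here
    "LTC_Reported_Cases": {"confirmed_cases": 23000, "deaths": 4000},
    "FacilityValues": [
        {
            "County": "Cook",
            "FacilityName": "Example Facility",  # illustrative
            "confirmed_cases": 25,
            "deaths": 2,
            "status": "Open",  # optional, read with .get()
        },
    ],
}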
Example #14
class DS4C(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "DS4C"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.subjects = []
        self.demographics = []
        self.observations = []

    def files_to_submissions(self):
        with open(
            os.path.join(CURRENT_DIR, "data/ds4c_PatientInfo.csv"), newline=""
        ) as csvfile:
            reader = csv.reader(csvfile, delimiter=",")
            header = next(reader)
            print("Headers:", header)
            header = {k: v for v, k in enumerate(header)}
            n_1200012238 = 1

            for row in reader:
                patient_id = row[header["patient_id"]].strip()
                if patient_id == "1200012238":
                    # there are 2 rows for the same ID
                    patient_id = f"{patient_id}_{n_1200012238}"
                    n_1200012238 += 1

                # generate subject record
                subject = {
                    "submitter_id": patient_id,
                    "projects": [{"code": self.project_code}],
                }

                confirmed_date = row[header["confirmed_date"]].strip()
                if confirmed_date:
                    check_date_format(confirmed_date)
                    subject["date_confirmation"] = confirmed_date
                    subject["covid_19_status"] = "Positive"

                infected_by = row[header["infected_by"]].strip()
                if infected_by:
                    subject["infected_by"] = list(
                        map(lambda v: v.strip(), infected_by.split(","))
                    )

                deceased_date = row[header["deceased_date"]].strip()
                if deceased_date:
                    check_date_format(deceased_date)
                    subject["deceased_date"] = deceased_date

                # generate demographic record
                demographic = {
                    "submitter_id": f"demographic_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                    "age_decade": row[header["age"]].strip(),
                    "province_state": row[header["province"]].strip(),
                    "city": row[header["city"]].strip(),
                }

                country = row[header["country"]].strip()
                if country == "Korea":
                    demographic["country_region"] = "South Korea"
                elif country == "United States":
                    demographic["country_region"] = "USA"
                else:
                    demographic["country_region"] = country

                gender = row[header["sex"]].strip()
                demographic["gender"] = harmonize_gender(gender)

                demographic["year_of_birth"] = None

                # generate observation record
                observation = {
                    "submitter_id": f"observation_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                    "exposure": row[header["infection_case"]].strip(),
                }
                date_onset_symptoms = row[header["symptom_onset_date"]].strip()
                if date_onset_symptoms:
                    check_date_format(row[header["symptom_onset_date"]])
                    observation["date_onset_symptoms"] = date_onset_symptoms

                state = row[header["state"]].strip()
                if state == "deceased":
                    subject["vital_status"] = "Dead"
                elif state == "isolated":
                    observation["isolation_status"] = "Isolated"
                elif state == "released":
                    observation["treatment_status"] = "Released"
                elif state:
                    raise Exception('State "{}" is unknown'.format(state))

                released_date = row[header["released_date"]].strip()
                if released_date:
                    check_date_format(released_date)
                    observation["released_date"] = released_date

                subject = {k: v if v else None for k, v in subject.items()}
                self.subjects.append(subject)

                demographic = {k: v for k, v in demographic.items() if v}
                self.demographics.append(demographic)

                observation = {k: v for k, v in observation.items() if v}
                self.observations.append(observation)

    def submit_metadata(self):
        print("Submitting data")
        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
Example #15
class IDPH_ZIPCODE(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-zipcode"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.summary_locations = []
        self.summary_clinicals = []

    def files_to_submissions(self):
        """
        Reads JSON file and convert the data to Sheepdog records
        """

        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print(
                "Nothing to submit: today and latest submitted date are the same."
            )
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")
        url = "http://dph.illinois.gov/sitefiles/COVIDZip.json?nocache=1"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON files to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): date for latest submitted date
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                    "%Y-%m-%d"):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            for zipcode_values in data["zip_values"]:
                (summary_location,
                 summary_clinical) = self.parse_zipcode(date, zipcode_values)

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_zipcode(self, date, zipcode_values):
        """
        From county-level data, generate the data we can submit via Sheepdog
        """
        zipcode = zipcode_values["zip"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "country": self.country,
                "state": self.state,
                "zipcode": zipcode
            },
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "country_region": self.country,
            "province_state": self.state,
            "zipcode": zipcode,
            "projects": [{
                "code": self.project_code
            }],
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": zipcode_values["confirmed_cases"],
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        if "demographics" in zipcode_values:
            demographic = zipcode_values["demographics"]

            for k, v in fields_mapping.items():
                field, mapping = v
                demographic_group = demographic[k]

                for item in demographic_group:
                    dst_field = mapping[item[field]]
                    if dst_field:
                        if "count" in item:
                            age_group_count_field = "{}_{}".format(
                                mapping[item[field]], "count")
                            summary_clinical[age_group_count_field] = item[
                                "count"]
                        if "tested" in item:
                            age_group_tested_field = "{}_{}".format(
                                mapping[item[field]], "tested")
                            summary_clinical[age_group_tested_field] = item[
                                "tested"]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and `self.summary_clinicals` to Sheepdog.
        """
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
Example #16
class DSCI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "DSCI"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.subjects = []
        self.demographics = []
        self.observations = []

    def files_to_submissions(self):
        with open(
            os.path.join(CURRENT_DIR, "data/dsci_patient.csv"), newline=""
        ) as csvfile:
            reader = csv.reader(csvfile, delimiter=",", quotechar="|")
            header = next(reader)
            print("Headers:", header)
            header = {k: v for v, k in enumerate(header)}

            for row in reader:
                patient_id = row[header["patient_id"]].strip()

                # generate subject record
                subject = {
                    "submitter_id": patient_id,
                    "projects": [{"code": self.project_code}],
                }

                infected_by = row[header["contacted_with"]].strip()
                if infected_by:
                    subject["infected_by"] = list(
                        map(lambda v: v.strip(), infected_by.split(","))
                    )

                confirmed_date = row[header["confirmed_date"]].strip()
                if confirmed_date:
                    confirmed_date = format_date(confirmed_date)
                    check_date_format(confirmed_date)
                    subject["date_confirmation"] = confirmed_date
                    subject["covid_19_status"] = "Positive"

                deceased_date = row[header["deceased_date"]].strip()
                if deceased_date:
                    deceased_date = format_date(deceased_date)
                    check_date_format(deceased_date)
                    subject["deceased_date"] = deceased_date

                # generate demographic record
                demographic = {
                    "submitter_id": f"demographic_{patient_id}",
                    "subjects": {"submitter_id": f"{patient_id}"},
                }

                cols = {"age": "age", "province": "province_state"}

                for k, v in cols.items():
                    value = row[header[k]].strip()
                    if value:
                        demographic[v] = value

                if "age" in demographic:
                    demographic["age"] = int(demographic["age"])

                gender = row[header["gender"]].strip()
                demographic["gender"] = harmonize_gender(gender)

                nationality = row[header["nationality"]].strip()
                if nationality == "indonesia":
                    demographic["country_region"] = "Indonesia"
                elif nationality == "foreigner":
                    pass
                elif nationality:
                    raise Exception('Nationality "{}" is unknown'.format(nationality))

                # generate observation record
                observation = {
                    "submitter_id": f"observation_{patient_id}",
                    "subjects": {"submitter_id": f"{patient_id}"},
                }

                hospital = row[header["hospital"]].strip()
                if hospital:
                    observation["hospital"] = hospital

                state = row[header["current_state"]].strip()
                if state == "deceased":
                    subject["vital_status"] = "Dead"
                elif state == "isolated":
                    observation["isolation_status"] = "Isolated"
                elif state == "released":
                    observation["treatment_status"] = "Released"
                elif state:
                    raise Exception('State "{}" is unknown'.format(state))

                released_date = row[header["released_date"]].strip()
                if released_date:
                    released_date = format_date(released_date)
                    check_date_format(released_date)
                    observation["released_date"] = released_date

                self.subjects.append(subject)
                self.demographics.append(demographic)
                self.observations.append(observation)

    def submit_metadata(self):
        print("Submitting data")
        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
Example #17
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.summary_socio_demographics = []

        self.program_name = "open"
        self.project_code = "CCMap"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        county_fields = [
            ("fips_code", ("summary_location", "FIPS", int)),
            ("State", ("summary_location", "province_state", str)),
            ("County Name", ("summary_location", "county", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds",
                                  int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds",
                                  int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds",
                                   int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            ("Population (20+)", ("summary_clinical", "population_gtr_20",
                                  int)),
            ("Population (65+)", ("summary_clinical", "population_gtr_65",
                                  int)),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65",
                 float),
            ),
        ]

        state_fields = [
            ("State", ("summary_location", None, int)),
            ("State Name", ("summary_location", "province_state", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds",
                                  int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds",
                                  int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds",
                                   int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            (
                "Population (20+)",
                ("summary_socio_demographic", "population_gtr_20", int),
            ),
            (
                "Population (65+)",
                ("summary_socio_demographic", "population_gtr_65", int),
            ),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical", "estimated_full_mech_ventilators", int),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators per 100,000 Population (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_ventilators_per_100000",
                    float,
                ),
            ),
            (
                "Estimated No. Pediatrics-Capable Full-Feature Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical",
                 "estimated_full_mech_pediatric_ventilators", int),
            ),
            (
                "Estimated No. Full-Feature Mechanical Ventilators, Pediatrics Capable per 100,000 Population <14 y (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_pediatric_ventilators_per_100000",
                    float,
                ),
            ),
        ]

        self.headers_mapping = {
            "county": {field: mapping
                       for field, mapping in county_fields},
            "state": {field: mapping
                      for field, mapping in state_fields},
        }
Example #18
0
class CCMAP(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.summary_socio_demographics = []

        self.program_name = "open"
        self.project_code = "CCMap"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        county_fields = [
            ("fips_code", ("summary_location", "FIPS", int)),
            ("State", ("summary_location", "province_state", str)),
            ("County Name", ("summary_location", "county", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds",
                                  int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds",
                                  int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds",
                                   int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            ("Population (20+)", ("summary_clinical", "population_gtr_20",
                                  int)),
            ("Population (65+)", ("summary_clinical", "population_gtr_65",
                                  int)),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65",
                 float),
            ),
        ]

        state_fields = [
            ("State", ("summary_location", None, int)),
            ("State Name", ("summary_location", "province_state", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds",
                                  int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds",
                                  int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds",
                                   int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            (
                "Population (20+)",
                ("summary_socio_demographic", "population_gtr_20", int),
            ),
            (
                "Population (65+)",
                ("summary_socio_demographic", "population_gtr_65", int),
            ),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20",
                 float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65",
                 float),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical", "estimated_full_mech_ventilators", int),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators per 100,000 Population (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_ventilators_per_100000",
                    float,
                ),
            ),
            (
                "Estimated No. Pediatrics-Capable Full-Feature Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical",
                 "estimated_full_mech_pediatric_ventilators", int),
            ),
            (
                "Estimated No. Full-Feature Mechanical Ventilators, Pediatrics Capable per 100,000 Population <14 y (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_pediatric_ventilators_per_100000",
                    float,
                ),
            ),
        ]

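        # group the field tuples by CSV type; parse_file later rewrites each
        # value in place as (column index, (node type, node field, type))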
        self.headers_mapping = {
            "county": {field: mapping
                       for field, mapping in county_fields},
            "state": {field: mapping
                      for field, mapping in state_fields},
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        repo = "covidcaremap/covid19-healthsystemcapacity"
        branch = "master"
        files = {
            "county":
            "data/published/us_healthcare_capacity-county-CovidCareMap.csv",
            "state":
            "data/published/us_healthcare_capacity-state-CovidCareMap.csv",
        }

        for k, url in files.items():
            self.parse_file(repo, branch, url, csv_type=k)

    def get_last_update_date_file(self, repo, url):
        """
        Gets the latest update time for a specific file in the repository

        :param repo: "user/repository" for the GitHub repository
        :param url: path to file
        :return: last update (commit) datetime for the file
        """
        api_url = "https://api.github.com/repos"
        commit_info_url = "{}/{}/commits?path={}&page=1&per_page=1".format(
            api_url, repo, url)

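        # the GitHub commits API returns commits newest-first, so with
        # per_page=1 the first element is the latest commit that touched the file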
        with closing(requests.get(commit_info_url, stream=True)) as r:
            commit_info = r.json()
            last_update_date = commit_info[0]["commit"]["committer"]["date"]

        return datetime.datetime.strptime(last_update_date,
                                          "%Y-%m-%dT%H:%M:%SZ")

    def parse_file(self, repo, branch, file_url, csv_type):
        last_update_date = self.get_last_update_date_file(repo, file_url)

        raw_url = "https://raw.githubusercontent.com"
        url = "{}/{}/{}/{}".format(raw_url, repo, branch, file_url)

        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "  Unable to get file contents, received {}.".format(headers)

            expected_h = list(self.headers_mapping[csv_type].keys())
            assert (
                set(expected_h).issubset(set(headers))
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, headers)

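            # rewrite each mapping entry in place as
            # (column index, (node type, node field, type)) so that
            # parse_row can read values by CSV column position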
            for i, header in enumerate(headers):
                if header in self.headers_mapping[csv_type]:
                    old_value = self.headers_mapping[csv_type][header]
                    self.headers_mapping[csv_type][header] = (i, old_value)

            for row in reader:
                (
                    summary_location,
                    summary_clinical,
                    summary_socio_demographic,
                ) = self.parse_row(row, self.headers_mapping[csv_type],
                                   last_update_date)

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)
                self.summary_socio_demographics.append(
                    summary_socio_demographic)

    def parse_row(self, row, mapping, last_update_date):
        summary_location = {"country_region": "US"}
        summary_clinical = {}
        summary_socio_demographic = {}

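        # mapping values are (column index, (node type, node field, type
        # converter)) tuples, as rewritten in parse_file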
        for k, (i, (node_type, node_field, type_conv)) in mapping.items():
            try:
                if node_field:
                    value = row[i]
                    if value:
                        if node_type == "summary_location":
                            summary_location[node_field] = type_conv(value)
                        if node_type == "summary_clinical":
                            if type_conv == int:
                                summary_clinical[node_field] = type_conv(
                                    float(value))
                            else:
                                summary_clinical[node_field] = type_conv(value)
                        if node_type == "summary_socio_demographic":
                            if type_conv == int:
                                summary_socio_demographic[
                                    node_field] = type_conv(float(value))
                            else:
                                summary_socio_demographic[
                                    node_field] = type_conv(value)
                            summary_clinical[
                                node_field] = None  # TODO: remove when the properties are removed from dictionary
            except Exception as ex:
                print("Error with field: {}, problematic value: {}, error: {}".format(
                    node_field, row[i], ex))

        summary_location_submitter_id = format_location_submitter_id(
            summary_location)

        summary_location["submitter_id"] = summary_location_submitter_id
        summary_location["projects"] = [{"code": self.project_code}]

        state = summary_location["province_state"]
        if len(state) == 2:
            summary_location["province_state"] = state_to_long(state)

        summary_clinical[
            "submitter_id"] = format_summary_clinical_submitter_id(
                summary_location_submitter_id,
                date=last_update_date.strftime("%Y-%m-%d"))
        summary_clinical["summary_locations"] = [{
            "submitter_id":
            summary_location_submitter_id
        }]

        summary_socio_demographic[
            "submitter_id"] = format_summary_socio_demographic_id(
                summary_location_submitter_id,
                date=last_update_date.strftime("%Y-%m-%d"))
        summary_socio_demographic["summary_locations"] = [{
            "submitter_id":
            summary_location_submitter_id
        }]

        return summary_location, summary_clinical, summary_socio_demographic

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_socio_demographic data")
        for sc in self.summary_socio_demographics:
            sc_record = {"type": "summary_socio_demographic"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
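
For reference, a minimal sketch of how an ETL class like this is typically driven; the endpoint, token, and bucket values below are placeholders, and the actual runner is not shown in this listing:

# hypothetical driver; all argument values below are placeholders
etl = CCMAP(
    base_url="https://example-data-commons.org",
    access_token="<access-token>",
    s3_bucket="<bucket-name>",
)
etl.files_to_submissions()  # download the CSVs and build the Sheepdog records
etl.submit_metadata()       # batch-submit the records via MetadataHelper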
Example #19
0
class DSFSI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.subjects = []
        self.demographics = []
        self.observations = []

        self.program_name = "open"
        self.project_code = "DSFSI"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        # columns mapped to (None, None, None) are ignored;
        # a field type of None is used to remove (clear) the value (see parse_row)
        self.countries_fields = [
            ("case_id", ("subject", "submitter_id", str)),
            ("origin_case_id", (None, None, None)),
            ("date", ("observation", "reporting_date", normalize_date)),
            ("age", ("demographic", "age", normalize_age)),
            ("gender", ("demographic", "gender", normalize_gender)),
            ("city", ("demographic", "city", str)),
            ("province/state", ("demographic", "province_state", str)),
            ("country", ("demographic", "country_region", str)),
            (
                "current_status",
                ("subject", "tmp_current_status", normalize_current_status),
            ),
            (
                "source",
                ("observation", "reporting_source_url", str),
            ),
            ("symptoms", ("observation", "symptoms", normalize_symptoms)),
            (
                "date_onset_symptoms",
                ("observation", "date_onset_symptoms", normalize_date),
            ),
            (
                "date_admission_hospital",
                ("observation", "date_admission_hospital", normalize_date),
            ),
            ("date_confirmation", ("subject", "date_confirmation", normalize_date)),
            ("underlying_conditions", (None, None, None)),
            ("travel_history_dates", ("subject", "travel_history_dates", str)),
            ("travel_history_location", ("subject", "travel_history_location", str)),
            ("death_date", ("subject", "deceased_date", normalize_date)),
            ("notes_for_discussion", (None, None, None)),
        ]

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        urls = {
            "Algeria": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-algeria.csv",
            "Angola": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-angola.csv",
            "Benin": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-benin.csv",
            "Burkina Faso": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-burkina-faso.csv",
            "Cabo Verde": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cabo-verde.csv",
            "Cameroon": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cameroon.csv",
            "Central African Republic": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-central-african-republic.csv",
            "Chad": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-chad.csv",
            "Côte d'Ivoire": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cote-divoire.csv",
            "Democratic Republic of the Congo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-democratic-republic-of-the-congo.csv",
            "Djibouti": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-djibouti.csv",
            # the Egypt dataset would go here, but it is not useful and is omitted on purpose
            "Equatorial Guinea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-equatorial-guinea.csv",
            "Eritrea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-eritrea.csv",
            "Eswatini": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-eswatini.csv",
            "Ethiopia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-ethiopia.csv",
            "Gabon": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-gabon.csv",
            "Gambia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-gambia.csv",
            "Ghana": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-ghana.csv",
            "Guinea Bissau": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-guinea-bissau.csv",
            "Guinea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-guinea.csv",
            "Kenya": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-kenya.csv",
            "Liberia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-liberia.csv",
            "Madagascar": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-madagascar.csv",
            "Mali": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mali.csv",
            "Mauritania": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mauritania.csv",
            "Mauritius": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mauritius.csv",
            "Mozambique": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mozambique.csv",
            "Namibia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-namibia.csv",
            "Niger": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-niger.csv",
            "Nigeria": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-nigeria.csv",
            "Republic of Congo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-republic-of-congo.csv",
            "Rwanda": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-rwanda.csv",
            "Senegal": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-senegal.csv",
            "Seychelles": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-seychelles.csv",
            "Somalia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-somalia.csv",
            "South Africa": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-south-africa.csv",
            "Sudan": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-sudan.csv",
            "Tanzania": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-tanzania.csv",
            "Togo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-togo.csv",
            "Uganda": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-uganda.csv",
            "Zambia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-zambia.csv",
            "Zimbabwe": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-zimbabwe.csv",
        }

        for k, url in urls.items():
            self.parse_file(k, url)

    def parse_file(self, country, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "  Unable to get file contents, received {}.".format(headers)

            countries_with_empty_columns = [
                "Angola",
                "Burkina Faso",
                "Cabo Verde",
                "Cameroon",
                "Central African Republic",
                "Chad",
                "Côte d'Ivoire",
                "Democratic Republic of the Congo",
                "Djibouti",
                "Equatorial Guinea",
                "Eritrea",
                "Eswatini",
                "Gabon",
                "Guinea Bissau",
                "Guinea",
                "Liberia",
                "Madagascar",
                "Mali",
                "Mauritania",
                "Mauritius",
                "Mozambique",
                "Republic of Congo",
                "Senegal",
                "Seychelles",
                "Somalia",
                "Sudan",
                "Tanzania",
                "Togo",
                "Uganda",
                "Zambia",
            ]

            countries_with_mistyped_column = ["South Africa"]

            countries_without_notes = [
                "Eritrea",
                "Eswatini",
                "Gabon",
                "Madagascar",
                "Mali",
                "Mauritania",
                "Mauritius",
                "Mozambique",
                "Republic of Congo",
                "Senegal",
                "Seychelles",
                "Somalia",
                "Sudan",
                "Tanzania",
                "Togo",
                "Uganda",
                "Zambia",
            ]

            # Almost every country's CSV has some formatting quirk,
            # so the field mapping is adjusted below for each known variation
            tmp = copy.deepcopy(self.countries_fields)
            if country in countries_with_empty_columns:
                tmp.insert(0, ("", (None, None, None)))

            if country in countries_with_mistyped_column:
                tmp[14] = ("underlyng_conditions", (None, None, None))

            if country in countries_without_notes:
                del tmp[-1]

            if country == "Ethiopia":
                tmp.insert(8, ("original_status", (None, None, None)))
                del tmp[10]
                tmp.insert(14, ("closed_date", (None, None, None)))
                tmp.insert(16, ("quarantine_status", (None, None, None)))
                del tmp[19]
                tmp.insert(19, ("contact", (None, None, None)))
                tmp.append(("source", (None, None, None)))

            if country == "Niger":
                del tmp[9]
                tmp.insert(9, ("source 1", (None, None, None)))
                tmp.insert(10, ("source 2", (None, None, None)))

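            # build {csv field name: (column index, (node type, node field,
            # type converter))} from the adjusted field list so rows can be
            # read by column position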
            updated_headers_mapping = {
                field: (k, mapping) for k, (field, mapping) in enumerate(tmp)
            }
            expected_h = list(updated_headers_mapping.keys())
            obtained_h = headers[: len(expected_h)]
            obtained_h = [header.strip() for header in obtained_h]

            assert (
                obtained_h == expected_h
            ), "CSV headers have changed\nexpected: {}\n     got: {})".format(
                expected_h, obtained_h
            )

            # the South Africa dataset only has 274 usable rows; everything
            # after that repeats the same data and carries no meaningful information
            idx = 0
            last = None
            if country == "South Africa":
                last = 275

            for row in reader:
                idx += 1
                if last and idx == last:
                    break

                subject, demographic, observation = self.parse_row(
                    country, row, updated_headers_mapping
                )

                self.subjects.append(subject)
                self.demographics.append(demographic)
                self.observations.append(observation)

    def parse_row(self, country, row, mapping):
        subject = {}
        demographic = {}
        observation = {}

        for (i, (node_type, node_field, type_conv)) in mapping.values():
            if node_field:
                value = row[i]
                if value:
                    if node_type == "subject":
                        if type_conv is None:
                            subject[node_field] = None
                            continue
                        subject[node_field] = type_conv(value)
                    if node_type == "demographic":
                        if type_conv is None:
                            demographic[node_field] = None
                            continue
                        demographic[node_field] = type_conv(value)

        # init subject node
        case_id = subject["submitter_id"]
        subject["submitter_id"] = format_subject_submitter_id(
            country, subject["submitter_id"]
        )
        subject["projects"] = [{"code": self.project_code}]

        # only the South Africa dataset has records sharing the same case_id;
        # because this code processes rows individually, the disambiguation is hard-coded here
        if country == "South Africa" and case_id == "110":
            if demographic["age"] == 34:
                subject["submitter_id"] += "_1"
            elif demographic["age"] == 27:
                subject["submitter_id"] += "_2"

        # init demographic node
        demographic["submitter_id"] = format_node_submitter_id(
            subject["submitter_id"], "demographic"
        )
        demographic["subjects"] = [{"submitter_id": subject["submitter_id"]}]

        # init observation node
        observation["submitter_id"] = format_node_submitter_id(
            subject["submitter_id"], "observation"
        )
        observation["subjects"] = [{"submitter_id": subject["submitter_id"]}]

        if subject.get("date_confirmation"):
            subject["covid_19_status"] = "Positive"

        state = subject.get("tmp_current_status")
        if "tmp_current_status" in subject:
            del subject["tmp_current_status"]
        if state == "deceased":
            subject["vital_status"] = "Dead"
        elif state in ["alive"]:
            subject["vital_status"] = state.capitalize()
        elif state in ["positive"]:
            subject["covid_19_status"] = state.capitalize()
        elif state == "isolated":
            observation["isolation_status"] = state.capitalize()
        elif state in ["released", "recovered", "in recovery", "in treatment"]:
            observation["treatment_status"] = state.capitalize()
        elif state in ["stable", "unstable", "critical"]:
            observation["condition"] = state.capitalize()
        elif state:
            raise Exception('State "{}" is unknown'.format(state))

        if "travel_history_dates" in subject:
            date_list = normalize_date_list(subject["travel_history_dates"])
            if date_list:
                subject["travel_history_dates"] = date_list
            else:
                del subject["travel_history_dates"]

        if "travel_history_location" in subject:
            loc_list = normalize_location_list(subject["travel_history_location"])
            if loc_list:
                subject["travel_history_location"] = loc_list
            else:
                del subject["travel_history_location"]

        return subject, demographic, observation

    def submit_metadata(self):
        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
Example #20
0
class IDPH(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.county_dict = {}
        self.il_counties()

        self.summary_locations = []
        self.summary_clinicals = []

    def get_location_and_clinical_submitter_id(self, county, date):
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "county": county}
            if county is not None
            else {"country": self.country, "state": self.state},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        return summary_location_submitter_id, summary_clinical_submitter_id

    def il_counties(self):
        with open(
            os.path.join(CURRENT_DIR, "data/IL_counties_central_coords_lat_long.tsv")
        ) as f:
            counties = f.readlines()
            counties = counties[1:]
            counties = map(lambda l: l.strip().split("\t"), counties)

        for county, lat, lon in counties:
            self.county_dict[county] = {"lat": lat, "lon": lon}

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records.
        """
        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")

        # they changed the URL on April 1, 2020
        if today > datetime.date(2020, 3, 31):
            url = "http://www.dph.illinois.gov/sitefiles/COVIDTestResults.json"
        else:
            url = f"https://www.dph.illinois.gov/sites/default/files/COVID19/COVID19CountyResults{today_str}.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the date of the most recent submission
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                "%Y-%m-%d"
            ):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            for county in data["characteristics_by_county"]["values"]:
                demographic = data.get("demographics", None)
                summary_location, summary_clinical = self.parse_county(
                    date, county, demographic
                )

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

            for illinois_data in data["state_testing_results"]["values"]:
                illinois_historic_data = self.parse_historical_data(illinois_data)
                self.summary_clinicals.append(illinois_historic_data)

    def parse_historical_data(self, illinois_data):
        """
        Parses historical state-level data. The corresponding "summary_location"
        node is created from the "characteristics_by_county" data.

        Args:
            illinois_data (dict): data JSON with "testDate", "total_tested",
                "confirmed_cases" and "deaths"

        Returns:
            dict: "summary_clinical" node for Sheepdog
        """
        county = "Illinois"

        date = datetime.datetime.strptime(
            illinois_data["testDate"], "%m/%d/%Y"
        ).strftime("%Y-%m-%d")

        (
            summary_location_submitter_id,
            summary_clinical_submitter_id,
        ) = self.get_location_and_clinical_submitter_id(county, date)

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": illinois_data["confirmed_cases"],
            "testing": illinois_data["total_tested"],
            "deaths": illinois_data["deaths"],
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        return summary_clinical

    def parse_county(self, date, county_json, demographic):
        """
        From county-level data, generate the data we can submit via Sheepdog

        Args:
            date (date): date
            county_json (dict): JSON for county statistics
            demographic (dict): statewide demographic breakdown (only used when
                the county is "Illinois")

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records
        """
        county = county_json["County"]

        (
            summary_location_submitter_id,
            summary_clinical_submitter_id,
        ) = self.get_location_and_clinical_submitter_id(county, date)

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "country_region": self.country,
            "province_state": self.state,
            "projects": [{"code": self.project_code}],
        }

        # IDPH uses "Illinois" in the "County" field for statewide aggregated data;
        # in Gen3 this corresponds to a location with "province_state" == "IL" and no "county" field
        if county != "Illinois":
            summary_location["county"] = county

        if county in self.county_dict:
            summary_location["latitude"] = self.county_dict[county]["lat"]
            summary_location["longitude"] = self.county_dict[county]["lon"]
        else:
            if county_json["lat"] != 0:
                summary_location["latitude"] = str(county_json["lat"])
            if county_json["lon"] != 0:
                summary_location["longitude"] = str(county_json["lon"])

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": county_json["confirmed_cases"],
            "testing": county_json["total_tested"],
            "deaths": county_json["deaths"],
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        if "negative" in county_json:
            summary_clinical["negative"] = county_json["negative"]

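        # fields_mapping (defined elsewhere in the module) is expected to map a
        # demographic category key to (item field name, {item value: gen3 field
        # prefix}); counts and tested totals are written below as
        # "<prefix>_count" / "<prefix>_tested"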
        if county == "Illinois" and demographic:
            for k, v in fields_mapping.items():
                field, mapping = v
                demographic_group = demographic[k]

                for item in demographic_group:
                    dst_field = mapping[item[field]]
                    if dst_field:
                        if "count" in item:
                            age_group_count_field = "{}_{}".format(
                                mapping[item[field]], "count"
                            )
                            summary_clinical[age_group_count_field] = item["count"]
                        if "tested" in item:
                            age_group_tested_field = "{}_{}".format(
                                mapping[item[field]], "tested"
                            )
                            summary_clinical[age_group_tested_field] = item["tested"]
        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and `self.summary_clinicals` to Sheepdog.
        """
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
Example #21
0
class COXRAY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "COXRAY"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.file_helper = FileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.nodes = {
            "core_metadata_collection": [],
            "study": [],
            "subject": [],
            "observation": [],
            "follow_up": [],
            "demographic": [],
            "imaging_file": [],
        }

    def files_to_submissions(self):
        with open(Path(COXRAY_DATA_PATH).joinpath("metadata.csv")) as f:
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)
            for row in reader:
                row_nodes = self.parse_row(headers, row)
                for k, v in row_nodes.items():
                    self.nodes[k].append(v)

    def parse_row(self, headers, row):
        cmc_submitter_id = format_submitter_id("cmc_coxray", {})
        subject_submitter_id = format_submitter_id(
            "subject_coxray", {"patientid": row[headers.index("patientid")]})
        observation_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "observation_coxray", {})
        follow_up_submitter_id = derived_submitter_id(
            subject_submitter_id,
            "subject_coxray",
            "follow_up_coxray",
            {"offset": row[headers.index("offset")]},
        )
        demographic_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "demographic_coxray", {})
        imaging_file_submitter_id = format_submitter_id(
            "imaging_file_coxray",
            {"filename": row[headers.index("filename")]})
        study_submitter_id = format_submitter_id(
            "study_coxray", {"doi": row[headers.index("doi")]})

        filename = row[headers.index("filename")]
        filename = Path(filename)
        filepath = Path(COXRAY_DATA_PATH).joinpath("images", filename)
        filepath_exist = filepath.exists()

        nodes = {
            "core_metadata_collection": {
                "submitter_id": cmc_submitter_id,
                "projects": [{
                    "code": self.project_code
                }],
            },
            "study": {
                "submitter_id": study_submitter_id,
                "projects": [{
                    "code": self.project_code
                }],
            },
            "subject": {
                "submitter_id": subject_submitter_id,
                "projects": [{
                    "code": self.project_code
                }],
                "studies": [{
                    "submitter_id": study_submitter_id
                }],
            },
            "observation": {
                "submitter_id": observation_submitter_id,
                "subjects": [{
                    "submitter_id": subject_submitter_id
                }],
            },
            "follow_up": {
                "submitter_id": follow_up_submitter_id,
                "subjects": [{
                    "submitter_id": subject_submitter_id
                }],
            },
            "demographic": {
                "submitter_id": demographic_submitter_id,
                "subjects": [{
                    "submitter_id": subject_submitter_id
                }],
            },
        }

        if filepath_exist:
            data_type = "".join(filename.suffixes)
            did, rev, md5sum, filesize = self.file_helper.find_by_name(
                filename=filename)
            assert (
                did
            ), f"file {filename} does not exist in the index, rerun COXRAY_FILE ETL"
            self.file_helper.update_authz(did=did, rev=rev)

            nodes["imaging_file"] = {
                "submitter_id": imaging_file_submitter_id,
                "subjects": [{
                    "submitter_id": subject_submitter_id
                }],
                "follow_ups": [{
                    "submitter_id": follow_up_submitter_id
                }],
                "core_metadata_collections": [{
                    "submitter_id": cmc_submitter_id
                }],
                "data_type": data_type,
                "data_format": "Image File",
                "data_category": "X-Ray Image",
                "file_size": filesize,
                "md5sum": md5sum,
                "object_id": did,
            }
        else:
            print(
                f"subject references a file that does not exist on disk: {filepath}"
            )

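        # fields_mapping (defined elsewhere in the module) maps a metadata.csv
        # column name to (node name, node field, optional converter)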
        for k, (node, field, converter) in fields_mapping.items():
            value = row[headers.index(k)]
            if node in nodes and value:
                if converter:
                    nodes[node][field] = converter(value)
                else:
                    nodes[node][field] = value

        return nodes

    def submit_metadata(self):
        print("Submitting data...")

        for k, v in self.nodes.items():
            seen_submitter_ids = set()
            print(f"Submitting {k} data...")
            for node in v:
                node_record = {"type": k}
                node_record.update(node)
                submitter_id = node_record["submitter_id"]
                if submitter_id not in seen_submitter_ids:
                    seen_submitter_ids.add(submitter_id)
                    self.metadata_helper.add_record_to_submit(node_record)
            self.metadata_helper.batch_submit_records()
Example #22
0
class COM_MOBILITY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "Com-Mobility"

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.expected_file_headers = [
            "country_region_code",
            "country_region",
            "sub_region_1",
            "sub_region_2",
            "metro_area",
            "iso_3166_2_code",
            "census_fips_code",
            "date",
            "retail_and_recreation_percent_change_from_baseline",
            "grocery_and_pharmacy_percent_change_from_baseline",
            "parks_percent_change_from_baseline",
            "transit_stations_percent_change_from_baseline",
            "workplaces_percent_change_from_baseline",
            "residential_percent_change_from_baseline",
        ]

        self.summary_locations = []
        self.summary_socio_demographics = []

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and
        `self.summary_socio_demographics`. Only US rows dated after the last
        recorded submission date are kept.

        Args:
            url (str): URL at which the CSV file is available
        """

        self.last_submission_date_time = self.metadata_helper.get_last_submission()
        latest_data_datetime = None

        print("Getting data from {}".format(url))

        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            assert set(self.expected_file_headers).issubset(
                set(headers)
            ), "CSV headers have changed (expected {} is a subset of {}). We may need to update the ETL code".format(
                self.expected_file_headers, headers
            )

            for row in reader:
                # ignore any empty row
                if not row:
                    continue

                row_dict = dict(zip(headers, row))
                if row_dict["country_region_code"] != "US":
                    continue

                if (
                    not self.last_submission_date_time
                    or parse(row_dict["date"]) > self.last_submission_date_time
                ):
                    if (
                        latest_data_datetime is None
                        or latest_data_datetime < parse(row_dict["date"])
                    ):
                        latest_data_datetime = parse(row_dict["date"])

                    summary_location = {}
                    summary_socio_demographic = {}

                    summary_location_submitter_id = format_submitter_id(
                        "summary_location",
                        row_dict["country_region_code"],
                        row_dict["sub_region_1"],
                        row_dict["sub_region_2"],
                        row_dict["metro_area"],
                        row_dict["date"],
                    )

                    summary_socio_demographic_submitter_id = format_submitter_id(
                        "summary_socio_demographic",
                        row_dict["country_region_code"],
                        row_dict["sub_region_1"],
                        row_dict["sub_region_2"],
                        row_dict["metro_area"],
                        row_dict["date"],
                    )

                    summary_location = {
                        "submitter_id": summary_location_submitter_id,
                        "projects": [{"code": self.project_code}],
                    }

                    summary_socio_demographic = {
                        "submitter_id": summary_socio_demographic_submitter_id,
                        "summary_locations": [
                            {"submitter_id": summary_location_submitter_id}
                        ],
                    }

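                    # SPECIAL_MAP_FIELDS (defined elsewhere in the module) maps
                    # each CSV column to (gen3 field name, conversion function)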
                    for field in [
                        "country_region_code",
                        "country_region",
                        "sub_region_1",
                        "sub_region_2",
                        "metro_area",
                        "iso_3166_2_code",
                        "census_fips_code",
                    ]:
                        gen3_field, func = SPECIAL_MAP_FIELDS[field]
                        summary_location[gen3_field] = func(row_dict[field])

                    for field in [
                        "retail_and_recreation_percent_change_from_baseline",
                        "grocery_and_pharmacy_percent_change_from_baseline",
                        "parks_percent_change_from_baseline",
                        "transit_stations_percent_change_from_baseline",
                        "workplaces_percent_change_from_baseline",
                        "residential_percent_change_from_baseline",
                        "date",
                    ]:
                        gen3_field, func = SPECIAL_MAP_FIELDS[field]
                        summary_socio_demographic[gen3_field] = func(row_dict[field])

                    self.summary_locations.append(summary_location)
                    self.summary_socio_demographics.append(summary_socio_demographic)
        if latest_data_datetime:
            self.last_submission_date_time = latest_data_datetime

    def submit_metadata(self):
        """
        Batch submits all records in `self.summary_locations` and
        `self.summary_socio_demographics` to Sheepdog, then records the date of
        the newest submitted row.
        """
        # Note: submitting summary_location records is only required for the
        # initial, one-time load of summary_location data
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_socio_demographic data")
        for sc in self.summary_socio_demographics:
            sc_record = {"type": "summary_socio_demographic"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
        self.metadata_helper.update_last_submission(
            self.last_submission_date_time.strftime("%Y-%m-%d")
        )
Example #23
0
class NCBI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.accession_number_filename_map = {}

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.file_helper = AsyncFileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.data_file = NCBI_FILE(
            base_url=self.base_url,
            s3_bucket=self.project_code,
            access_token=access_token,
        )

        self.submitting_data = {
            "sample": [],
            "virus_sequence": [],
            "core_metadata_collection": [],
            "virus_sequence_run_taxonomy": [],
            "virus_sequence_contig": [],
            "virus_sequence_blastn": [],
            "virus_sequence_contig_taxonomy": [],
            "virus_sequence_peptide": [],
            "virus_sequence_hmm_search": [],
        }

        self.submitting_data["core_metadata_collection"].append({
            "submitter_id":
            format_submitter_id("cmc_ncbi_covid19", {}),
            "projects": [{
                "code": self.project_code
            }],
        })

        read_ncbi_manifest(
            self.manifest_bucket,
            self.sra_src_manifest,
            self.accession_number_filename_map,
        )

    def submit_metadata(self):

        start = time.strftime("%X")
        loop = asyncio.get_event_loop()
        tasks = []

        for node_name, _ in self.data_file.nodes.items():
            if node_name == "virus_sequence_run_taxonomy":
                continue
            else:
                tasks.append(
                    asyncio.ensure_future(
                        self.files_to_node_submissions(node_name)))

        try:
            results = loop.run_until_complete(asyncio.gather(*tasks))
            loop.run_until_complete(
                asyncio.gather(
                    self.files_to_virus_sequence_run_taxonomy_submission(
                        results[0])))
            if AsyncFileHelper.session:
                loop.run_until_complete(
                    asyncio.gather(AsyncFileHelper.close_session()))
        finally:
            loop.close()
        end = time.strftime("%X")

        for k, v in self.submitting_data.items():
            print(f"Submitting {k} data...")
            for node in v:
                node_record = {"type": k}
                node_record.update(node)
                self.metadata_helper.add_record_to_submit(node_record)
            self.metadata_helper.batch_submit_records()

        print(f"Running time: From {start} to {end}")

    async def files_to_virus_sequence_run_taxonomy_submission(
            self, submitting_accession_numbers):
        """get submitting data for virus_sequence_run_taxonomy node"""

        if not submitting_accession_numbers:
            return

        records = self._get_response_from_big_query(
            submitting_accession_numbers)

        # Keep track of accession numbers that have a link to virus_sequence nodes
        accession_number_set = set()
        for record in records:
            if record["acc"] in self.accession_number_filename_map:
                accession_number = record["acc"]
                print(f"Get from bigquery response {accession_number}")
                success = await self._parse_big_query_response(record)
                if success:
                    accession_number_set.add(accession_number)

        cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {})
        for accession_number in submitting_accession_numbers:
            virus_sequence_run_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_run_taxonomy",
                {"accession_number": accession_number})
            submitted_json = {
                "submitter_id": virus_sequence_run_taxonomy_submitter_id,
                "core_metadata_collections": [{
                    "submitter_id": cmc_submitter_id
                }],
                "accession_number": accession_number,
                "data_type": "Virus Sequence Run Taxonomy Analysis",
                "data_format": "json",
                "data_category": "Kmer-based Taxonomy Analysis",
            }

            # Add link to virus sequence node
            if accession_number in accession_number_set:
                submitted_json["virus_sequences"] = [{
                    "submitter_id":
                    f"virus_sequence_{accession_number}"
                }]

            filename = f"virus_sequence_run_taxonomy_{accession_number}.csv"
            print(f"Get indexd info of {filename}")
            trying = True
            while trying:
                try:
                    (
                        did,
                        rev,
                        md5sum,
                        filesize,
                        file_name,
                        authz,
                    ) = await self.file_helper.async_find_by_name(
                        filename=filename)
                    trying = False
                except Exception as e:
                    print(
                        f"Cannot get indexd record of {filename}. Detail: {e}. Retrying..."
                    )
                    await asyncio.sleep(5)

            assert (
                did
            ), f"file {filename} does not exist in the index, rerun NCBI_FILE ETL"

            if not authz:
                tries = 0
                while tries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_update_authz(did=did,
                                                                  rev=rev)
                        break
                    except Exception as e:
                        tries += 1
                        print(
                            f"Cannot update indexd for {did}. Detail: {e}. Retrying..."
                        )
                        await asyncio.sleep(5)

            submitted_json["file_size"] = filesize
            submitted_json["md5sum"] = md5sum
            submitted_json["object_id"] = did
            submitted_json["file_name"] = file_name

            self.submitting_data["virus_sequence_run_taxonomy"].append(
                submitted_json)

    async def files_to_node_submissions(self, node_name):
        """Get submitting data for the node"""

        retrying = True
        while retrying:
            try:
                submitting_accession_numbers = (
                    await self.get_submitting_accession_number_list(node_name))
                retrying = False
            except Exception as e:
                print(
                    f"Cannot query Peregrine for {node_name}. Detail: {e}. Retrying ..."
                )
                await asyncio.sleep(5)

        for accession_number in submitting_accession_numbers:
            submitter_id = format_submitter_id(
                node_name, {"accession_number": accession_number})

            cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {})

            contig_submitter_id = format_submitter_id(
                "virus_sequence_contig",
                {"accession_number": accession_number})
            peptide_submitter_id = format_submitter_id(
                "virus_sequence_peptide",
                {"accession_number": accession_number})
            run_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_run_taxonomy",
                {"accession_number": accession_number})

            contig_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_contig_taxonomy",
                {"accession_number": accession_number})

            if node_name == "virus_sequence_contig":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequences_run_taxonomies": [{
                        "submitter_id":
                        run_taxonomy_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence Contig",
                    "data_format":
                    "json",
                    "data_category":
                    "Nucleotide Contig",
                }
            elif node_name == "virus_sequence_blastn":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence Blastn",
                    "data_format":
                    "tsv",
                    "data_category":
                    "Nucleotide Blast",
                }
            elif node_name == "virus_sequence_peptide":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Peptides Annotation Using VIGOR3",
                    "data_format":
                    "json",
                    "data_category":
                    "Peptides Annotation",
                }
            elif node_name == "virus_sequence_hmm_search":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_peptides": [{
                        "submitter_id":
                        peptide_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence HMM Search",
                    "data_format":
                    "json",
                    "data_category":
                    "HMMER Scab of Contigs",
                }
            elif node_name == "virus_sequence_contig_taxonomy":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Contig Taxonomy",
                    "data_format":
                    "json",
                    "data_category":
                    "Kmer-based Taxonomy Analysis of Contigs",
                }

            else:
                raise Exception(f"ERROR: {node_name} does not exist")

            ext = re.search("\.(.*)$",
                            self.data_file.nodes[node_name][0]).group(1)
            filename = f"{node_name}_{accession_number}.{ext}"

            print(f"Get indexd record of {filename}")

            retrying = True
            while retrying:
                try:
                    (
                        did,
                        rev,
                        md5sum,
                        filesize,
                        file_name,
                        authz,
                    ) = await self.file_helper.async_find_by_name(
                        filename=filename)
                    retrying = False
                except Exception as e:
                    print(
                        f"ERROR: Fail to query indexd for {filename}. Detail {e}. Retrying ..."
                    )
                    await asyncio.sleep(5)

            assert (
                did
            ), f"file {filename} does not exist in the index, rerun NCBI_FILE ETL"

            if not authz:
                tries = 0
                while tries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_update_authz(did=did,
                                                                  rev=rev)
                        break
                    except Exception as e:
                        tries += 1
                        print(
                            f"ERROR: Fail to update indexd for {filename}. Detail {e}. Retrying ..."
                        )
                        await asyncio.sleep(5)

            submitted_json["file_size"] = filesize
            submitted_json["md5sum"] = md5sum
            submitted_json["object_id"] = did
            submitted_json["file_name"] = file_name

            self.submitting_data[node_name].append(submitted_json)
        return submitting_accession_numbers

    async def get_submitting_accession_number_list_for_run_taxonomy(self):
        """get submitting number list for run_taxonomy file"""

        node_name = "virus_sequence_run_taxonomy"
        submitting_accession_numbers = set()
        existed_accession_numbers = await self.data_file.get_existed_accession_numbers(
            node_name)

        s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
        s3_object = s3.Object(self.data_file.bucket,
                              self.data_file.nodes[node_name][0])
        file_path = f"{DATA_PATH}/virus_sequence_run_taxonomy.gz"
        s3_object.download_file(file_path)

        n_lines = 0
        with gzip.open(file_path, "rb") as f:
            while True:
                bline = f.readline()
                if not bline:
                    break
                n_lines += 1
                if n_lines % 10000 == 0:
                    print(f"Finish process {n_lines} of file {node_name}")
                line = bline.decode("UTF-8")
                r1 = re.findall("[SDE]RR\d+", line)
                if len(r1) == 0:
                    continue
                read_accession_number = r1[0]
                if (f"{node_name}_{read_accession_number}"
                        not in existed_accession_numbers):
                    submitting_accession_numbers.add(read_accession_number)
        return list(submitting_accession_numbers)

    async def get_submitting_accession_number_list(self, node_name):
        """get submitting acession number list"""

        submitting_accession_numbers = set()
        existed_accession_numbers = await self.data_file.get_existed_accession_numbers(
            node_name)

        s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
        s3_object = s3.Object(self.data_file.bucket,
                              self.data_file.nodes[node_name][0])
        line_stream = codecs.getreader("utf-8")
        n_lines = 0
        for line in line_stream(s3_object.get()["Body"]):
            r1 = re.findall("[SDE]RR\d+", line)
            n_lines += 1
            if n_lines % 10000 == 0:
                print(f"Finish process {n_lines} of file {node_name}")
            if len(r1) == 0:
                continue
            read_accession_number = r1[0]
            if (f"{node_name}_{read_accession_number}".lower()
                    not in existed_accession_numbers):
                submitting_accession_numbers.add(read_accession_number)

        return list(submitting_accession_numbers)

    def _get_response_from_big_query(self, accession_numbers):
        """
        Get data from BigQuery. The format of the response JSON is
        described below:
        [{
            "acc": "DRR220591",
            "assay_type": "RNA-Seq",
            "center_name": "KUMAMOTO",
            "consent": "public",
            "experiment": "DRX210904",
            "sample_name": "SAMD00217265",
            "instrument": "Illumina NovaSeq 6000",
            "librarylayout": "PAIRED",
            "libraryselection": "RANDOM",
            "librarysource": "TRANSCRIPTOMIC",
            "platform": "ILLUMINA",
            "sample_acc": "DRS139760",
            "biosample": "SAMD00217265",
            "organism": "Mus musculus",
            "sra_study": "DRP006149",
            #'releasedate': datetime.datetime(2020, 6, 4, 0, 0, tzinfo=<UTC>),
            "bioproject": "PRJDB9618",
            "mbytes": 2160,
            "loaddate": None,
            "avgspotlen": 300,
            "mbases": 6395,
            "insertsize": None,
            "library_name": None,
            "biosamplemodel_sam": [],
            "collection_date_sam": [],
            "geo_loc_name_country_calc": None,
            "geo_loc_name_country_continent_calc": None,
            "geo_loc_name_sam": [],
            "ena_first_public_run": [],
            "ena_last_update_run": [],
            "sample_name_sam": ["WT3_plus"],
            "datastore_filetype": ["sra"],
            "datastore_provider": ["gs", "ncbi", "s3"],
            "datastore_region": ["gs.US", "ncbi.public", "s3.us-east-1"],
        }]
        """

        assert accession_numbers != [], "accession_numbers must not be empty"

        start = 0
        offset = 100
        client = bigquery.Client()
        while start < len(accession_numbers):
            end = min(start + offset, len(accession_numbers))
            stm = 'SELECT * FROM `nih-sra-datastore`.sra.metadata where consent = "public"'

            stm = stm + f' and (acc = "{accession_numbers[start]}"'
            for accession_number in accession_numbers[start + 1:end]:
                stm = stm + f' or acc = "{accession_number}"'
            stm = stm + ")"

            query_job = client.query(stm)

            results = query_job.result()  # Waits for job to complete.

            for row in results:
                yield dict(row)
            start = end

    async def _parse_big_query_response(self, response):
        """
        Parse the big query response and get indexd record

        Return True if success

        """

        accession_number = response["acc"]

        sample = {}
        virus_sequence = {}

        sample["submitter_id"] = f"sample_{accession_number}"
        sample["projects"] = [{"code": self.project_code}]

        for field in [
                "ncbi_bioproject",
                "ncbi_biosample",
                "sample_accession",
                "host_associated_environmental_package_sam",
                "organism",
                "collection_date",
                "country_region",
                "continent",
        ]:
            if field in SPECIAL_MAP_FIELDS:
                old_name, dtype, handler = SPECIAL_MAP_FIELDS[field]
                sample[field] = handler(response.get(old_name))
            elif field in response:
                sample[field] = str(response.get(field))

        virus_sequence["submitter_id"] = f"virus_sequence_{accession_number}"
        for field in [
                "assay_type",
                "avgspotlen",
                "bytes",
                "center_name",
                "consent",
                "datastore_provider",
                "datastore_region",
                "description_sam",
                "ena_checklist_sam",
                "ena_first_public_run",
                "ena_last_update_run",
                "experiment",
                "insdc_center_name_sam",
                "insdc_first_public_sam",
                "insdc_center_alias_sam",
                "insdc_last_update_sam",
                "investigation_type_sam",
                "insdc_status_sam",
                "instrument",
                "library_name",
                "libraryselection",
                "librarysource",
                "mbases",
                "mbytes",
                "platform",
                "sra_accession_sam",
                "sra_study",
                "title_sam",
                "release_date",
                "data_format",
                "librarylayout",
        ]:
            if field in SPECIAL_MAP_FIELDS:
                old_name, dtype, handler = SPECIAL_MAP_FIELDS[field]
                virus_sequence[field] = handler(response.get(old_name))
            elif field in response:
                virus_sequence[field] = str(response.get(field))

        virus_sequence["samples"] = [{"submitter_id": sample["submitter_id"]}]
        virus_sequence["data_category"] = "Nucleotide"
        virus_sequence["data_type"] = "Sequence"
        virus_sequence["file_name"] = self.accession_number_filename_map[
            accession_number]

        virus_sequence["data_format"] = get_file_extension(
            virus_sequence["file_name"])
        filename = virus_sequence["file_name"]

        retrying = True
        while retrying:
            try:
                (
                    did,
                    rev,
                    md5sum,
                    filesize,
                    file_name,
                    authz,
                ) = await self.file_helper.async_find_by_name(filename=filename
                                                              )
                retrying = False
            except Exception as e:
                print(
                    f"ERROR: Fail to get indexd for {filename}. Detail {e}. Retrying ..."
                )
                await asyncio.sleep(5)

        if not did:
            print(
                f"file {filename} does not exist in the index, rerun NCBI_MANIFEST ETL"
            )
            return False

        if not authz:
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    await self.file_helper.async_update_authz(did=did, rev=rev)
                    break
                except Exception as e:
                    print(
                        f"ERROR: Fail to update indexd for {filename}. Detail {e}. Retrying ..."
                    )
                    retries += 1
                    await asyncio.sleep(5)

        virus_sequence["file_size"] = filesize
        virus_sequence["md5sum"] = md5sum
        virus_sequence["object_id"] = did

        self.submitting_data["virus_sequence"].append(virus_sequence)
        self.submitting_data["sample"].append(sample)
        return True
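
The NCBI ETL above repeats the same retry loop around every indexd call. A small helper like the hypothetical one below captures that pattern; the 5-second delay and the bounded retry count mirror the constants already used in the class, everything else is an assumption for illustration.

import asyncio


async def retry_async(make_call, max_retries=5, delay=5, description=""):
    """Await `make_call()` until it succeeds or `max_retries` attempts fail."""
    for attempt in range(1, max_retries + 1):
        try:
            return await make_call()
        except Exception as e:
            print(f"Attempt {attempt} failed for {description}. Detail: {e}. Retrying ...")
            await asyncio.sleep(delay)
    raise RuntimeError(f"Giving up on {description} after {max_retries} attempts")


# usage sketch inside the ETL:
# indexd_record = await retry_async(
#     lambda: self.file_helper.async_find_by_name(filename=filename),
#     description=f"indexd lookup of {filename}",
# )
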
Beispiel #24
0
class OWID(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []

        self.program_name = "open"
        self.project_code = "OWID"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        testing_fields = [
            ("ISO code", ("summary_location", "iso3", str)),
            ("Entity", (None, None, split_entity)),
            ("Date", ("summary_clinical", "date", str)),
            ("Source URL", ("summary_clinical", "source_url", str)),
            ("Source label", ("summary_clinical", "source_label", str)),
            ("Notes", ("summary_clinical", "notes", str)),
            ("Number of observations", ("summary_clinical", "num_observations",
                                        int)),
            ("Cumulative total", ("summary_clinical", "testing", int)),
            (
                "Cumulative total per thousand",
                ("summary_clinical", "cumulative_total_per_thousand", int),
            ),
            (
                "Daily change in cumulative total",
                ("summary_clinical", "daily_change_in_cumulative_total", int),
            ),
            (
                "Daily change in cumulative total per thousand",
                (
                    "summary_clinical",
                    "daily_change_in_cumulative_total_per_thousand",
                    int,
                ),
            ),
            (
                "7-day smoothed daily change",
                ("summary_clinical", "seven_day_smoothed_daily_change", int),
            ),
            (
                "7-day smoothed daily change per thousand",
                (
                    "summary_clinical",
                    "seven_day_smoothed_daily_change_per_thousand",
                    float,
                ),
            ),
            ("Short-term positive rate", (None, None, None)),
            ("Short-term tests per case", (None, None, None)),
            ("General source label", ("summary_clinical",
                                      "general_source_label", str)),
            ("General source URL", ("summary_clinical", "general_source_url",
                                    str)),
            ("Short description", ("summary_clinical", "short_description",
                                   str)),
            ("Detailed description", ("summary_clinical",
                                      "detailed_description", str)),
        ]

        self.headers_mapping = {
            field: (k, mapping)
            for k, (field, mapping) in enumerate(testing_fields)
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-latest-data-source-details.csv"
        self.parse_file(url)

    def parse_file(self, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "  Unable to get file contents, received {}.".format(headers)

            expected_h = list(self.headers_mapping.keys())
            obtained_h = headers[:len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h)

            for row in reader:
                summary_location, summary_clinical = self.parse_row(
                    row, self.headers_mapping)

                if summary_location not in self.summary_locations:
                    self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_row(self, row, mapping):
        summary_location = {}
        summary_clinical = {}

        for k, (i, (node_type, node_field, type_conv)) in mapping.items():
            if k == "Entity":
                country, test_type = split_entity(row[i])
                summary_location["country_region"] = country
                summary_clinical["test_type"] = test_type
            if node_field:
                value = row[i]
                if value:
                    if node_type == "summary_location":
                        summary_location[node_field] = type_conv(value)
                    if node_type == "summary_clinical":
                        if type_conv == int:
                            summary_clinical[node_field] = type_conv(
                                float(value))
                        else:
                            summary_clinical[node_field] = type_conv(value)

        summary_location_submitter_id = format_location_submitter_id(
            summary_location)

        summary_location["submitter_id"] = summary_location_submitter_id
        summary_location["projects"] = [{"code": self.project_code}]

        summary_clinical[
            "submitter_id"] = format_summary_clinical_submitter_id(
                summary_location_submitter_id,
                test_type=summary_clinical["test_type"],
                date=datetime.date.today().strftime("%Y-%m-%d"),
            )
        summary_clinical["summary_locations"] = [{
            "submitter_id":
            summary_location_submitter_id
        }]

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for rep in self.summary_clinicals:
            rep_record = {"type": "summary_clinical"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
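
`split_entity` is imported from elsewhere in this repo. A minimal sketch of what it appears to do, assuming OWID's "Entity" column has the form "<country> - <test type>" (the exact separator handling is an assumption):

def split_entity(entity):
    """Split an OWID 'Entity' value such as 'United States - tests performed'."""
    country, _, test_type = entity.partition(" - ")
    return country.strip(), test_type.strip()


# split_entity("United States - tests performed")
# -> ("United States", "tests performed")
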
Beispiel #25
0
class CHI_NBHD(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []

        self.program_name = "open"
        self.project_code = "CHI-NBHD"

        self.country = "US"
        self.state = "IL"

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """
        url = "https://covid19neighborhoods.southsideweekly.com/page-data/index/page-data.json"
        self.parse_file(url)

    def parse_file(self, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            data = data["result"]["data"]
            build_time_str = data["build_time"]["nodes"][0]["buildTime"]
            build_time = datetime.datetime.strptime(
                build_time_str, "%Y-%m-%dT%H:%M:%S.%fZ"
            )
            current_date = build_time.strftime("%Y-%m-%d")
            nbhd_stats = data["community_areas_all"]["nodes"][0]["childGeoJson"][
                "features"
            ]

            for nbhd_object in nbhd_stats:
                summary_location, summary_clinical = self.parse_nbhd(
                    nbhd_object, current_date
                )

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

                print(summary_location)
                print(summary_clinical)

    def parse_nbhd(self, nbhd_object, date):
        properties = nbhd_object["properties"]
        nbhd = properties["community"]
        deaths = properties["value"]
        population = properties["population"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "nbhd": nbhd},
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "community_area": nbhd,
            "projects": [{"code": self.project_code}],
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "deaths_per_10000": round(10000 * deaths / population, 2),
            "deaths": deaths,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
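
`format_submitter_id` and `derived_submitter_id` come from this repo's utils module and are used by most ETLs above. The sketch below is only a guess at the naming scheme (lowercased node type plus the key values, joined by underscores); the real implementations may differ.

import re


def format_submitter_id(node, args):
    """E.g. ("summary_location", {"country": "US", "state": "IL", "nbhd": "Austin"})
    -> "summary_location_us_il_austin" (assumed scheme)."""
    parts = [node] + [str(v) for v in args.values()]
    submitter_id = "_".join(parts).lower()
    return re.sub(r"[^a-z0-9_]+", "_", submitter_id)


def derived_submitter_id(base_submitter_id, base_node, derived_node, args):
    """Derive e.g. a summary_clinical ID from a summary_location ID (assumed scheme)."""
    derived = base_submitter_id.replace(base_node, derived_node, 1)
    suffix = "_".join(str(v) for v in args.values())
    return f"{derived}_{suffix}".lower()
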
Beispiel #26
0
class IDPH_HOSPITAL(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Hospital"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.summary_locations = []
        self.summary_clinicals = []

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """

        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph(
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print(
                "Nothing to submit: today and latest submitted date are the same."
            )
            return
        today_str = today.strftime("%Y%m%d")

        print(f"Getting data for date: {today_str}")
        url = "https://dph.illinois.gov/sitefiles/COVIDHospitalRegions.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): date of the latest "summary_clinical" record available for this project
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                    "%Y-%m-%d"):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            (
                summary_location,
                summary_clinical_statewide_current,
            ) = self.parse_statewide_values(date, data["statewideValues"])

            self.summary_locations.append(summary_location)

            for utilization in data["HospitalUtilizationResults"]:
                summary_clinical = self.parse_historical(
                    utilization, summary_clinical_statewide_current)

                self.summary_clinicals.append(summary_clinical)

            for region in data["regionValues"]:
                (summary_location,
                 summary_clinical) = self.parse_region(date, region)

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_historical(self, utilization,
                         summary_clinical_statewide_current):
        utilization_mapping = {
            "reportDate": "date",
            "TotalBeds": "state_total_beds",
            "TotalOpenBeds": "total_open_beds",
            "TotalInUseBedsNonCOVID": "total_in_use_beds_non_covid",
            "TotalInUseBedsCOVID": "total_in_use_beds_covid",
            "ICUBeds": "icu_beds",
            "ICUOpenBeds": "icu_open_beds",
            "ICUInUseBedsNonCOVID": "icu_in_use_beds_non_covid",
            "ICUInUseBedsCOVID": "icu_in_use_beds_covid",
            "VentilatorCapacity": "ventilator_capacity",
            "VentilatorAvailable": "ventilator_available",
            "VentilatorInUseNonCOVID": "ventilator_in_use_non_covid",
            "VentilatorInUseCOVID": "ventilator_in_use_covid",
        }
        date = utilization["reportDate"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state
            },
        )

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {
                "project": "idph_hospital",
                "date": date
            },
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        for k, v in utilization.items():
            summary_clinical[utilization_mapping[k]] = v

        if (summary_clinical_submitter_id ==
                summary_clinical_statewide_current["submitter_id"]):
            summary_clinical.update(summary_clinical_statewide_current)

        return summary_clinical

    def parse_statewide_values(self, date, statewide_values):
        statewide_mapping = {
            "ICUCapacity": "state_icu_capacity",
            "ICUCovidPatients": "state_icu_covid_patients",
            "VentCapacity": "state_vent_capacity",
            "VentCovidPatients": "state_vent_covid_patients",
            "ICUAvailable": "state_icu_available",
            "VentsAvailable": "state_vents_available",
            "TotalBeds": "state_total_beds",
            "TotalBedsAvailable": "state_total_beds_available",
            "TotalBedsUsed": "state_total_beds_used",
            "PctHospitalBedsAvailable": "state_pct_hospital_beds_available",
            "AdultICUCapacity": "state_adult_icu_capacity",
            "ICUOpenBeds": "state_icu_open_beds",
            "ICUBedsUsed": "state_icu_beds_used",
            "ICUOpenBedsPct": "state_icu_open_beds_pct",
            "COVIDPUIPatients": "state_covid_pui_patients",
            "COVIDPUIPatientsPct": "state_covid_pui_patients_pct",
            "COVIDPUIPatientsBedsInUsePct":
            "state_covid_pui_patients_beds_in_use_pct",
            "VentilatorCapacity": "state_ventilator_capacity",
            "VentilatorsOpen": "state_ventilators_open",
            "VentilatorsOpenPct": "state_Ventilators_open_pct",
            "VentilatorsInUse": "state_ventilators_in_use",
            "VentilatorsInUseCOVID": "state_ventilators_in_use_covid",
            "VentilatorsCOVIDPatientsPct":
            "state_ventilators_covid_patients_pct",
            "VentilatorsCOVIDPatientsInUsePct":
            "state_ventilators_covid_patients_in_use_pct",
            "CovidPatientsNonICU": "state_covid_patients_non_icu",
            "TotalCOVIDPUIInICU": "state_total_covid_pui_in_icu",
            "TotalCOVIDPUIInHospital": "state_total_covid_pui_in_hospital",
            "PctBedsCOVIDPUI": "state_pct_beds_covid_pui",
            "MedSurgBeds": "state_med_surg_beds",
            "MedSurgBedsOpen": "state_med_surg_beds_open",
            "MedSurgBedsOpenPct": "state_med_surg_beds_open_pct",
            "MedSurgBedsInUse": "state_med_surg_beds_in_use",
        }

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state
            },
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
            "country_region": self.country,
            "province_state": self.state,
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {
                "project": "idph_hospital",
                "date": date
            },
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        for k, v in statewide_values.items():
            summary_clinical[statewide_mapping[k]] = v

        return summary_location, summary_clinical

    def parse_region(self, date, hospital_region):
        """
        From hospital region-level data, generate the data we can submit via Sheepdog
        """
        region = hospital_region["region"]
        region_description = hospital_region["region_description"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state,
                "region": region,
            },
        )

        summary_location = {
            "country_region": self.country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
            "province_state": self.state,
            "state_hospital_region": region,
            "state_region_description": strip_prefix(region_description),
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {
                "project": "idph_hospital",
                "date": date
            },
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
            "region_icu_avail": hospital_region["ICUAvail"],
            "region_icu_capacity": hospital_region["ICUCapacity"],
            "region_vents_available": hospital_region["VentsAvailable"],
            "region_vents_capacity": hospital_region["VentsCapacity"],
        }

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
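
Every ETL in this file exposes the same two-step interface: `files_to_submissions()` builds the records and `submit_metadata()` batch-submits them. A minimal driver sketch (the actual job runner in this repo may look different; the environment variable names are placeholders):

import os


def run_etl(etl_class):
    etl = etl_class(
        base_url=os.environ["BASE_URL"],
        access_token=os.environ["ACCESS_TOKEN"],
        s3_bucket=os.environ.get("S3_BUCKET", ""),
    )
    etl.files_to_submissions()  # read and parse the source data
    etl.submit_metadata()       # batch-submit the resulting Sheepdog records


# run_etl(IDPH_HOSPITAL)
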
Beispiel #27
0
class JHU(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.location_data = {}
        self.time_series_data = defaultdict(lambda: defaultdict(dict))
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.expected_csv_headers = {
            "global":
            ["Province/State", "Country/Region", "Lat", "Long", "1/22/20"],
            "US_counties": {
                "confirmed": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "1/22/20",
                ],
                "deaths": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "Population",  # TODO use this
                    "1/22/20",
                ],
            },
        }
        self.header_to_column = {
            "global": {
                "province": 0,
                "country": 1,
                "latitude": 2,
                "longitude": 3,
                "dates_start": 4,
            },
            "US_counties": {
                "confirmed": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 11,
                },
                "deaths": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 12,
                },
            },
        }
        self.existing_summary_locations = []
        self.last_date = ""

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        urls = {
            "global": {
                "confirmed":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
                "deaths":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                "recovered":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
                # "testing": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_testing_global.csv",
            },
            "US_counties": {
                "confirmed":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv",
                "deaths":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv",
            },
        }

        (
            self.existing_summary_locations,
            self.last_date,
        ) = self.metadata_helper.get_existing_data_jhu()

        for file_type in ["global", "US_counties"]:
            for data_type, url in urls[file_type].items():
                self.parse_file(file_type, data_type, url)

    def parse_file(self, file_type, data_type, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.location_data` and `self.time_series_data`.
        Ignores any records that are already in Sheepdog (relies on unique
        `submitter_id` to check)

        Args:
            file_type (str): type of this file - one
                of ["global", "US_counties"]
            data_type (str): type of the data in this file - one
                of ["confirmed", "deaths", "recovered"]
            url (str): URL at which the CSV file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            if headers[0] == "404: Not Found":
                print("  Unable to get file contents, received {}.".format(
                    headers))
                return

            expected_h = self.expected_csv_headers[file_type]
            if isinstance(expected_h, dict):
                expected_h = expected_h[data_type]
            obtained_h = headers[:len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h)

            first_date_i = [
                i for i, h in enumerate(headers) if h.endswith("/20")
            ][0]
            last_date = headers[-1]
            print("  First date: {}; last date: {}".format(
                headers[first_date_i], last_date))

            for row in reader:
                if not row:  # ignore empty rows
                    continue
                location, date_to_value = self.parse_row(
                    file_type, data_type, headers, row)
                if not location:
                    # We are using US data by state instead of global
                    continue

                location_submitter_id = location["submitter_id"]
                if (location_submitter_id not in self.location_data
                        # do not re-submit location data that already exist
                        and location_submitter_id
                        not in self.existing_summary_locations):
                    self.location_data[location_submitter_id] = location

                for date, value in date_to_value.items():
                    # do not re-submit summary_clinical data that
                    # already exist. Assume anything older than the last
                    # submitted date has already been submitted
                    if (time_series_date_to_string(date) >
                            time_series_date_to_string(self.last_date)
                            or LAST_DATE_ONLY):
                        self.time_series_data[location_submitter_id][date][
                            data_type] = value

    def parse_row(self, file_type, data_type, headers, row):
        """
        Converts a row of a CSV file to data we can submit via Sheepdog

        Args:
            file_type (str): type of this file - one
                of ["global", "US_counties"]
            data_type (str): type of the data in this file - one
                of ["confirmed", "deaths", "recovered"]
            headers (list(str)): CSV file headers (first row of the file)
            row (list(str)): row of data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
        """
        header_to_column = self.header_to_column[file_type]
        if "country" not in header_to_column:
            header_to_column = header_to_column[data_type]

        country = row[header_to_column["country"]]
        province = row[header_to_column["province"]]
        latitude = row[header_to_column["latitude"]] or "0"
        longitude = row[header_to_column["longitude"]] or "0"

        if country == "US" and province == "":
            # We are using US data by state instead of global
            return None, None

        if int(float(latitude)) == 0 and int(float(longitude)) == 0:
            # Data with "Out of <state>" or "Unassigned" county value have
            # unknown coordinates of (0,0). We don't submit them for now
            return None, None

        submitter_id = format_location_submitter_id(country, province)
        location = {
            "country_region": country,
            "latitude": latitude,
            "longitude": longitude,
            "projects": [{
                "code": self.project_code
            }],
        }
        if province:
            location["province_state"] = province
        if file_type == "US_counties":
            county = row[header_to_column["county"]]
            iso2 = row[header_to_column["iso2"]]
            iso3 = row[header_to_column["iso3"]]
            code3 = row[header_to_column["code3"]]
            fips = row[header_to_column["FIPS"]]
            if county:
                location["county"] = county
                submitter_id = format_location_submitter_id(
                    country, province, county)
            if iso2:
                location["iso2"] = iso2
            if iso3:
                location["iso3"] = iso3
            if code3:
                location["code3"] = int(code3)
            if fips:
                location["FIPS"] = int(float(fips))
        location["submitter_id"] = submitter_id

        date_to_value = {}
        dates_start = header_to_column["dates_start"]
        dates_indices = range(dates_start, len(headers))
        if LAST_DATE_ONLY:
            dates_indices = [len(headers) - 1]
        for i in dates_indices:
            date = headers[i]
            date = get_unified_date_format(date)

            if row[i] == "":  # ignore empty values
                continue
            try:
                val = int(float(row[i]))
            except ValueError:
                print(
                    'Unable to convert {} to int for "{}", "{}" at {}'.format(
                        row[i], province, country, date))
                raise
            date_to_value[date] = val

        return location, date_to_value

    def submit_metadata(self):
        """
        Converts the data in `self.time_series_data` to Sheepdog records.
        `self.location_data` already contains Sheepdog records. Batch submits
        all records in `self.location_data` and `self.time_series_data`.
        """
        if LAST_DATE_ONLY:
            # delete the old data from the Sheepdog DB
            print("Deleting old summary_clinical data")
            self.metadata_helper.delete_nodes(["summary_clinical"])

        print("Submitting summary_location data")
        for location in self.location_data.values():
            record = {"type": "summary_location"}
            record.update(location)
            self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for location_submitter_id, time_series in self.time_series_data.items(
        ):
            for date, data in time_series.items():
                submitter_id = format_summary_clinical_submitter_id(
                    location_submitter_id, date)
                record = {
                    "type": "summary_clinical",
                    "submitter_id": submitter_id,
                    "summary_locations": [{
                        "submitter_id": location_submitter_id
                    }],
                    "date": date,
                }
                for data_type, value in data.items():
                    record[data_type] = value
                self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()
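
`get_unified_date_format` is imported from this repo's utils. A sketch of the conversion it applies to JHU column headers such as "1/22/20", assuming the target is the ISO "YYYY-MM-DD" format used by the other ETLs:

import datetime


def get_unified_date_format(date):
    """Convert a JHU CSV header date like '1/22/20' to '2020-01-22' (assumed target format)."""
    return datetime.datetime.strptime(date, "%m/%d/%y").strftime("%Y-%m-%d")


# get_unified_date_format("1/22/20")  -> "2020-01-22"
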
Beispiel #28
0
class SSR(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.subjects = []
        self.demographics = []

        self.program_name = "controlled"
        self.project_code = "SSR"
        self.country = "US"
        self.state = "IL"

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # self.records = { <node ID>: { <submitter_id>: { <data> } } }
        self.records = defaultdict(dict)

        # TODO temporary - for now this ETL can only be run manually
        self.file_path = os.environ.get("FILE_PATH")
        if not self.file_path:
            # log instead of exception so that unit tests don't complain
            print("Need FILE_PATH environment variable (SSR file to parse)")

    def files_to_submissions(self):
        """
        Reads input files and converts the data to Sheepdog records
        """
        print("Parsing file: {}".format(self.file_path))
        extension = self.file_path.lower().split(".")[-1]
        if extension == "txt":
            self.parse_txt_input_file()
        elif extension == "xlsx":
            self.parse_xlsx_input_file()
        else:
            raise Exception(
                f"I don't know how to parse extension {extension} for file {self.file_path}"
            )

    def parse_txt_input_file(self):
        with open(self.file_path, newline="") as csvfile:
            reader = csv.reader(csvfile, delimiter="|")
            header = next(reader)
            header = {k: v for v, k in enumerate(header)}

            for row in reader:
                row_data = dict(zip(header, row))
                self.parse_input(row_data=row_data)

    def parse_xlsx_input_file(self):
        # Set up file path, workbook, and sheet.
        wb = xlrd.open_workbook(self.file_path)
        sheet = wb.sheet_by_index(0)

        # Create lists of SSR properties and values from the Excel sheet.
        prop_list = sheet.col_values(0)[1:]
        value_list = sheet.col_values(1)[1:]

        col_data = dict(zip(prop_list, value_list))
        self.parse_input(row_data=col_data, date_mode=wb.datemode)

    def parse_input(self, row_data, date_mode=None):
        # (original property, (gen3 node, gen3 property, property type))
        mapping = [
            ("reportingOrg", ("summary_location", "reporting_org", str)),
            ("reportDate", ("statistical_summary_report", "report_date", str)),
            ("num_COVID", ("statistical_summary_report", "num_COVID", int)),
            (
                "num_COVID_deaths",
                ("statistical_summary_report", "num_COVID_deaths", int),
            ),
            ("num_outpatient", ("statistical_summary_report", "num_outpatient",
                                int)),
            ("num_admitted", ("statistical_summary_report", "num_admitted",
                              int)),
            ("num_icu", ("statistical_summary_report", "num_icu", int)),
            ("num_vent", ("statistical_summary_report", "num_vent", int)),
            ("num_resp", ("statistical_summary_report", "num_resp", int)),
            ("num_pneu", ("statistical_summary_report", "num_pneu", int)),
            ("num_diab", ("statistical_summary_report", "num_diab", int)),
            ("num_asth", ("statistical_summary_report", "num_asth", int)),
            ("num_obes", ("statistical_summary_report", "num_obes", int)),
            ("num_card", ("statistical_summary_report", "num_card", int)),
            ("num_chf", ("statistical_summary_report", "num_chf", int)),
        ]

        # row_records = { <node ID>: { <record data> } }
        # (there is only 1 record of each node type per row)
        row_records = defaultdict(dict)

        for orig_prop_name, (node_type, prop_name, _type) in mapping:
            if row_data[orig_prop_name]:
                row_records[node_type][prop_name] = format_value(
                    prop_name, row_data[orig_prop_name], _type, date_mode)

        # add missing summary_location props
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "reporting_org":
                row_records["summary_location"]["reporting_org"]
            },
        )
        row_records["summary_location"].update({
            "type": "summary_location",
            "submitter_id": summary_location_submitter_id,
            "projects": {
                "code": self.project_code
            },
            "country_region": self.country,
            "province_state": self.state,
        })

        # add missing statistical_summary_report props
        ssr_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "statistical_summary_report",
            "ssr",
            {
                "report_date":
                row_records["statistical_summary_report"]["report_date"]
            },
        )
        row_records["statistical_summary_report"].update({
            "type": "statistical_summary_report",
            "submitter_id": ssr_submitter_id,
            "summary_locations": {
                "submitter_id": summary_location_submitter_id
            },
        })

        for node_type in row_records:
            rec = row_records[node_type]
            self.records[node_type][rec["submitter_id"]] = rec

    def submit_metadata(self):
        # TODO check which summary_locations already exist
        for node_type in SUBMISSION_ORDER:
            recs = self.records[node_type].values()
            self.metadata_helper.add_records_to_submit(recs)
            self.metadata_helper.batch_submit_records()
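
# Usage sketch (not part of the original example): how this ETL is presumably
# driven, assuming the framework supplies valid credentials. The URL, token,
# bucket and file path below are hypothetical placeholders.
import os

os.environ["FILE_PATH"] = "/tmp/ssr_export.txt"  # pipe-delimited .txt or .xlsx export
etl = SSR("https://example-commons.org", "my-access-token", "my-s3-bucket")
etl.files_to_submissions()  # parse the file into self.records, grouped by node type
etl.submit_metadata()       # batch-submit each node type in SUBMISSION_ORDER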
Example #29
0
class VAC_TRACKER(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.clinical_trials = []
        self.program_name = "open"
        self.project_code = "VacTracker"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads the remote JSON file and converts the data to Sheepdog records
        """
        url = "https://biorender.com/page-data/covid-vaccine-tracker/page-data.json"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.clinical_trials`.

        Args:
            url (str): URL at which the file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            try:
                for treatment in data["result"]["pageContext"]["treatments"]:
                    node = treatment["node"]
                    clinical_trial = self.parse_node(node)
                    self.clinical_trials.append(clinical_trial)
            except ValueError as e:
                print(f"ERROR: value error. Detail {e}")

    def parse_node(self, node):
        """
        Converts an element of a JSON file to data we can submit via Sheepdog

        Args:
            node (dict): node data

        Returns:
            dict:
                - clinical trial data, in a format ready to be submitted to Sheepdog
        """
        clinical_trial = {
            "projects": [{"code": self.project_code}],
            "type": "clinical_trials",
        }

        for key, value in node.items():
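            # MAP_FIELDS (defined elsewhere in the module) maps a source JSON key
            # to a (gen3 field name, expected Python type) pair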
            if key not in MAP_FIELDS:
                continue
            gen3_field = MAP_FIELDS.get(key)[0]
            gen3_field_type = MAP_FIELDS.get(key)[1]
            if type(value) != gen3_field_type:
                print(
                    f"ERROR: the type of {key} does not match the type expected in Gen3. Skipping it"
                )
                continue
            if key == "fdaApproved":
                if "FDA-approved" in value:
                    value = "Yes"
                elif value == "":
                    value = "Unknown"
                elif value in ["N/A", "N//A", "N/A*"]:
                    value = "NA"
                elif value not in ["Yes", "No", "Unknown", "NA", None]:
                    value = "Unknown"
            if key == "customClinicalPhase":
                if value.lower() == "phase na":
                    value = "Phase N/A"
                elif value.lower() in ["preclinical", "pre-clinical"]:
                    value = "Preclinical Phase"
                elif value not in [
                    "Preclinical Phase",
                    "Phase I",
                    "Phase I/II",
                    "Phase II",
                    "Phase I/II/III",
                    "Phase III",
                    "Phase III/IV",
                    "Phase IV",
                    "Phase I/III/IV",
                    "Phase I/IV",
                    "Phase II/IV",
                    "Phase II/III/IV",
                    "Phase I/II/III/IV",
                    "Phase II/III",
                    "Phase N/A",
                    None,
                ]:
                    value = None
            if key == "technology":
                value = value.replace("*", "")
                if "to repurpose" in value.lower():
                    value = "Repurposed"
                if value not in [
                    "Antibodies",
                    "Antivirals",
                    "Cell-based therapies",
                    "Device",
                    "DNA-based",
                    "Inactivated virus",
                    "Modified APC",
                    "Non-replicating viral vector",
                    "Protein subunit",
                    "RNA-based treatments",
                    "RNA-based vaccine",
                    "Repurposed",
                    "Virus Like Particle",
                    "Other",
                    None,
                ]:
                    value = "Other"
            if key == "developmentStage":
                if value.lower() in ["preclinical", "pre-clinical"]:
                    value = "Preclinical Phase"
                elif value not in ["Preclinical Phase", "Clinical", "Withdrawn", None]:
                    value = "Other"

            if gen3_field_type == list:
                value = [str(v) for v in value]
            clinical_trial[gen3_field] = value
        return clinical_trial

    def submit_metadata(self):
        """
        Batch submits all records in `self.clinical_trials`.
        """

        print("Submitting clinical_trial data")
        for clinical_trial in self.clinical_trials:
            self.metadata_helper.add_record_to_submit(clinical_trial)
        self.metadata_helper.batch_submit_records()
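
# Usage sketch (not part of the original example): unlike the SSR ETL above,
# this one needs no input file; it downloads the BioRender page-data JSON
# itself. The URL, token and bucket below are hypothetical placeholders.
etl = VAC_TRACKER("https://example-commons.org", "my-access-token", "my-s3-bucket")
etl.files_to_submissions()  # fetch and normalize treatments into self.clinical_trials
etl.submit_metadata()       # batch-submit the clinical_trials records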
Example #30
0
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.location_data = {}
        self.time_series_data = defaultdict(lambda: defaultdict(dict))
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.expected_csv_headers = {
            "global":
            ["Province/State", "Country/Region", "Lat", "Long", "1/22/20"],
            "US_counties": {
                "confirmed": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "1/22/20",
                ],
                "deaths": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "Population",  # TODO use this
                    "1/22/20",
                ],
            },
        }
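        # Note (inferred from the header lists above and the column offsets below):
        # in the JHU CSSE time-series CSVs, every column from the "dates_start"
        # index onward is a per-date cumulative count, one column per day
        # starting with 1/22/20.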
        self.header_to_column = {
            "global": {
                "province": 0,
                "country": 1,
                "latitude": 2,
                "longitude": 3,
                "dates_start": 4,
            },
            "US_counties": {
                "confirmed": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 11,
                },
                "deaths": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 12,
                },
            },
        }
        self.existing_summary_locations = []
        self.last_date = ""