Example #1
class JHU_COUNTRY_CODES(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        codes_dict = get_codes_dictionary()
        locations = self.get_existing_locations()
        for location in locations:
            codes = get_codes_for_country_name(codes_dict, location["country_region"])

            # do not update the record if it already has the codes
            if location["iso2"] == codes["iso2"] and location["iso3"] == codes["iso3"]:
                continue

            record = {k: v for k, v in location.items() if v is not None}
            record.update({
                "type": "summary_location",
                "projects": [{"code": self.project_code}],
                "iso2": codes["iso2"],
                "iso3": codes["iso3"],
            })
            self.metadata_helper.add_record_to_submit(record)

    def submit_metadata(self):
        self.metadata_helper.batch_submit_records()

    def get_existing_locations(self):
        print("Getting summary_location data from Peregrine")
        query_string = (
            f'{{ summary_location (first: 0, project_id: "{self.program_name}-{self.project_code}") '
            "{ submitter_id, country_region, iso2, iso3 } }"
        )
        query_res = self.metadata_helper.query_peregrine(query_string)
        return query_res["data"]["summary_location"]
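
Example #1 relies on two helpers, get_codes_dictionary and get_codes_for_country_name, that are defined elsewhere in the ETL package. A minimal sketch of their expected shape (the data source, URL and field names below are assumptions for illustration, not the project's actual implementation):

import requests

def get_codes_dictionary():
    # Hypothetical source: any dataset mapping country names to ISO codes.
    # The real helper may read a bundled file or another API instead.
    resp = requests.get("https://example.com/country_codes.json")  # placeholder URL
    resp.raise_for_status()
    return {
        row["name"]: {"iso2": row["iso2"], "iso3": row["iso3"]}
        for row in resp.json()
    }

def get_codes_for_country_name(codes_dict, country_name):
    # Fall back to empty codes when the country name is unknown.
    return codes_dict.get(country_name, {"iso2": None, "iso3": None})
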
Example #2
def main():
    headers = {"Authorization": f"bearer {access_token}"}
    records = get_existing_data(base_url, program, project, old_node, headers)

    metadata_helper = MetadataHelper(
        base_url=base_url,
        program_name=program,
        project_code=project,
        access_token=access_token,
    )
    print(f"Submitting {new_node} data")
    for old_rec in records:
        new_rec = {"type": new_node, "project_id": f"{program}-{project}"}
        for key, value in old_rec.items():
            if value:
                new_rec[key] = value
        metadata_helper.add_record_to_submit(new_rec)
    metadata_helper.batch_submit_records()
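
Example #2's main() assumes module-level values (base_url, access_token, program, project, old_node, new_node) and a get_existing_data helper defined elsewhere. A sketch of what that helper could look like, querying Peregrine over GraphQL (the endpoint path and the minimal field list are assumptions):

import requests

def get_existing_data(base_url, program, project, node, headers):
    # Hypothetical helper: fetch all records of `node` from Peregrine.
    # `first: 0` is the Peregrine convention for "no limit".
    query = f'{{ {node} (first: 0, project_id: "{program}-{project}") {{ submitter_id }} }}'
    resp = requests.post(
        f"{base_url}/api/v0/submission/graphql",
        json={"query": query},
        headers=headers,
    )
    resp.raise_for_status()
    return resp.json()["data"][node]
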
Example #3
class STOPLIGHT(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_clinicals = []
        self.summary_locations = []
        self.program_name = "open"
        self.project_code = "covidstoplight"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """
        url = "https://covidstoplight.org/api/v0/location/US"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a json file to data we can submit via Sheepdog. Stores the
        records to submit in `self.location_data` and `self.time_series_data`.
        Ignores any records that are already in Sheepdog (relies on unique
        `submitter_id` to check)

        Args:
            url (str): URL at which the file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            timestamp_created = data["data"]["generated"]
            country = data["country_code"]
            summary_location_list = []
            try:
                for zipcode, feelings in data["data"]["submissions"].items():
                    node = {
                        "zipcode": zipcode,
                        "feelings": feelings,
                        "timestamp_created": timestamp_created,
                        "country": country,
                    }
                    summary_location, summary_clinical = self.parse_node(node)
                    summary_location_submitter_id = summary_location["submitter_id"]
                    if summary_location_submitter_id not in summary_location_list:
                        self.summary_locations.append(summary_location)
                        summary_location_list.append(summary_location_submitter_id)
                    self.summary_clinicals.append(summary_clinical)
            except ValueError as e:
                print(f"ERROR: value error. Detail {e}")

    def parse_node(self, node):
        """
        Converts an element of an JSON file to data we can submit via Sheepdog

        Args:
            node (dict): node data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
        """
        zipcode = node["zipcode"]
        feelings = node["feelings"]
        timestamp_created = node["timestamp_created"]
        country = node["country"]
        summary_location_submitter_id = format_location_submitter_id(country, zipcode)
        summary_location = {
            "country_region": country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "zipcode": zipcode,
        }

        date = datetime.strptime(timestamp_created, "%Y-%m-%dT%H:%M:%S").date()
        date = date.strftime("%Y-%m-%d")
        summary_clinical_submitter_id = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date)

        summary_clinical = {
            "date": date,
            "timestamp_created": timestamp_created,
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        map_fields = {
            1: "feeling_healthy_count",
            2: "feeling_not_so_good_count",
            3: "feeling_sick_count",
        }

        for element in feelings:
            summary_clinical[map_fields[element["feeling"]]] = element["count"]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Converts the data in `self.time_series_data` to Sheepdog records.
        `self.location_data already contains Sheepdog records. Batch submits
        all records in `self.location_data` and `self.time_series_data`
        """

        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for rep in self.summary_clinicals:
            rep_record = {"type": "summary_clinical"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
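
Example #3 depends on format_location_submitter_id and format_summary_clinical_submitter_id. One plausible implementation, assuming the common pattern of lowercased, underscore-joined IDs (the exact format is project-specific and not shown here):

def format_location_submitter_id(country, zipcode):
    # Assumed scheme: node name plus identifying fields, lowercased.
    return f"summary_location_{country}_{zipcode}".lower()

def format_summary_clinical_submitter_id(location_submitter_id, date):
    # Derive the clinical ID from the location ID plus the date.
    suffix = location_submitter_id.replace("summary_location_", "", 1)
    return f"summary_clinical_{suffix}_{date}".lower()
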
Example #4
class IDPH_HOSPITAL(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Hospital"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.summary_locations = []
        self.summary_clinicals = []

    def files_to_submissions(self):
        """
        Reads JSON file and convert the data to Sheepdog records
        """

        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph(
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print(
                "Nothing to submit: today and latest submitted date are the same."
            )
            return
        today_str = today.strftime("%Y%m%d")

        print(f"Getting data for date: {today_str}")
        url = "https://dph.illinois.gov/sitefiles/COVIDHospitalRegions.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the date of latest available "summary_clinical" for project
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime("%Y-%m-%d"):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            (
                summary_location,
                summary_clinical_statewide_current,
            ) = self.parse_statewide_values(date, data["statewideValues"])

            self.summary_locations.append(summary_location)

            for utilization in data["HospitalUtilizationResults"]:
                summary_clinical = self.parse_historical(
                    utilization, summary_clinical_statewide_current)

                self.summary_clinicals.append(summary_clinical)

            for region in data["regionValues"]:
                summary_location, summary_clinical = self.parse_region(date, region)

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_historical(self, utilization,
                         summary_clinical_statewide_current):
        utilization_mapping = {
            "reportDate": "date",
            "TotalBeds": "state_total_beds",
            "TotalOpenBeds": "total_open_beds",
            "TotalInUseBedsNonCOVID": "total_in_use_beds_non_covid",
            "TotalInUseBedsCOVID": "total_in_use_beds_covid",
            "ICUBeds": "icu_beds",
            "ICUOpenBeds": "icu_open_beds",
            "ICUInUseBedsNonCOVID": "icu_in_use_beds_non_covid",
            "ICUInUseBedsCOVID": "icu_in_use_beds_covid",
            "VentilatorCapacity": "ventilator_capacity",
            "VentilatorAvailable": "ventilator_available",
            "VentilatorInUseNonCOVID": "ventilator_in_use_non_covid",
            "VentilatorInUseCOVID": "ventilator_in_use_covid",
        }
        date = utilization["reportDate"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state
            },
        )

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {
                "project": "idph_hospital",
                "date": date
            },
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        for k, v in utilization.items():
            summary_clinical[utilization_mapping[k]] = v

        if (summary_clinical_submitter_id ==
                summary_clinical_statewide_current["submitter_id"]):
            summary_clinical.update(summary_clinical_statewide_current)

        return summary_clinical

    def parse_statewide_values(self, date, statewide_values):
        statewide_mapping = {
            "ICUCapacity": "state_icu_capacity",
            "ICUCovidPatients": "state_icu_covid_patients",
            "VentCapacity": "state_vent_capacity",
            "VentCovidPatients": "state_vent_covid_patients",
            "ICUAvailable": "state_icu_available",
            "VentsAvailable": "state_vents_available",
            "TotalBeds": "state_total_beds",
            "TotalBedsAvailable": "state_total_beds_available",
            "TotalBedsUsed": "state_total_beds_used",
            "PctHospitalBedsAvailable": "state_pct_hospital_beds_available",
            "AdultICUCapacity": "state_adult_icu_capacity",
            "ICUOpenBeds": "state_icu_open_beds",
            "ICUBedsUsed": "state_icu_beds_used",
            "ICUOpenBedsPct": "state_icu_open_beds_pct",
            "COVIDPUIPatients": "state_covid_pui_patients",
            "COVIDPUIPatientsPct": "state_covid_pui_patients_pct",
            "COVIDPUIPatientsBedsInUsePct":
            "state_covid_pui_patients_beds_in_use_pct",
            "VentilatorCapacity": "state_ventilator_capacity",
            "VentilatorsOpen": "state_ventilators_open",
            "VentilatorsOpenPct": "state_Ventilators_open_pct",
            "VentilatorsInUse": "state_ventilators_in_use",
            "VentilatorsInUseCOVID": "state_ventilators_in_use_covid",
            "VentilatorsCOVIDPatientsPct":
            "state_ventilators_covid_patients_pct",
            "VentilatorsCOVIDPatientsInUsePct":
            "state_ventilators_covid_patients_in_use_pct",
            "CovidPatientsNonICU": "state_covid_patients_non_icu",
            "TotalCOVIDPUIInICU": "state_total_covid_pui_in_icu",
            "TotalCOVIDPUIInHospital": "state_total_covid_pui_in_hospital",
            "PctBedsCOVIDPUI": "state_pct_beds_covid_pui",
            "MedSurgBeds": "state_med_surg_beds",
            "MedSurgBedsOpen": "state_med_surg_beds_open",
            "MedSurgBedsOpenPct": "state_med_surg_beds_open_pct",
            "MedSurgBedsInUse": "state_med_surg_beds_in_use",
        }

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state
            },
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "country_region": self.country,
            "province_state": self.state,
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {
                "project": "idph_hospital",
                "date": date
            },
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        for k, v in statewide_values.items():
            summary_clinical[statewide_mapping[k]] = v

        return summary_location, summary_clinical

    def parse_region(self, date, hospital_region):
        """
        From hospital-region-level data, generate the data we can submit via Sheepdog
        """
        region = hospital_region["region"]
        region_description = hospital_region["region_description"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "project": "idph_hospital",
                "country": self.country,
                "state": self.state,
                "region": region,
            },
        )

        summary_location = {
            "country_region": self.country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "province_state": self.state,
            "state_hospital_region": region,
            "state_region_description": strip_prefix(region_description),
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {
                "project": "idph_hospital",
                "date": date
            },
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
            "region_icu_avail": hospital_region["ICUAvail"],
            "region_icu_capacity": hospital_region["ICUCapacity"],
            "region_vents_available": hospital_region["VentsAvailable"],
            "region_vents_capacity": hospital_region["VentsCapacity"],
        }

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
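
Examples #4 through #7 all build submitter IDs with format_submitter_id and derived_submitter_id. A sketch of the contract those calls imply; only the call signatures are taken from the examples, the concrete ID format is an assumption:

def format_submitter_id(node, args):
    # Join the node name with each qualifier value, normalized to
    # lowercase with underscores instead of spaces.
    parts = [node] + [str(v) for v in args.values()]
    return "_".join(parts).lower().replace(" ", "_")

def derived_submitter_id(submitter_id, old_node, new_node, args):
    # Replace the parent node prefix with the child node name and
    # append any extra qualifiers (e.g. the date).
    derived = submitter_id.replace(old_node, new_node, 1)
    for v in args.values():
        derived = f"{derived}_{v}"
    return derived.lower().replace(" ", "_")
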
Example #5
class NPI_PRO(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "NPI-PRO"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"

        self.summary_locations = []
        self.summary_clinicals = []

    def download_dataset(self, url):
        r = requests.get(url, allow_redirects=True)
        tf = tempfile.NamedTemporaryFile(suffix=".gdb.zip", delete=False)
        with open(tf.name, "wb") as npi_pro_geodatabase:
            npi_pro_geodatabase.write(r.content)

        return tf.name

    def files_to_submissions(self):
        print("Getting geodatabase for NPI-PRO dataset...")
        url = "https://www.arcgis.com/sharing/rest/content/items/7e80baf1773e4fd9b44fe9fb054677db/data"
        tf = self.download_dataset(url)
        self.parse_file(file_path=tf)

    def parse_file(self, file_path):
        try:
            gdf = gpd.read_file(file_path)
        except Exception as e:
            print(f"ERROR: unable to read geodatabase file {file_path}. Detail {e}")
            return

        print("Until better solution, submit only Illinois data")
        il_only = gdf.loc[gdf["Provider_Business_Practice_ST"] == "IL"]

        for i, row in il_only.iterrows():
            summary_location, summary_clinical = self.parse_row(row)
            self.summary_locations.append(summary_location)
            self.summary_clinicals.append(summary_clinical)

    def parse_row(self, row):
        fields_mapping = {
            "NPI": ("summary_location", "npi"),
            "Provider_First_Line_Business_Pra": (
                "summary_location",
                "first_line_address",
            ),
            "Provider_Second_Line_Business_Pr": (
                "summary_location",
                "second_line_address",
            ),
            "Provider_Business_Practice_City": ("summary_location", "city"),
            "Provider_Business_Practice_ST":
            ("summary_location", "province_state"),
            "TaxonomyCode": ("summary_clinical", "taxonomy_code"),
            "ProviderType": ("summary_clinical", "provider_type"),
            "ProviderSubtype": ("summary_clinical", "provider_subtype"),
            "DetailedSpecialty": ("summary_clinical", "detailed_specialty"),
        }

        npi = row["NPI"]
        state = row["Provider_Business_Practice_ST"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location", {"country": self.country, "state": state, "npi": npi}
        )

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id, "summary_location", "summary_clinical", {}
        )

        result = {
            "summary_location": {
                "submitter_id": summary_location_submitter_id,
                "projects": [{"code": self.project_code}],
            },
            "summary_clinical": {
                "submitter_id": summary_clinical_submitter_id,
                "summary_locations": [{"submitter_id": summary_location_submitter_id}],
            },
        }

        for original_field, mappings in fields_mapping.items():
            node, node_field = mappings
            if node_field == "npi":
                result[node][node_field] = str(row[original_field])
            else:
                result[node][node_field] = row[original_field]

        return result["summary_location"], result["summary_clinical"]

    def submit_metadata(self):
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
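
All of these ETL classes share the same two-phase contract inherited from base.BaseETL: files_to_submissions() collects records, then submit_metadata() pushes them through the MetadataHelper. A driver for Example #5 might look like this (the URL, token and bucket are placeholders, not the project's actual wiring):

# Hypothetical driver; the real project likely wires this up elsewhere.
etl = NPI_PRO(
    base_url="https://example-commons.org",  # placeholder
    access_token="<token>",                  # placeholder
    s3_bucket="<bucket>",                    # placeholder
)
etl.files_to_submissions()  # phase 1: parse sources into records
etl.submit_metadata()       # phase 2: batch-submit via Sheepdog
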
Example #6
class IDPH_ZIPCODE(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-zipcode"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.summary_locations = []
        self.summary_clinicals = []

    def files_to_submissions(self):
        """
        Reads JSON file and convert the data to Sheepdog records
        """

        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph(
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print(
                "Nothing to submit: today and latest submitted date are the same."
            )
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")
        url = "http://dph.illinois.gov/sitefiles/COVIDZip.json?nocache=1"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): date for latest submitted date
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime("%Y-%m-%d"):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            for zipcode_values in data["zip_values"]:
                summary_location, summary_clinical = self.parse_zipcode(date, zipcode_values)

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_zipcode(self, date, zipcode_values):
        """
        From ZIP-code-level data, generate the data we can submit via Sheepdog
        """
        zipcode = zipcode_values["zip"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "country": self.country,
                "state": self.state,
                "zipcode": zipcode
            },
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "country_region": self.country,
            "province_state": self.state,
            "zipcode": zipcode,
            "projects": [{"code": self.project_code}],
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": zipcode_values["confirmed_cases"],
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        if "demographics" in zipcode_values:
            demographic = zipcode_values["demographics"]

            for k, v in fields_mapping.items():
                field, mapping = v
                demographic_group = demographic[k]

                for item in demographic_group:
                    dst_field = mapping[item[field]]
                    if dst_field:
                        if "count" in item:
                            summary_clinical[f"{dst_field}_count"] = item["count"]
                        if "tested" in item:
                            summary_clinical[f"{dst_field}_tested"] = item["tested"]

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and `self.summary_clinicals` to Sheepdog.
        """
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
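
Example #6's parse_zipcode destructures a module-level fields_mapping that is not shown. From the way it is used, each entry maps a demographics group to a (key_field, {key_value: destination_prefix}) pair; a hypothetical fragment consistent with that usage:

# Hypothetical fragment illustrating the expected shape of `fields_mapping`.
# A destination prefix of None tells parse_zipcode to skip that entry.
fields_mapping = {
    "age": (
        "age_group",
        {"Unknown": None, "<20": "age_group_less_20", "20-29": "age_group_20_29"},
    ),
    "race": (
        "description",
        {"White": "race_white", "Asian": "race_asian"},
    ),
}
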
Example #7
class CHI_NBHD(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []

        self.program_name = "open"
        self.project_code = "CHI-NBHD"

        self.country = "US"
        self.state = "IL"

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads JSON file and convert the data to Sheepdog records
        """
        url = "https://covid19neighborhoods.southsideweekly.com/page-data/index/page-data.json"
        self.parse_file(url)

    def parse_file(self, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            data = data["result"]["data"]
            build_time_str = data["build_time"]["nodes"][0]["buildTime"]
            build_time = datetime.datetime.strptime(
                build_time_str, "%Y-%m-%dT%H:%M:%S.%fZ"
            )
            current_date = build_time.strftime("%Y-%m-%d")
            nbhd_stats = data["community_areas_all"]["nodes"][0]["childGeoJson"][
                "features"
            ]

            for nbhd_object in nbhd_stats:
                summary_location, summary_clinical = self.parse_nbhd(
                    nbhd_object, current_date
                )

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

                print(summary_location)
                print(summary_clinical)

    def parse_nbhd(self, nbhd_object, date):
        properties = nbhd_object["properties"]
        nbhd = properties["community"]
        deaths = properties["value"]
        population = properties["population"]

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "nbhd": nbhd},
        )

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "community_area": nbhd,
            "projects": [{"code": self.project_code}],
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "deaths_per_10000": round(10000 * deaths / population, 2),
            "deaths": deaths,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
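
Note that parse_nbhd divides by population when computing deaths_per_10000, so a neighborhood reported with population 0 would raise ZeroDivisionError. A defensive variant (an assumption, not the project's actual behavior) could guard it:

# Hypothetical guard for zero-population records (not the project's behavior):
deaths_per_10000 = round(10000 * deaths / population, 2) if population else None
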
Example #8
class NCBI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "ncbi-covid-19"
        self.manifest_bucket = "sra-pub-sars-cov2"
        self.sra_src_manifest = "sra-src/Manifest"
        self.accession_number_filename_map = {}

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.file_helper = AsyncFileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.data_file = NCBI_FILE(
            base_url=self.base_url,
            s3_bucket=self.project_code,
            access_token=access_token,
        )

        self.submitting_data = {
            "sample": [],
            "virus_sequence": [],
            "core_metadata_collection": [],
            "virus_sequence_run_taxonomy": [],
            "virus_sequence_contig": [],
            "virus_sequence_blastn": [],
            "virus_sequence_contig_taxonomy": [],
            "virus_sequence_peptide": [],
            "virus_sequence_hmm_search": [],
        }

        self.submitting_data["core_metadata_collection"].append({
            "submitter_id":
            format_submitter_id("cmc_ncbi_covid19", {}),
            "projects": [{
                "code": self.project_code
            }],
        })

        read_ncbi_manifest(
            self.manifest_bucket,
            self.sra_src_manifest,
            self.accession_number_filename_map,
        )

    def submit_metadata(self):

        start = time.strftime("%X")
        loop = asyncio.get_event_loop()
        tasks = []

        for node_name in self.data_file.nodes:
            if node_name == "virus_sequence_run_taxonomy":
                continue
            tasks.append(
                asyncio.ensure_future(self.files_to_node_submissions(node_name))
            )

        try:
            results = loop.run_until_complete(asyncio.gather(*tasks))
            loop.run_until_complete(
                asyncio.gather(
                    self.files_to_virus_sequence_run_taxonomy_submission(
                        results[0])))
            if AsyncFileHelper.session:
                loop.run_until_complete(
                    asyncio.gather(AsyncFileHelper.close_session()))
        finally:
            loop.close()
        end = time.strftime("%X")

        for k, v in self.submitting_data.items():
            print(f"Submitting {k} data...")
            for node in v:
                node_record = {"type": k}
                node_record.update(node)
                self.metadata_helper.add_record_to_submit(node_record)
            self.metadata_helper.batch_submit_records()

        print(f"Running time: From {start} to {end}")

    async def files_to_virus_sequence_run_taxonomy_submission(
            self, submitting_accession_numbers):
        """get submitting data for virus_sequence_run_taxonomy node"""

        if not submitting_accession_numbers:
            return

        records = self._get_response_from_big_query(
            submitting_accession_numbers)

        # Keep track of accession numbers that link to virus_sequence nodes
        accession_number_set = set()
        for record in records:
            if record["acc"] in self.accession_number_filename_map:
                accession_number = record["acc"]
                print(f"Get from bigquery response {accession_number}")
                success = await self._parse_big_query_response(record)
                if success:
                    accession_number_set.add(accession_number)

        cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {})
        for accession_number in submitting_accession_numbers:
            virus_sequence_run_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_run_taxonomy", {"accession_number": accession_number}
            )
            submitted_json = {
                "submitter_id": virus_sequence_run_taxonomy_submitter_id,
                "core_metadata_collections": [{"submitter_id": cmc_submitter_id}],
                "accession_number": accession_number,
                "data_type": "Virus Sequence Run Taxonomy Analysis",
                "data_format": "json",
                "data_category": "Kmer-based Taxonomy Analysis",
            }

            # Add link to virus sequence node
            if accession_number in accession_number_set:
                submitted_json["virus_sequences"] = [
                    {"submitter_id": f"virus_sequence_{accession_number}"}
                ]

            filename = f"virus_sequence_run_taxonomy_{accession_number}.csv"
            print(f"Get indexd info of {filename}")
            trying = True
            while trying:
                try:
                    (
                        did,
                        rev,
                        md5sum,
                        filesize,
                        file_name,
                        authz,
                    ) = await self.file_helper.async_find_by_name(
                        filename=filename)
                    trying = False
                except Exception as e:
                    print(
                        f"Cannot get indexd record of {filename}. Detail {e}. Retrying..."
                    )
                    # sleep to avoid a tight retry loop (matches the other retry loops)
                    await asyncio.sleep(5)

            assert (
                did
            ), f"file {filename} does not exist in the index, rerun NCBI_FILE ETL"

            if not authz:
                tries = 0
                while tries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_update_authz(did=did, rev=rev)
                        break
                    except Exception as e:
                        tries += 1
                        print(
                            f"Cannot update indexd for {did}. Detail {e}. Retrying..."
                        )
                        # sleep to avoid a tight retry loop (matches the other retry loops)
                        await asyncio.sleep(5)

            submitted_json["file_size"] = filesize
            submitted_json["md5sum"] = md5sum
            submitted_json["object_id"] = did
            submitted_json["file_name"] = file_name

            self.submitting_data["virus_sequence_run_taxonomy"].append(
                submitted_json)

    async def files_to_node_submissions(self, node_name):
        """Get submitting data for the node"""

        retrying = True
        while retrying:
            try:
                submitting_accession_numbers = (
                    await self.get_submitting_accession_number_list(node_name))
                retrying = False
            except Exception as e:
                print(
                    f"Cannot query Peregrine with {node_name}. Detail {e}. Retrying ..."
                )
                # sleep to avoid a tight retry loop (matches the other retry loops)
                await asyncio.sleep(5)

        for accession_number in submitting_accession_numbers:
            submitter_id = format_submitter_id(
                node_name, {"accession_number": accession_number}
            )
            cmc_submitter_id = format_submitter_id("cmc_ncbi_covid19", {})
            contig_submitter_id = format_submitter_id(
                "virus_sequence_contig", {"accession_number": accession_number}
            )
            peptide_submitter_id = format_submitter_id(
                "virus_sequence_peptide", {"accession_number": accession_number}
            )
            run_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_run_taxonomy", {"accession_number": accession_number}
            )
            contig_taxonomy_submitter_id = format_submitter_id(
                "virus_sequence_contig_taxonomy", {"accession_number": accession_number}
            )

            if node_name == "virus_sequence_contig":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequences_run_taxonomies": [{
                        "submitter_id":
                        run_taxonomy_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence Contig",
                    "data_format":
                    "json",
                    "data_category":
                    "Nucleotide Contig",
                }
            elif node_name == "virus_sequence_blastn":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence Blastn",
                    "data_format":
                    "tsv",
                    "data_category":
                    "Nucleotide Blast",
                }
            elif node_name == "virus_sequence_peptide":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Peptides Annotation Using VIGOR3",
                    "data_format":
                    "json",
                    "data_category":
                    "Peptides Annotation",
                }
            elif node_name == "virus_sequence_hmm_search":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_peptides": [{
                        "submitter_id":
                        peptide_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Virus Sequence HMM Search",
                    "data_format":
                    "json",
                    "data_category":
                    "HMMER Scab of Contigs",
                }
            elif node_name == "virus_sequence_contig_taxonomy":
                submitted_json = {
                    "submitter_id":
                    submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id":
                        cmc_submitter_id
                    }],
                    "virus_sequence_contigs": [{
                        "submitter_id":
                        contig_submitter_id
                    }],
                    "accession_number":
                    accession_number,
                    "data_type":
                    "Contig Taxonomy",
                    "data_format":
                    "json",
                    "data_category":
                    "Kmer-based Taxonomy Analysis of Contigs",
                }

            else:
                raise Exception(f"ERROR: {node_name} does not exist")

            ext = re.search("\.(.*)$",
                            self.data_file.nodes[node_name][0]).group(1)
            filename = f"{node_name}_{accession_number}.{ext}"

            print(f"Get indexd record of {filename}")

            retrying = True
            while retrying:
                try:
                    (
                        did,
                        rev,
                        md5sum,
                        filesize,
                        file_name,
                        authz,
                    ) = await self.file_helper.async_find_by_name(
                        filename=filename)
                    retrying = False
                except Exception as e:
                    print(
                        f"ERROR: Fail to query indexd for {filename}. Detail {e}. Retrying ..."
                    )
                    await asyncio.sleep(5)

            assert (
                did
            ), f"file {filename} does not exist in the index, rerun NCBI_FILE ETL"

            if not authz:
                tries = 0
                while tries < MAX_RETRIES:
                    try:
                        await self.file_helper.async_update_authz(did=did,
                                                                  rev=rev)
                        break
                    except Exception as e:
                        tries += 1
                        print(
                            f"ERROR: Fail to update indexd for {filename}. Detail {e}. Retrying ..."
                        )
                        await asyncio.sleep(5)

            submitted_json["file_size"] = filesize
            submitted_json["md5sum"] = md5sum
            submitted_json["object_id"] = did
            submitted_json["file_name"] = file_name

            self.submitting_data[node_name].append(submitted_json)
        return submitting_accession_numbers

    async def get_submitting_accession_number_list_for_run_taxonomy(self):
        """get submitting number list for run_taxonomy file"""

        node_name = "virus_sequence_run_taxonomy"
        submitting_accession_numbers = set()
        existed_accession_numbers = await self.data_file.get_existed_accession_numbers(
            node_name)

        s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
        s3_object = s3.Object(self.data_file.bucket,
                              self.data_file.nodes[node_name][0])
        file_path = f"{DATA_PATH}/virus_sequence_run_taxonomy.gz"
        s3_object.download_file(file_path)

        n_lines = 0
        with gzip.open(file_path, "rb") as f:
            while True:
                bline = f.readline()
                if not bline:
                    break
                n_lines += 1
                if n_lines % 10000 == 0:
                    print(f"Finish process {n_lines} of file {node_name}")
                line = bline.decode("UTF-8")
                r1 = re.findall("[SDE]RR\d+", line)
                if len(r1) == 0:
                    continue
                read_accession_number = r1[0]
                if (f"{node_name}_{read_accession_number}"
                        not in existed_accession_numbers):
                    submitting_accession_numbers.add(read_accession_number)
        return list(submitting_accession_numbers)

    async def get_submitting_accession_number_list(self, node_name):
        """get submitting acession number list"""

        submitting_accession_numbers = set()
        existed_accession_numbers = await self.data_file.get_existed_accession_numbers(
            node_name)

        s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
        s3_object = s3.Object(self.data_file.bucket,
                              self.data_file.nodes[node_name][0])
        line_stream = codecs.getreader("utf-8")
        n_lines = 0
        for line in line_stream(s3_object.get()["Body"]):
            r1 = re.findall(r"[SDE]RR\d+", line)
            n_lines += 1
            if n_lines % 10000 == 0:
                print(f"Finish process {n_lines} of file {node_name}")
            if len(r1) == 0:
                continue
            read_accession_number = r1[0]
            if (f"{node_name}_{read_accession_number}".lower()
                    not in existed_accession_numbers):
                submitting_accession_numbers.add(read_accession_number)

        return list(submitting_accession_numbers)

    def _get_response_from_big_query(self, accession_numbers):
        """
        Get data from BigQuery. The format of the response JSON is
        described below:
        [{
            "acc": "DRR220591",
            "assay_type": "RNA-Seq",
            "center_name": "KUMAMOTO",
            "consent": "public",
            "experiment": "DRX210904",
            "sample_name": "SAMD00217265",
            "instrument": "Illumina NovaSeq 6000",
            "librarylayout": "PAIRED",
            "libraryselection": "RANDOM",
            "librarysource": "TRANSCRIPTOMIC",
            "platform": "ILLUMINA",
            "sample_acc": "DRS139760",
            "biosample": "SAMD00217265",
            "organism": "Mus musculus",
            "sra_study": "DRP006149",
            #'releasedate': datetime.datetime(2020, 6, 4, 0, 0, tzinfo=<UTC>),
            "bioproject": "PRJDB9618",
            "mbytes": 2160,
            "loaddate": None,
            "avgspotlen": 300,
            "mbases": 6395,
            "insertsize": None,
            "library_name": None,
            "biosamplemodel_sam": [],
            "collection_date_sam": [],
            "geo_loc_name_country_calc": None,
            "geo_loc_name_country_continent_calc": None,
            "geo_loc_name_sam": [],
            "ena_first_public_run": [],
            "ena_last_update_run": [],
            "sample_name_sam": ["WT3_plus"],
            "datastore_filetype": ["sra"],
            "datastore_provider": ["gs", "ncbi", "s3"],
            "datastore_region": ["gs.US", "ncbi.public", "s3.us-east-1"],
        }]
        """

        assert accession_numbers != [], "accession_numbers must not be empty"

        start = 0
        offset = 100
        client = bigquery.Client()
        while start < len(accession_numbers):
            end = min(start + offset, len(accession_numbers))
            stm = 'SELECT * FROM `nih-sra-datastore`.sra.metadata where consent = "public"'

            stm = stm + f' and (acc = "{accession_numbers[start]}"'
            for accession_number in accession_numbers[start + 1:end]:
                stm = stm + f' or acc = "{accession_number}"'
            stm = stm + ")"

            query_job = client.query(stm)

            results = query_job.result()  # Waits for job to complete.

            for row in results:
                yield dict(row)
            start = end

    async def _parse_big_query_response(self, response):
        """
        Parse the big query response and get indexd record

        Return True if success

        """

        accession_number = response["acc"]

        sample = {}
        virus_sequence = {}

        sample["submitter_id"] = f"sample_{accession_number}"
        sample["projects"] = [{"code": self.project_code}]

        for field in [
                "ncbi_bioproject",
                "ncbi_biosample",
                "sample_accession",
                "host_associated_environmental_package_sam",
                "organism",
                "collection_date",
                "country_region",
                "continent",
        ]:
            if field in SPECIAL_MAP_FIELDS:
                old_name, dtype, handler = SPECIAL_MAP_FIELDS[field]
                sample[field] = handler(response.get(old_name))
            elif field in response:
                sample[field] = str(response.get(field))

        virus_sequence["submitter_id"] = f"virus_sequence_{accession_number}"
        for field in [
                "assay_type",
                "avgspotlen",
                "bytes",
                "center_name",
                "consent",
                "datastore_provider",
                "datastore_region",
                "description_sam",
                "ena_checklist_sam",
                "ena_first_public_run",
                "ena_last_update_run",
                "experiment",
                "insdc_center_name_sam",
                "insdc_first_public_sam",
                "insdc_center_alias_sam",
                "insdc_last_update_sam",
                "investigation_type_sam",
                "insdc_status_sam",
                "instrument",
                "library_name",
                "libraryselection",
                "librarysource",
                "mbases",
                "mbytes",
                "platform",
                "sra_accession_sam",
                "sra_study",
                "title_sam",
                "release_date",
                "data_format",
                "librarylayout",
        ]:
            if field in SPECIAL_MAP_FIELDS:
                old_name, dtype, handler = SPECIAL_MAP_FIELDS[field]
                virus_sequence[field] = handler(response.get(old_name))
            elif field in response:
                virus_sequence[field] = str(response.get(field))

        virus_sequence["samples"] = [{"submitter_id": sample["submitter_id"]}]
        virus_sequence["data_category"] = "Nucleotide"
        virus_sequence["data_type"] = "Sequence"
        virus_sequence["file_name"] = self.accession_number_filename_map[
            accession_number]

        virus_sequence["data_format"] = get_file_extension(
            virus_sequence["file_name"])
        filename = virus_sequence["file_name"]

        retrying = True
        while retrying:
            try:
                (
                    did,
                    rev,
                    md5sum,
                    filesize,
                    file_name,
                    authz,
                ) = await self.file_helper.async_find_by_name(filename=filename)
                retrying = False
            except Exception as e:
                print(
                    f"ERROR: Fail to get indexd for {filename}. Detail {e}. Retrying ..."
                )
                await asyncio.sleep(5)

        if not did:
            print(
                f"file {filename} does not exist in the index, rerun NCBI_MANIFEST ETL"
            )
            return False

        if not authz:
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    await self.file_helper.async_update_authz(did=did, rev=rev)
                    break
                except Exception as e:
                    print(
                        f"ERROR: Fail to update indexd for {filename}. Detail {e}. Retrying ..."
                    )
                    retries += 1
                    await asyncio.sleep(5)

        virus_sequence["file_size"] = filesize
        virus_sequence["md5sum"] = md5sum
        virus_sequence["object_id"] = did

        self.submitting_data["virus_sequence"].append(virus_sequence)
        self.submitting_data["sample"].append(sample)
        return True
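
Example #8's _parse_big_query_response consumes a module-level SPECIAL_MAP_FIELDS whose entries unpack as (old_name, dtype, handler). A hypothetical fragment consistent with that usage (the concrete fields and handlers are assumptions for illustration):

# Hypothetical fragment of SPECIAL_MAP_FIELDS: maps a Sheepdog field to the
# BigQuery column it comes from, its type, and a conversion handler.
SPECIAL_MAP_FIELDS = {
    "ncbi_bioproject": ("bioproject", str, lambda v: str(v) if v is not None else None),
    "ncbi_biosample": ("biosample", str, lambda v: str(v) if v is not None else None),
    "collection_date": ("collection_date_sam", list, lambda v: v[0] if v else None),
    "release_date": ("releasedate", str, lambda v: v.isoformat() if v else None),
}
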
Example #9
class IDPH(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.county_dict = {}
        self.il_counties()

        self.summary_locations = []
        self.summary_clinicals = []

    def get_location_and_clinical_submitter_id(self, county, date):
        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {"country": self.country, "state": self.state, "county": county}
            if county is not None
            else {"country": self.country, "state": self.state},
        )
        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )
        return summary_location_submitter_id, summary_clinical_submitter_id

    def il_counties(self):
        with open(
            os.path.join(CURRENT_DIR, "data/IL_counties_central_coords_lat_long.tsv")
        ) as f:
            counties = f.readlines()
            counties = counties[1:]
            counties = map(lambda l: l.strip().split("\t"), counties)

        for county, lat, lon in counties:
            self.county_dict[county] = {"lat": lat, "lon": lon}

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records.
        """
        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

        today_str = today.strftime("%Y%m%d")
        print(f"Getting data for date: {today_str}")

        # IDPH changed the URL on April 1, 2020; since `today` is always past
        # that date now, only the first branch is taken
        if today > datetime.date(2020, 3, 31):
            url = "http://www.dph.illinois.gov/sitefiles/COVIDTestResults.json"
        else:
            url = f"https://www.dph.illinois.gov/sites/default/files/COVID19/COVID19CountyResults{today_str}.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the most recent date for which data was submitted
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                "%Y-%m-%d"
            ):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            for county in data["characteristics_by_county"]["values"]:
                demographic = data.get("demographics", None)
                summary_location, summary_clinical = self.parse_county(
                    date, county, demographic
                )

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

            for illinois_data in data["state_testing_results"]["values"]:
                illinois_historic_data = self.parse_historical_data(illinois_data)
                self.summary_clinicals.append(illinois_historic_data)

    def parse_historical_data(self, illinois_data):
        """
        Parses historical state-level data. The corresponding "summary_location"
        node is created separately, from the "characteristics_by_county" data.

        Args:
            illinois_data (dict): data JSON with "testDate", "total_tested",
                "confirmed_cases" and "deaths"

        Returns:
            dict: "summary_clinical" node for Sheepdog
        """
        county = "Illinois"

        date = datetime.datetime.strptime(
            illinois_data["testDate"], "%m/%d/%Y"
        ).strftime("%Y-%m-%d")

        (
            summary_location_submitter_id,
            summary_clinical_submitter_id,
        ) = self.get_location_and_clinical_submitter_id(county, date)

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": illinois_data["confirmed_cases"],
            "testing": illinois_data["total_tested"],
            "deaths": illinois_data["deaths"],
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        return summary_clinical

    def parse_county(self, date, county_json, demographic):
        """
        From county-level data, generate the data we can submit via Sheepdog

        Args:
            date (date): date
            county_json (dict): JSON for county statistics
            demographic (dict): JSON for state-level demographic statistics

        Returns:
            (dict, dict): "summary_location" and "summary_clinical" records
        """
        county = county_json["County"]

        (
            summary_location_submitter_id,
            summary_clinical_submitter_id,
        ) = self.get_location_and_clinical_submitter_id(county, date)

        summary_location = {
            "submitter_id": summary_location_submitter_id,
            "country_region": self.country,
            "province_state": self.state,
            "projects": [{"code": self.project_code}],
        }

        # the IDPH data uses "Illinois" in the "County" field for aggregated data;
        # in Gen3 this corresponds to a location with "province_state" == "IL" and no "county" field
        if county != "Illinois":
            summary_location["county"] = county

        if county in self.county_dict:
            summary_location["latitude"] = self.county_dict[county]["lat"]
            summary_location["longitude"] = self.county_dict[county]["lon"]
        else:
            if county_json["lat"] != 0:
                summary_location["latitude"] = str(county_json["lat"])
            if county_json["lon"] != 0:
                summary_location["longitude"] = str(county_json["lon"])

        summary_clinical = {
            "submitter_id": summary_clinical_submitter_id,
            "date": date,
            "confirmed": county_json["confirmed_cases"],
            "testing": county_json["total_tested"],
            "deaths": county_json["deaths"],
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        if "negative" in county_json:
            summary_clinical["negative"] = county_json["negative"]

        if county == "Illinois" and demographic:
            for k, v in fields_mapping.items():
                field, mapping = v
                demographic_group = demographic[k]

                for item in demographic_group:
                    dst_field = mapping[item[field]]
                    if dst_field:
                        if "count" in item:
                            summary_clinical["{}_{}".format(dst_field, "count")] = item["count"]
                        if "tested" in item:
                            summary_clinical["{}_{}".format(dst_field, "tested")] = item["tested"]
        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Submits the data in `self.summary_locations` and `self.summary_clinicals` to Sheepdog.
        """
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations:
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
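
The IDPH ETL builds its submitter IDs with format_submitter_id and derived_submitter_id, which are defined elsewhere in the repository. The sketch below only illustrates the intended composition; the exact formatting rules are an assumption:

def format_submitter_id(node, keys):
    parts = [node] + ["{}_{}".format(k, v) for k, v in keys.items()]
    return "_".join(parts).lower().replace(" ", "_")

def derived_submitter_id(submitter_id, original_node, derived_node, keys):
    derived = submitter_id.replace(original_node, derived_node)
    for k, v in keys.items():
        derived += "_{}_{}".format(k, v)
    return derived

# Under these assumptions, a Cook County clinical record for 2021-03-01 would
# come out as "summary_clinical_country_us_state_il_county_cook_date_2021-03-01".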
Example #10
class COXRAY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "COXRAY"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.file_helper = FileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.nodes = {
            "core_metadata_collection": [],
            "study": [],
            "subject": [],
            "observation": [],
            "follow_up": [],
            "demographic": [],
            "imaging_file": [],
        }

    def files_to_submissions(self):
        with open(Path(COXRAY_DATA_PATH).joinpath("metadata.csv")) as f:
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)
            for row in reader:
                row_nodes = self.parse_row(headers, row)
                for k, v in row_nodes.items():
                    self.nodes[k].append(v)

    def parse_row(self, headers, row):
        cmc_submitter_id = format_submitter_id("cmc_coxray", {})
        subject_submitter_id = format_submitter_id(
            "subject_coxray", {"patientid": row[headers.index("patientid")]})
        observation_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "observation_coxray", {})
        follow_up_submitter_id = derived_submitter_id(
            subject_submitter_id,
            "subject_coxray",
            "follow_up_coxray",
            {"offset": row[headers.index("offset")]},
        )
        demographic_submitter_id = derived_submitter_id(
            subject_submitter_id, "subject_coxray", "demographic_coxray", {})
        imaging_file_submitter_id = format_submitter_id(
            "imaging_file_coxray",
            {"filename": row[headers.index("filename")]})
        study_submitter_id = format_submitter_id(
            "study_coxray", {"doi": row[headers.index("doi")]})

        filename = row[headers.index("filename")]
        filename = Path(filename)
        filepath = Path(COXRAY_DATA_PATH).joinpath("images", filename)
        filepath_exist = filepath.exists()

        nodes = {
            "core_metadata_collection": {
                "submitter_id": cmc_submitter_id,
                "projects": [{
                    "code": self.project_code
                }],
            },
            "study": {
                "submitter_id": study_submitter_id,
                "projects": [{
                    "code": self.project_code
                }],
            },
            "subject": {
                "submitter_id": subject_submitter_id,
                "projects": [{
                    "code": self.project_code
                }],
                "studies": [{
                    "submitter_id": study_submitter_id
                }],
            },
            "observation": {
                "submitter_id": observation_submitter_id,
                "subjects": [{
                    "submitter_id": subject_submitter_id
                }],
            },
            "follow_up": {
                "submitter_id": follow_up_submitter_id,
                "subjects": [{
                    "submitter_id": subject_submitter_id
                }],
            },
            "demographic": {
                "submitter_id": demographic_submitter_id,
                "subjects": [{
                    "submitter_id": subject_submitter_id
                }],
            },
        }

        if filepath_exist:
            data_type = "".join(filename.suffixes)
            did, rev, md5sum, filesize = self.file_helper.find_by_name(
                filename=filename)
            assert (
                did
            ), f"file {filename} does not exist in the index, rerun COXRAY_FILE ETL"
            self.file_helper.update_authz(did=did, rev=rev)

            nodes["imaging_file"] = {
                "submitter_id": imaging_file_submitter_id,
                "subjects": [{
                    "submitter_id": subject_submitter_id
                }],
                "follow_ups": [{
                    "submitter_id": follow_up_submitter_id
                }],
                "core_metadata_collections": [{
                    "submitter_id": cmc_submitter_id
                }],
                "data_type": data_type,
                "data_format": "Image File",
                "data_category": "X-Ray Image",
                "file_size": filesize,
                "md5sum": md5sum,
                "object_id": did,
            }
        else:
            print(
                f"subject references a file that does not exist: {filepath}"
            )

        for k, (node, field, converter) in fields_mapping.items():
            value = row[headers.index(k)]
            if node in nodes and value:
                if converter:
                    nodes[node][field] = converter(value)
                else:
                    nodes[node][field] = value

        return nodes

    def submit_metadata(self):
        print("Submitting data...")

        for k, v in self.nodes.items():
            seen_submitter_ids = set()
            print(f"Submitting {k} data...")
            for node in v:
                node_record = {"type": k}
                node_record.update(node)
                submitter_id = node_record["submitter_id"]
                if submitter_id not in seen_submitter_ids:
                    seen_submitter_ids.add(submitter_id)
                    self.metadata_helper.add_record_to_submit(node_record)
            self.metadata_helper.batch_submit_records()
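
parse_row above consumes a module-level fields_mapping that is not shown in this listing: each CSV header maps to a (node, field, converter) triple, and a falsy converter keeps the raw string. Purely illustrative entries (the header names are assumptions) could look like:

fields_mapping = {
    "age": ("demographic", "age", int),       # convert the CSV string to int
    "sex": ("demographic", "gender", None),   # None converter: keep the raw value
    "finding": ("observation", "finding", None),
}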
Example #11
class OWID(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []

        self.program_name = "open"
        self.project_code = "OWID"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        testing_fields = [
            ("ISO code", ("summary_location", "iso3", str)),
            ("Entity", (None, None, split_entity)),
            ("Date", ("summary_clinical", "date", str)),
            ("Source URL", ("summary_clinical", "source_url", str)),
            ("Source label", ("summary_clinical", "source_label", str)),
            ("Notes", ("summary_clinical", "notes", str)),
            ("Number of observations", ("summary_clinical", "num_observations",
                                        int)),
            ("Cumulative total", ("summary_clinical", "testing", int)),
            (
                "Cumulative total per thousand",
                ("summary_clinical", "cumulative_total_per_thousand", int),
            ),
            (
                "Daily change in cumulative total",
                ("summary_clinical", "daily_change_in_cumulative_total", int),
            ),
            (
                "Daily change in cumulative total per thousand",
                (
                    "summary_clinical",
                    "daily_change_in_cumulative_total_per_thousand",
                    int,
                ),
            ),
            (
                "7-day smoothed daily change",
                ("summary_clinical", "seven_day_smoothed_daily_change", int),
            ),
            (
                "7-day smoothed daily change per thousand",
                (
                    "summary_clinical",
                    "seven_day_smoothed_daily_change_per_thousand",
                    float,
                ),
            ),
            ("Short-term positive rate", (None, None, None)),
            ("Short-term tests per case", (None, None, None)),
            ("General source label", ("summary_clinical",
                                      "general_source_label", str)),
            ("General source URL", ("summary_clinical", "general_source_url",
                                    str)),
            ("Short description", ("summary_clinical", "short_description",
                                   str)),
            ("Detailed description", ("summary_clinical",
                                      "detailed_description", str)),
        ]

        self.headers_mapping = {
            field: (k, mapping)
            for k, (field, mapping) in enumerate(testing_fields)
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-latest-data-source-details.csv"
        self.parse_file(url)

    def parse_file(self, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "  Unable to get file contents, received {}.".format(headers)

            expected_h = list(self.headers_mapping.keys())
            obtained_h = headers[:len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h)

            for row in reader:
                summary_location, summary_clinical = self.parse_row(
                    row, self.headers_mapping)

                if summary_location not in self.summary_locations:
                    self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)

    def parse_row(self, row, mapping):
        summary_location = {}
        summary_clinical = {}

        for k, (i, (node_type, node_field, type_conv)) in mapping.items():
            if k == "Entity":
                country, test_type = split_entity(row[i])
                summary_location["country_region"] = country
                summary_clinical["test_type"] = test_type
            if node_field:
                value = row[i]
                if value:
                    if node_type == "summary_location":
                        summary_location[node_field] = type_conv(value)
                    if node_type == "summary_clinical":
                        if type_conv == int:
                            summary_clinical[node_field] = type_conv(
                                float(value))
                        else:
                            summary_clinical[node_field] = type_conv(value)

        summary_location_submitter_id = format_location_submitter_id(
            summary_location)

        summary_location["submitter_id"] = summary_location_submitter_id
        summary_location["projects"] = [{"code": self.project_code}]

        summary_clinical["submitter_id"] = format_summary_clinical_submitter_id(
            summary_location_submitter_id,
            test_type=summary_clinical["test_type"],
            date=datetime.date.today().strftime("%Y-%m-%d"),
        )
        summary_clinical["summary_locations"] = [{
            "submitter_id": summary_location_submitter_id
        }]

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for rep in self.summary_clinicals:
            rep_record = {"type": "summary_clinical"}
            rep_record.update(rep)
            self.metadata_helper.add_record_to_submit(rep_record)
        self.metadata_helper.batch_submit_records()
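
parse_row relies on split_entity, defined elsewhere, to break OWID's combined "Entity" column into a country name and a test type. Assuming OWID's usual "<country> - <test type>" format, a sketch could be:

def split_entity(entity):
    country, _, test_type = entity.partition(" - ")
    return country, test_type

print(split_entity("United States - tests performed"))
# -> ('United States', 'tests performed')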
Example #12
class CTP(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.header_to_column = {}

        self.program_name = "open"
        self.project_code = "CTP"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.expected_file_headers = set([
            "date",
            "state",
            "positive",
            "negative",
            "pending",
            "totalTestResults",
            "hospitalizedCurrently",
            "hospitalizedCumulative",
            "inIcuCurrently",
            "inIcuCumulative",
            "onVentilatorCurrently",
            "onVentilatorCumulative",
            "recovered",
            "dataQualityGrade",
            "lastUpdateEt",
            "dateModified",
            "checkTimeEt",
            "death",
            "hospitalized",
            "dateChecked",
            "totalTestsViral",
            "positiveTestsViral",
            "negativeTestsViral",
            "positiveCasesViral",
            "deathConfirmed",
            "deathProbable",
            "totalTestEncountersViral",
            "totalTestsPeopleViral",
            "totalTestsAntibody",
            "positiveTestsAntibody",
            "negativeTestsAntibody",
            "totalTestsPeopleAntibody",
            "positiveTestsPeopleAntibody",
            "negativeTestsPeopleAntibody",
            "totalTestsPeopleAntigen",
            "positiveTestsPeopleAntigen",
            "totalTestsAntigen",
            "positiveTestsAntigen",
            "fips",
            "positiveIncrease",
            "negativeIncrease",
            "total",
            "totalTestResultsSource",
            "totalTestResultsIncrease",
            "posNeg",
            "deathIncrease",
            "hospitalizedIncrease",
            "hash",
            "commercialScore",
            "negativeRegularScore",
            "negativeScore",
            "positiveScore",
            "score",
            "grade",
        ])

        self.expected_race_headers = set([
            "Date",
            "State",
            "Cases_Total",
            "Cases_White",
            "Cases_Black",
            "Cases_Latinx",
            "Cases_Asian",
            "Cases_AIAN",
            "Cases_NHPI",
            "Cases_Multiracial",
            "Cases_Other",
            "Cases_Unknown",
            "Cases_Ethnicity_Hispanic",
            "Cases_Ethnicity_NonHispanic",
            "Cases_Ethnicity_Unknown",
            "Deaths_Total",
            "Deaths_White",
            "Deaths_Black",
            "Deaths_Latinx",
            "Deaths_Asian",
            "Deaths_AIAN",
            "Deaths_NHPI",
            "Deaths_Multiracial",
            "Deaths_Other",
            "Deaths_Unknown",
            "Deaths_Ethnicity_Hispanic",
            "Deaths_Ethnicity_NonHispanic",
            "Deaths_Ethnicity_Unknown",
        ])

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://api.covidtracking.com/v1/states/daily.csv"
        self.parse_file(url)

    def extract_races(self):
        """
        Extracts race information and stores it in a dictionary for
        fast lookup during the merging process.
        """
        url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vS8SzaERcKJOD_EzrtCDK1dX1zkoMochlA9iHoHg_RSw3V8bkpfk1mpw4pfL5RdtSOyx_oScsUtyXyk/pub?gid=43720681&single=true&output=csv"
        print("Getting data from {}".format(url))
        races = {}
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')
            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)
            assert len(headers) >= 3, "Unexpected headers: {}".format(headers)
            assert (headers[0], headers[1], headers[2]) == (
                "Date",
                "State",
                "Cases_Total",
            ), "The first 3 column names of the race data must be Dat, State, Cases_Total. Got: {}".format(
                headers)
            assert self.expected_race_headers.issubset(
                set(headers)
            ), "CSV headers have changed (expected {} is a subset of {}). We may need to update the ETL code".format(
                self.expected_race_headers, headers)

            for row in reader:
                if not row:
                    continue
                try:
                    races[(row[0], row[1], row[2])] = row[3:]
                except Exception as e:
                    print(
                        f"Error processing race row: {row}.\nSkipping row. Detail: {e}"
                    )
        return races, headers

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.
        Ignores any records that are already in Sheepdog (relies on unique
        `submitter_id` to check)

        Args:
            url (str): URL at which the CSV file is available
        """
        races, race_headers = self.extract_races()
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            assert self.expected_file_headers.issubset(
                set(headers)
            ), "CSV headers have changed (expected {} is a subset of {}). We may need to update the ETL code".format(
                self.expected_file_headers, headers)

            headers = headers + race_headers[3:]

            for i in range(0, len(headers)):
                self.header_to_column[headers[i]] = i

            summary_location_list = []

            for row in reader:
                if (row[0], row[1], row[2]) in races:
                    row.extend(races[(row[0], row[1], row[2])])
                else:
                    row.extend(
                        [""] * (len(self.expected_race_headers) - 3))

                summary_location, summary_clinical = self.parse_row(row)

                summary_location_submitter_id = summary_location["submitter_id"]
                if summary_location_submitter_id not in summary_location_list:
                    self.summary_locations.append(summary_location)
                    summary_location_list.append(summary_location_submitter_id)

                self.summary_clinicals.append(summary_clinical)

    def parse_row(self, row):
        """
        Converts a row of a CSV file to data we can submit via Sheepdog

        Args:
            row (list(str)): row of data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - clinical data for that location and date, in a format ready
                  to be submitted to Sheepdog
        """

        date = row[self.header_to_column["date"]]
        date = datetime.strptime(date, "%Y%m%d").date()
        date = date.strftime("%Y-%m-%d")

        country = "US"
        state = row[self.header_to_column["state"]]
        summary_location_submitter_id = format_location_submitter_id(
            country, state)

        summary_location = {
            "country_region": country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
            "province_state": state,
        }

        fips = row[self.header_to_column["fips"]]
        if fips:
            summary_location["FIPS"] = int(fips)

        summary_clinical_submitter_id = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date)
        summary_clinical = {
            "date": date,
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        map_csv_fields = {
            "confirmed": "positive",
            "negative": "negative",
            "pending": "pending",
            "hospitalizedCurrently": "hospitalizedCurrently",
            "hospitalizedCumulative": "hospitalizedCumulative",
            "inIcuCurrently": "inIcuCurrently",
            "inIcuCumulative": "inIcuCumulative",
            "onVentilatorCurrently": "onVentilatorCurrently",
            "recovered": "recovered",
            "totalTestsViral": "totalTestsViral",
            "positiveTestsViral": "positiveTestsViral",
            "negativeTestsViral": "negativeTestsViral",
            "positiveCasesViral": "positiveCasesViral",
            "positiveIncrease": "positiveIncrease",
            "negativeIncrease": "negativeIncrease",
            "totalTestResultsIncrease": "totalTestResultsIncrease",
            "deathIncrease": "deathIncrease",
            "hospitalizedIncrease": "hospitalizedIncrease",
            "race_white_count": "Cases_White",
            "race_black_count": "Cases_Black",
            "race_hispanic_count": "Cases_Latinx",
            "race_asian_count": "Cases_Asian",
            "race_ai_an_count": "Cases_AIAN",
            "race_nh_pi_count": "Cases_NHPI",
            "race_multiracial_count": "Cases_Multiracial",
            "race_other_count": "Cases_Other",
            "race_left_blank_count": "Cases_Unknown",
            "ethnicity_hispanic_count": "Cases_Ethnicity_Hispanic",
            "ethnicity_nonhispanic_count": "Cases_Ethnicity_NonHispanic",
            "ethnicity_unknown_count": "Cases_Ethnicity_Unknown",
            "deaths": "Deaths_Total",
            "race_white_deaths": "Deaths_White",
            "race_black_deaths": "Deaths_Black",
            "race_hispanic_deaths": "Deaths_Latinx",
            "race_asian_deaths": "Deaths_Asian",
            "race_ai_an_deaths": "Deaths_AIAN",
            "race_nh_pi_deaths": "Deaths_NHPI",
            "race_multiracial_deaths": "Deaths_Multiracial",
            "race_other_deaths": "Deaths_Other",
            "race_left_blank_deaths": "Deaths_Unknown",
            "ethnicity_hispanic_deaths": "Deaths_Ethnicity_Hispanic",
            "ethnicity_nonhispanic_deaths": "Deaths_Ethnicity_NonHispanic",
            "ethnicity_unknown_deaths": "Deaths_Ethnicity_Unknown",
        }

        for k, v in map_csv_fields.items():
            value = row[self.header_to_column[v]]
            if value and value.lower() not in ["nan", "n/a"]:
                summary_clinical[k] = int(value.replace(",", ""))

        dataQualityGrade = row[self.header_to_column["dataQualityGrade"]]
        if dataQualityGrade:
            summary_clinical["dataQualityGrade"] = dataQualityGrade

        lastUpdateEt = row[self.header_to_column["lastUpdateEt"]]
        if lastUpdateEt:
            summary_clinical["lastUpdateEt"] = lastUpdateEt

        return summary_location, summary_clinical

    def submit_metadata(self):
        """
        Converts the data in `self.time_series_data` to Sheepdog records.
        `self.location_data already contains Sheepdog records. Batch submits
        all records in `self.location_data` and `self.time_series_data`
        """

        # Commented
        # Only required for one time submission of summary_location
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
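
The row-merging step in parse_file is easy to miss: daily rows and race rows are joined on their first three columns, and rows with no race data are padded with empty strings so the header-to-column indices stay aligned. A self-contained toy illustration of the same mechanics:

races = {("20210301", "IL", "1000"): ["700", "200"]}
race_width = 2  # number of race columns after the 3-column join key

row = ["20210301", "IL", "1000", "50"]
row.extend(races.get(tuple(row[:3]), [""] * race_width))
print(row)  # ['20210301', 'IL', '1000', '50', '700', '200']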
Example #13
class LOAD_VIRUS_METADATA(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        # Get all input strings from YAML
        script = path.splitext(path.basename(__file__))[0].strip("/")
        script = path.join(CURRENT_DIR, script + ".yaml")
        with open(script) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        self.verbose = config["verbose"]
        self.program_name = config["program_name"]
        self.project_code = config["project_code"]
        self.virus_genome_data_category = config["virus_genome_data_category"]
        self.virus_genome_data_type = config["virus_genome_data_type"]
        self.virus_genome_data_format = config["virus_genome_data_format"]
        self.virus_genome_source = config["virus_genome_source"]
        self.virus_genome_type = config["virus_genome_type"]
        self.virus_sequence_type = config["virus_sequence_type"]
        self.virus_sequence_data_type = config["virus_sequence_data_type"]
        self.virus_sequence_data_format = config["virus_sequence_data_format"]
        self.virus_sequence_alignment_type = config["virus_sequence_alignment_type"]
        self.virus_sequence_alignment_data_type = config[
            "virus_sequence_alignment_data_type"
        ]
        self.virus_sequence_alignment_data_format = config[
            "virus_sequence_alignment_data_format"
        ]
        self.virus_sequence_alignment_tool = config["virus_sequence_alignment_tool"]
        self.virus_sequence_hmm_type = config["virus_sequence_hmm_type"]
        self.virus_sequence_hmm_data_type = config["virus_sequence_hmm_data_type"]
        self.virus_sequence_hmm_data_format = config["virus_sequence_hmm_data_format"]
        self.virus_genomes = []
        self.virus_sequences = []
        self.virus_sequence_alignments = []
        self.virus_sequence_hmms = []

        self.metadata_helper = MetadataHelper(
            base_url=base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def checksum(self, filename):
        with open(filename, "rb") as f:
            data = f.read()  # reads the whole file into memory at once
        return hashlib.md5(data).hexdigest()

    def files_to_submissions(self):
        latest_submitted_date = (
            self.metadata_helper.get_latest_submitted_data_virus_genome()
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return

    def submit_metadata(self):
        latest_submitted_date = (
            self.metadata_helper.get_latest_submitted_data_virus_genome()
        )
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return
        self.read()
        self.write()

    def read(self):
        self.genomes = glob.glob("*.gb", recursive=False)
        self.seqs = glob.glob("*.fasta", recursive=False)
        self.alns = glob.glob("*.aln", recursive=False)
        self.hmms = glob.glob("*.hmm", recursive=False)

    def write(self):
        # Genomes
        for genome in self.genomes:
            virus_genome_submitter_id = genome.replace(".", "_")
            virus_genome = {
                "data_category": self.virus_genome_data_category,
                "data_type": self.virus_genome_data_type,
                "data_format": self.virus_genome_data_format,
                "source": self.virus_genome_source,
                "submitter_id": virus_genome_submitter_id,
                "file_name": genome,
                "md5sum": self.checksum(genome),
                "file_size": path.getsize(genome),
                "projects": [{"code": self.project_code}],
            }
            self.virus_genomes.append(virus_genome)

        if self.verbose:
            print("Submitting virus_genome data")
        for genome in self.virus_genomes:
            genome_record = {"type": self.virus_genome_type}
            genome_record.update(genome)
            self.metadata_helper.add_record_to_submit(genome_record)
        self.metadata_helper.batch_submit_records()

        # Sequences
        for seq in self.seqs:
            virus_sequence_id = seq.replace(".", "_")
            # Data Category: Protein or Nucleotide
            seqtype = "Protein" if "-aa.fasta" in seq else "Nucleotide"
            virus_sequence = {
                "data_category": seqtype,
                "data_type": self.virus_sequence_data_type,
                "data_format": self.virus_sequence_data_format,
                "submitter_id": virus_sequence_id,
                "file_name": seq,
                "md5sum": self.checksum(seq),
                "file_size": path.getsize(seq),
                "projects": [{"code": self.project_code}],
            }
            self.virus_sequences.append(virus_sequence)

        if self.verbose:
            print("Submitting virus_sequence data")
        for seq in self.virus_sequences:
            seq_record = {"type": self.virus_sequence_type}
            seq_record.update(seq)
            self.metadata_helper.add_record_to_submit(seq_record)
        self.metadata_helper.batch_submit_records()

        # Alignments
        for aln in self.alns:
            virus_sequence_alignment_id = aln.replace(".", "_")
            # Data Category: Protein or Nucleotide
            seqtype = "Protein" if "-aa.aln" in aln else "Nucleotide"
            virus_sequence_alignment = {
                "data_category": seqtype,
                "data_type": self.virus_sequence_alignment_data_type,
                "data_format": self.virus_sequence_alignment_data_format,
                "submitter_id": virus_sequence_alignment_id,
                "file_name": aln,
                "md5sum": self.checksum(aln),
                "file_size": path.getsize(aln),
                "projects": [{"code": self.project_code}],
                "alignment_tool": self.virus_sequence_alignment_tool,
            }
            self.virus_sequence_alignments.append(virus_sequence_alignment)

        if self.verbose:
            print("Submitting virus_sequence_alignment data")
        for aln in self.virus_sequence_alignments:
            aln_record = {"type": self.virus_sequence_alignment_type}
            aln_record.update(aln)
            self.metadata_helper.add_record_to_submit(aln_record)
        self.metadata_helper.batch_submit_records()

        # HMMs
        for hmm in self.hmms:
            virus_sequence_hmm_id = hmm.replace(".", "_")
            # Data Category: Protein or Nucleotide
            seqtype = "Protein" if "-aa.hmm" in hmm else "Nucleotide"
            virus_sequence_hmm = {
                "data_category": seqtype,
                "data_type": self.virus_sequence_hmm_data_type,
                "data_format": self.virus_sequence_hmm_data_format,
                "submitter_id": virus_sequence_hmm_id,
                "file_name": hmm,
                "md5sum": self.checksum(hmm),
                "file_size": path.getsize(hmm),
                "projects": [{"code": self.project_code}],
            }
            self.virus_sequence_hmms.append(virus_sequence_hmm)

        if self.verbose:
            print("Submitting virus_sequence_hmm data")
        for hmm in self.virus_sequence_hmms:
            hmm_record = {"type": self.virus_sequence_hmm_type}
            hmm_record.update(hmm)
            self.metadata_helper.add_record_to_submit(hmm_record)
        self.metadata_helper.batch_submit_records()
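
The checksum method above reads each file into memory in one call, which is fine for small files but wasteful for large genomes. A chunked variant (not part of the original ETL) computes the same digest with constant memory:

import hashlib

def checksum_chunked(filename, chunk_size=1024 * 1024):
    md5 = hashlib.md5()
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest()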
Example #14
class IDPH_VACCINE(IDPH):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Vaccine"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.country = "US"
        self.state = "IL"
        self.date = ""
        self.counties_inventory = {}

        self.summary_locations = {}
        self.summary_clinicals = {}
        self.summary_group_demographic = {}

    def parse_list_of_counties(self):
        """
        Store into `self.date` the date the data was last updated, and
        into `self.counties_inventory` the data in format:
            { <county name>: { <county properties> } }
        """
        response = requests.get(ROOT_URL, headers={"content-type": "json"})
        json_response = json.loads(response.text)
        self.date = idph_get_date(json_response.get("lastUpdatedDate"))
        print(f"Dataset's last updated date: {self.date}")
        root_json = json_response.get("VaccineAdministration")
        if root_json is None:
            return
        for item in root_json:
            county = item.get("CountyName")
            self.counties_inventory[county] = item

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """

        # latest_submitted_date = (
        #     self.metadata_helper.get_latest_submitted_date_idph()
        # )
        # if latest_submitted_date != None and latest_submitted_date == self.date:
        #     print(
        #         "Nothing to submit: data of latest submitted date and IDPH are the same."
        #     )
        #     return

        self.parse_link()

    def get_group_clinical_demographic_submitter_id(
            self, summary_clinical_submitter_id, key_dict):
        summary_group_demographic_submitter_id = derived_submitter_id(
            summary_clinical_submitter_id,
            "summary_clinical",
            "summary_group_demographic",
            key_dict,
        )
        return summary_group_demographic_submitter_id

    def map_race(self, value, prop_name):
        # despite its name, this maps Race, Gender and AgeGroup values to the
        # vocabulary used by the data dictionary
        race_mapping = {
            "Black or African-American": "Black",
            "Other race": "Other",
            "Native Hawaiian or Other Pacif": "Native Hawaiian or Other Pacific Islander",
            "American Indian or Alaska Nati": "American Indian or Alaska Native",
            "Hispanic or Latino": "Hispanic",
        }
        gender_mapping = {
            "Unknown": "Unknown or Left Blank",
        }
        age_group_mapping = {"65+": "greater than 65"}
        if prop_name == "Race" and value in race_mapping:
            return race_mapping.get(value)
        if prop_name == "Gender" and value in gender_mapping:
            return gender_mapping.get(value)
        if prop_name == "AgeGroup" and value in age_group_mapping:
            return age_group_mapping.get(value)
        return value

    def parse_group_clinical_demographic(self, props_mapping, props_value):
        key_props_name = ["AgeGroup", "Race", "Gender"]
        key_props = {}
        for k in key_props_name:
            key = props_mapping.get(k)
            if key is not None:
                key_props[key] = props_value.get(k)
        props_data = {}
        for (k, v) in props_mapping.items():
            if k in props_value:
                if k in key_props_name:
                    value = props_value.get(k)
                    if k == "AgeGroup":
                        value = value.replace("-", " to ")
                    props_data[v] = self.map_race(value, k)
                else:
                    props_data[v] = props_value.get(k)
        return key_props, props_data

    def parse_link(self):
        """
        Converts the source data to data we can submit via Sheepdog. Stores
        the records to submit in `self.summary_locations`,
        `self.summary_clinicals` and `self.summary_group_demographic`.
        """
        illinois_summary_clinical_submitter_id = self.parse_county_data()
        self.parse_total_state_wide(illinois_summary_clinical_submitter_id)

    def parse_county_data(self):
        """
        For each county, converts the raw data into Sheepdog submissions by
        mapping properties to match the PRC data dictionary.
        Return the `submitter_id` for the state-wide `summary_clinical` record.
        """
        county_vaccine_mapping = {
            "AdministeredCount": "vaccine_administered_count",
            "AdministeredCountChange": "vaccine_administered_count_change",
            "AdministeredCountRollAvg": "vaccine_administered_count_roll_avg",
            "PersonsFullyVaccinated": "vaccine_persons_fully_vaccinated",
            "Report_Date": "date",
            "PctVaccinatedPopulation": "vaccine_persons_fully_vaccinated_pct",
        }

        county_demo_mapping = {
            "AgeGroup": "age_group",
            "Race": "race",
            "Gender": "gender",
            "AdministeredCount": "vaccine_administered_count",
            "PersonsFullyVaccinated": "vaccine_persons_fully_vaccinated",
        }

        inventory_reported = {
            "LHDReportedInventory": "vaccine_LHDR_reported_inventory",
            "CommunityReportedInventory": "vaccine_community_reported_inventory",
            "TotalReportedInventory": "vaccine_reported_inventory",
            "InventoryReportDate": "date",
        }

        self.parse_list_of_counties()
        illinois_summary_clinical_submitter_id = ""
        for county in self.counties_inventory:
            county_covid_response = requests.get(
                COUNTY_COVID_LINK_FORMAT.format(county),
                headers={"content-type": "json"},
            )
            county_covid_data = json.loads(
                county_covid_response.text).get("CurrentVaccineAdministration")
            county_demo_response = requests.get(
                COUNTY_DEMO_LINK_FORMAT.format(county),
                headers={"content-type": "json"})
            county_demo_data = json.loads(county_demo_response.text)

            (
                summary_location_submitter_id,
                summary_clinical_submitter_id,
            ) = self.get_location_and_clinical_submitter_id(county, self.date)
            if county.lower() == "illinois":
                illinois_summary_clinical_submitter_id = summary_clinical_submitter_id

            for k in ["Age", "Race", "Gender"]:
                data = county_demo_data.get(k)
                for item in data:
                    keys, props = self.parse_group_clinical_demographic(
                        county_demo_mapping, item)
                    group_demographics_submitter_id = (
                        self.get_group_clinical_demographic_submitter_id(
                            summary_clinical_submitter_id, keys))
                    props["submitter_id"] = group_demographics_submitter_id
                    props["summary_clinicals"] = [{
                        "submitter_id":
                        summary_clinical_submitter_id
                    }]
                    self.summary_group_demographic[
                        group_demographics_submitter_id] = props

            summary_location = {
                "country_region": self.country,
                "submitter_id": summary_location_submitter_id,
                "projects": [{
                    "code": self.project_code
                }],
                "province_state": self.state,
                "county": county,
            }
            summary_clinical = {
                "submitter_id": summary_clinical_submitter_id,
                "date": self.date,
                "summary_locations": [{
                    "submitter_id": summary_location_submitter_id
                }],
            }
            for (key, value) in county_vaccine_mapping.items():
                if value == "vaccine_persons_fully_vaccinated_pct":
                    summary_clinical[value] = int(
                        county_covid_data.get(key) * 100)
                elif value == "vaccine_administered_count_roll_avg":
                    summary_clinical[value] = int(county_covid_data.get(key))
                elif value == "date":
                    summary_clinical[value] = remove_time_from_date_time(
                        county_covid_data.get(key))
                else:
                    summary_clinical[value] = county_covid_data.get(key)
            # for (key, value) in county_demo_mapping.items():
            #     summary_clinical[value] = county_demo_data.get(key)
            for (key, value) in inventory_reported.items():
                summary_clinical[value] = (
                    self.counties_inventory[county].get(key)
                    if value != "date" else remove_time_from_date_time(
                        self.counties_inventory[county].get(key)))

            self.summary_locations[
                summary_location_submitter_id] = summary_location
            self.summary_clinicals[
                summary_clinical_submitter_id] = summary_clinical
        return illinois_summary_clinical_submitter_id

    def parse_total_state_wide(self, state_summary_clinical_submitter_id):
        """
        Parse the Illinois total stats
        """
        county_covid_response = requests.get(TOTAL_VACCINE_LINK,
                                             headers={"content-type": "json"})
        state_total_data = json.loads(county_covid_response.text)
        total_vaccine_mapping = {
            "Total_Delivered": "vaccine_total_delivered_vaccine_doses",
            "Total_Administered": "vaccine_IL_total_administered_vaccine_doses",
            "Persons_Fully_Vaccinated": "vaccine_IL_total_persons_fully_vaccinated",
            "LTC_Allocated": "vaccine_long_term_care_allocated",
            "LTC_Administered": "vaccine_long_term_care_administered",
            "Report_Date": "date",
        }
        for (key, value) in total_vaccine_mapping.items():
            if value != "date":
                self.summary_clinicals[state_summary_clinical_submitter_id][
                    value] = state_total_data.get(key)
            else:
                self.summary_clinicals[state_summary_clinical_submitter_id][
                    value] = remove_time_from_date_time(
                        state_total_data.get(key))

    def submit_metadata(self):
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations.values():
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals.values():
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_group_demographic data")
        for sc in self.summary_group_demographic.values():
            sc_record = {"type": "summary_group_demographics"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
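
To see how parse_group_clinical_demographic behaves, here is a worked example tracing the code above with county_demo_mapping and a toy input item (the field values are made up, but the transformation follows the shown logic):

item = {
    "AgeGroup": "16-64",
    "Race": "Black or African-American",
    "Gender": "Unknown",
    "AdministeredCount": 1200,
}
# key_props  -> {"age_group": "16-64", "race": "Black or African-American",
#                "gender": "Unknown"}
# props_data -> {"age_group": "16 to 64", "race": "Black",
#                "gender": "Unknown or Left Blank",
#                "vaccine_administered_count": 1200}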
Example #15
class CHESTXRAY8(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "ChestX-ray8"

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.file_helper = FileHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.cmc_submitter_id = format_submitter_id("cmc_chestxray8", {})
        self.core_metadata_collection = [{
            "submitter_id": self.cmc_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
        }]
        self.imaging_file = []

    def files_to_submissions(self):
        for image_type in ("No_findings", "Pneumonia"):
            for image_filepath in (
                    Path(CHESTXRAY8_DATA_PATH).joinpath("COVID-19").joinpath(
                        "X-Ray Image DataSet").joinpath(image_type).iterdir()):
                did, rev, md5, size = self.file_helper.find_by_name(
                    image_filepath.name)
                if not did:
                    guid = self.file_helper.upload_file(image_filepath)
                    print(
                        f"file {image_filepath.name} uploaded with guid: {guid}"
                    )
                else:
                    print(
                        f"file {image_filepath.name} exists in indexd... skipping..."
                    )

                imaging_file_submitter_id = format_submitter_id(
                    "imaging_file_chestxray8",
                    {"filename": image_filepath.name})
                uploaded_imaging_file = {
                    "submitter_id": imaging_file_submitter_id,
                    "core_metadata_collections": [{
                        "submitter_id": self.cmc_submitter_id
                    }],
                    "data_type": "PNG",
                    "data_format": "Image File",
                    "data_category": "X-Ray Image",
                    "file_name": image_filepath.name,
                    "file_size": size,
                    "md5sum": md5,
                    "object_id": did,
                    "clinical_notes": image_type,
                }

                self.imaging_file.append(uploaded_imaging_file)

    def submit_metadata(self):
        print("Submitting data...")

        print("Submitting core_metadata_collection data")
        for cmc in self.core_metadata_collection:
            cmc_record = {"type": "core_metadata_collection"}
            cmc_record.update(cmc)
            self.metadata_helper.add_record_to_submit(cmc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting imaging_file data")
        for ifile in self.imaging_file:
            if_record = {"type": "imaging_file"}
            if_record.update(ifile)
            self.metadata_helper.add_record_to_submit(if_record)
        self.metadata_helper.batch_submit_records()
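
files_to_submissions above uses a find-or-upload pattern against indexd: look the file up by name, upload it only if no record exists, and otherwise reuse the existing GUID. A condensed sketch (FileHelper's signatures are assumed from how they are used in this listing):

def ensure_indexed(file_helper, image_filepath):
    did, rev, md5, size = file_helper.find_by_name(image_filepath.name)
    if not did:
        # not in indexd yet: upload the file and get a fresh GUID back
        return file_helper.upload_file(image_filepath)
    # already indexed: reuse the existing GUID
    return did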
Example #16
class DSFSI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.subjects = []
        self.demographics = []
        self.observations = []

        self.program_name = "open"
        self.project_code = "DSFSI"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        self.countries_fields = [
            ("case_id", ("subject", "submitter_id", str)),
            ("origin_case_id", (None, None, None)),
            ("date", ("observation", "reporting_date", normalize_date)),
            ("age", ("demographic", "age", normalize_age)),
            ("gender", ("demographic", "gender", normalize_gender)),
            ("city", ("demographic", "city", str)),
            ("province/state", ("demographic", "province_state", str)),
            ("country", ("demographic", "country_region", str)),
            (
                "current_status",
                ("subject", "tmp_current_status", normalize_current_status),
            ),
            (
                "source",
                ("observation", "reporting_source_url", str),
            ),  # a field type of None is used to remove the value
            ("symptoms", ("observation", "symptoms", normalize_symptoms)),
            (
                "date_onset_symptoms",
                ("observation", "date_onset_symptoms", normalize_date),
            ),
            (
                "date_admission_hospital",
                ("observation", "date_admission_hospital", normalize_date),
            ),
            ("date_confirmation", ("subject", "date_confirmation", normalize_date)),
            ("underlying_conditions", (None, None, None)),
            ("travel_history_dates", ("subject", "travel_history_dates", str)),
            ("travel_history_location", ("subject", "travel_history_location", str)),
            ("death_date", ("subject", "deceased_date", normalize_date)),
            ("notes_for_discussion", (None, None, None)),
        ]

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        urls = {
            "Algeria": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-algeria.csv",
            "Angola": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-angola.csv",
            "Benin": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-benin.csv",
            "Burkina Faso": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-burkina-faso.csv",
            "Cabo Verde": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cabo-verde.csv",
            "Cameroon": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cameroon.csv",
            "Central African Republic": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-central-african-republic.csv",
            "Chad": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-chad.csv",
            "Côte d'Ivoire": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-cote-divoire.csv",
            "Democratic Republic of the Congo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-democratic-republic-of-the-congo.csv",
            "Djibouti": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-djibouti.csv",
            # an Egypt dataset also exists, but it is intentionally omitted because it is not useful
            "Equatorial Guinea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-equatorial-guinea.csv",
            "Eritrea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-eritrea.csv",
            "Eswatini": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-eswatini.csv",
            "Ethiopia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-ethiopia.csv",
            "Gabon": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-gabon.csv",
            "Gambia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-gambia.csv",
            "Ghana": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-ghana.csv",
            "Guinea Bissau": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-guinea-bissau.csv",
            "Guinea": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-guinea.csv",
            "Kenya": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-kenya.csv",
            "Liberia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-liberia.csv",
            "Madagascar": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-madagascar.csv",
            "Mali": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mali.csv",
            "Mauritania": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mauritania.csv",
            "Mauritius": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mauritius.csv",
            "Mozambique": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-mozambique.csv",
            "Namibia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-namibia.csv",
            "Niger": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-niger.csv",
            "Nigeria": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-nigeria.csv",
            "Republic of Congo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-republic-of-congo.csv",
            "Rwanda": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-rwanda.csv",
            "Senegal": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-senegal.csv",
            "Seychelles": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-seychelles.csv",
            "Somalia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-somalia.csv",
            "South Africa": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-south-africa.csv",
            "Sudan": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-sudan.csv",
            "Tanzania": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-tanzania.csv",
            "Togo": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-togo.csv",
            "Uganda": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-uganda.csv",
            "Zambia": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-zambia.csv",
            "Zimbabwe": "https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-zimbabwe.csv",
        }

        for k, url in urls.items():
            self.parse_file(k, url)

    def parse_file(self, country, url):
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            countries_with_empty_columns = [
                "Angola",
                "Burkina Faso",
                "Cabo Verde",
                "Cameroon",
                "Central African Republic",
                "Chad",
                "Côte d'Ivoire",
                "Democratic Republic of the Congo",
                "Djibouti",
                "Equatorial Guinea",
                "Eritrea",
                "Eswatini",
                "Gabon",
                "Guinea Bissau",
                "Guinea",
                "Liberia",
                "Madagascar",
                "Mali",
                "Mauritania",
                "Mauritius",
                "Mozambique",
                "Republic of Congo",
                "Senegal",
                "Seychelles",
                "Somalia",
                "Sudan",
                "Tanzania",
                "Togo",
                "Uganda",
                "Zambia",
            ]

            countries_with_mistyped_column = ["South Africa"]

            countries_without_notes = [
                "Eritrea",
                "Eswatini",
                "Gabon",
                "Madagascar",
                "Mali",
                "Mauritania",
                "Mauritius",
                "Mozambique",
                "Republic of Congo",
                "Senegal",
                "Seychelles",
                "Somalia",
                "Sudan",
                "Tanzania",
                "Togo",
                "Uganda",
                "Zambia",
            ]

            # Almost every country's CSV deviates from the common format in
            # some way; the adjustments below account for those quirks
            tmp = copy.deepcopy(self.countries_fields)
            if country in countries_with_empty_columns:
                tmp.insert(0, ("", (None, None, None)))

            if country in countries_with_mistyped_column:
                tmp[14] = ("underlyng_conditions", (None, None, None))

            if country in countries_without_notes:
                del tmp[-1]

            if country == "Ethiopia":
                tmp.insert(8, ("original_status", (None, None, None)))
                del tmp[10]
                tmp.insert(14, ("closed_date", (None, None, None)))
                tmp.insert(16, ("quarantine_status", (None, None, None)))
                del tmp[19]
                tmp.insert(19, ("contact", (None, None, None)))
                tmp.append(("source", (None, None, None)))

            if country == "Niger":
                del tmp[9]
                tmp.insert(9, ("source 1", (None, None, None)))
                tmp.insert(10, ("source 2", (None, None, None)))

            updated_headers_mapping = {
                field: (k, mapping) for k, (field, mapping) in enumerate(tmp)
            }
            expected_h = list(updated_headers_mapping.keys())
            obtained_h = headers[: len(expected_h)]
            obtained_h = [header.strip() for header in obtained_h]

            assert (
                obtained_h == expected_h
            ), "CSV headers have changed\nexpected: {}\n     got: {}".format(
                expected_h, obtained_h
            )

            # The South Africa dataset only has 274 usable cases;
            # every row after that repeats the same data and carries no meaningful information
            idx = 0
            last = None
            if country == "South Africa":
                last = 275

            for row in reader:
                idx += 1
                if last and idx == last:
                    break

                subject, demographic, observation = self.parse_row(
                    country, row, updated_headers_mapping
                )

                self.subjects.append(subject)
                self.demographics.append(demographic)
                self.observations.append(observation)

    def parse_row(self, country, row, mapping):
        subject = {}
        demographic = {}
        observation = {}

        for (i, (node_type, node_field, type_conv)) in mapping.values():
            if not node_field:
                continue
            value = row[i]
            if not value:
                continue
            if node_type == "subject":
                node = subject
            elif node_type == "demographic":
                node = demographic
            elif node_type == "observation":
                node = observation
            else:
                continue
            # a field type of None removes the value
            node[node_field] = type_conv(value) if type_conv else None

        # init subject node
        case_id = subject["submitter_id"]
        subject["submitter_id"] = format_subject_submitter_id(
            country, subject["submitter_id"]
        )
        subject["projects"] = [{"code": self.project_code}]

        # Only the South Africa dataset contains two records with the same case_id.
        # Since this code processes one row at a time, the disambiguation is hard-coded.
        if country == "South Africa" and case_id == "110":
            if demographic["age"] == 34:
                subject["submitter_id"] += "_1"
            elif demographic["age"] == 27:
                subject["submitter_id"] += "_2"

        # init demographic node
        demographic["submitter_id"] = format_node_submitter_id(
            subject["submitter_id"], "demographic"
        )
        demographic["subjects"] = [{"submitter_id": subject["submitter_id"]}]

        # init observation node
        observation["submitter_id"] = format_node_submitter_id(
            subject["submitter_id"], "observation"
        )
        observation["subjects"] = [{"submitter_id": subject["submitter_id"]}]

        if subject.get("date_confirmation"):
            subject["covid_19_status"] = "Positive"

        state = subject.get("tmp_current_status")
        if "tmp_current_status" in subject:
            del subject["tmp_current_status"]
        if state == "deceased":
            subject["vital_status"] = "Dead"
        elif state in ["alive"]:
            subject["vital_status"] = state.capitalize()
        elif state in ["positive"]:
            subject["covid_19_status"] = state.capitalize()
        elif state == "isolated":
            observation["isolation_status"] = state.capitalize()
        elif state in ["released", "recovered", "in recovery", "in treatment"]:
            observation["treatment_status"] = state.capitalize()
        elif state in ["stable", "unstable", "critical"]:
            observation["condition"] = state.capitalize()
        elif state:
            raise Exception('State "{}" is unknown'.format(state))

        if "travel_history_dates" in subject:
            date_list = normalize_date_list(subject["travel_history_dates"])
            if date_list:
                subject["travel_history_dates"] = date_list
            else:
                del subject["travel_history_dates"]

        if "travel_history_location" in subject:
            loc_list = normalize_location_list(subject["travel_history_location"])
            if loc_list:
                subject["travel_history_location"] = loc_list
            else:
                del subject["travel_history_location"]

        return subject, demographic, observation

    def submit_metadata(self):
        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
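
The DSFSI mapping delegates per-column cleanup to normalizer callables (`normalize_date`, `normalize_gender`, `normalize_age`, ...) defined elsewhere in the repository. As a hedged illustration of the contract they satisfy (take the raw CSV string, return a harmonized value), not the repository's exact code:

from dateutil.parser import parse as parse_date

def normalize_gender(value):
    # Sketch: map free-text gender strings to a controlled vocabulary
    # (assumed behavior; the repository's normalizer may differ)
    value = value.strip().lower()
    if value in ("m", "male"):
        return "Male"
    if value in ("f", "female"):
        return "Female"
    return "Not reported"

def normalize_date(value):
    # Sketch: accept assorted date layouts and emit YYYY-MM-DD
    return parse_date(value.strip()).strftime("%Y-%m-%d")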
Example #17
class COM_MOBILITY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "Com-Mobility"

        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.expected_file_headers = [
            "country_region_code",
            "country_region",
            "sub_region_1",
            "sub_region_2",
            "metro_area",
            "iso_3166_2_code",
            "census_fips_code",
            "date",
            "retail_and_recreation_percent_change_from_baseline",
            "grocery_and_pharmacy_percent_change_from_baseline",
            "parks_percent_change_from_baseline",
            "transit_stations_percent_change_from_baseline",
            "workplaces_percent_change_from_baseline",
            "residential_percent_change_from_baseline",
        ]

        self.summary_locations = []
        self.summary_socio_demographics = []

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and
        `self.summary_socio_demographics`. Only rows dated after the latest
        recorded submission are converted.

        Args:
            url (str): URL at which the CSV file is available
        """

        self.last_submission_date_time = self.metadata_helper.get_last_submission()
        the_latest_data_datetime = None

        print("Getting data from {}".format(url))

        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            assert set(self.expected_file_headers).issubset(
                set(headers)
            ), "CSV headers have changed (expected {} is a subset of {}). We may need to update the ETL code".format(
                self.expected_file_headers, headers
            )

            for row in reader:
                # ignore any empty row
                if not row:
                    continue

                row_dict = dict(zip(headers, row))
                if row_dict["country_region_code"] != "US":
                    continue

                if (
                    not self.last_submission_date_time
                    or parse(row_dict["date"]) > self.last_submission_date_time
                ):
                    if (
                        the_latest_data_datetime is None
                        or the_latest_data_datetime < parse(row_dict["date"])
                    ):
                        the_latest_data_datetime = parse(row_dict["date"])


                    summary_location_submitter_id = format_submitter_id(
                        "summary_location",
                        row_dict["country_region_code"],
                        row_dict["sub_region_1"],
                        row_dict["sub_region_2"],
                        row_dict["metro_area"],
                        row_dict["date"],
                    )

                    summary_socio_demographic_submitter_id = format_submitter_id(
                        "summary_socio_demographic",
                        row_dict["country_region_code"],
                        row_dict["sub_region_1"],
                        row_dict["sub_region_2"],
                        row_dict["metro_area"],
                        row_dict["date"],
                    )

                    summary_location = {
                        "submitter_id": summary_location_submitter_id,
                        "projects": [{"code": self.project_code}],
                    }

                    summary_socio_demographic = {
                        "submitter_id": summary_socio_demographic_submitter_id,
                        "summary_locations": [
                            {"submitter_id": summary_location_submitter_id}
                        ],
                    }

                    for field in [
                        "country_region_code",
                        "country_region",
                        "sub_region_1",
                        "sub_region_2",
                        "metro_area",
                        "iso_3166_2_code",
                        "census_fips_code",
                    ]:
                        gen3_field, func = SPECIAL_MAP_FIELDS[field]
                        summary_location[gen3_field] = func(row_dict[field])

                    for field in [
                        "retail_and_recreation_percent_change_from_baseline",
                        "grocery_and_pharmacy_percent_change_from_baseline",
                        "parks_percent_change_from_baseline",
                        "transit_stations_percent_change_from_baseline",
                        "workplaces_percent_change_from_baseline",
                        "residential_percent_change_from_baseline",
                        "date",
                    ]:
                        gen3_field, func = SPECIAL_MAP_FIELDS[field]
                        summary_socio_demographic[gen3_field] = func(row_dict[field])

                    self.summary_locations.append(summary_location)
                    self.summary_socio_demographics.append(summary_socio_demographic)
        if the_latest_data_datetime:
            self.last_submission_date_time = the_latest_data_datetime

    def submit_metadata(self):
        """
        Batch submits all records in `self.summary_locations` and
        `self.summary_socio_demographics` via Sheepdog, then records the
        date of the most recent data that was submitted.
        """
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_socio_demographic data")
        for sc in self.summary_socio_demographics:
            sc_record = {"type": "summary_socio_demographic"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
        self.metadata_helper.update_last_submission(
            self.last_submission_date_time.strftime("%Y-%m-%d")
        )
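
`SPECIAL_MAP_FIELDS` is referenced in `parse_file` but defined outside this excerpt; each entry maps a CSV header to a `(gen3_field, conversion)` pair. A hypothetical fragment, consistent with how the loop unpacks it (the Gen3 property names below are placeholders, not confirmed by the source):

# Hypothetical fragment: CSV header -> (Gen3 property, converter).
# Empty strings are converted to None by the lambda-based converters.
SPECIAL_MAP_FIELDS = {
    "country_region_code": ("country_region_code", str),
    "sub_region_1": ("province_state", str),
    "date": ("report_date", str),
    "parks_percent_change_from_baseline": (
        "parks_percent_change",
        lambda v: int(v) if v else None,
    ),
    # ... one entry per header listed in `self.expected_file_headers`
}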
Example #18
class CCMAP(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.summary_socio_demographics = []

        self.program_name = "open"
        self.project_code = "CCMap"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        # structure is
        # (csv field name, (node type, node field name, type of field))
        county_fields = [
            ("fips_code", ("summary_location", "FIPS", int)),
            ("State", ("summary_location", "province_state", str)),
            ("County Name", ("summary_location", "county", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds", int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds", int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds", int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            ("Population (20+)", ("summary_clinical", "population_gtr_20", int)),
            ("Population (65+)", ("summary_clinical", "population_gtr_65", int)),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65", float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65", float),
            ),
        ]

        state_fields = [
            ("State", ("summary_location", None, int)),
            ("State Name", ("summary_location", "province_state", str)),
            ("Staffed All Beds", ("summary_clinical", "staffed_all_beds", int)),
            ("Staffed ICU Beds", ("summary_clinical", "staffed_icu_beds", int)),
            ("Licensed All Beds", ("summary_clinical", "licensed_all_beds", int)),
            (
                "All Bed Occupancy Rate",
                ("summary_clinical", "all_bed_occupancy_rate", float),
            ),
            (
                "ICU Bed Occupancy Rate",
                ("summary_clinical", "icu_bed_occupancy_rate", float),
            ),
            ("Population", ("summary_clinical", "population", int)),
            (
                "Population (20+)",
                ("summary_socio_demographic", "population_gtr_20", int),
            ),
            (
                "Population (65+)",
                ("summary_socio_demographic", "population_gtr_65", int),
            ),
            (
                "Staffed All Beds [Per 1000 People]",
                ("summary_clinical", "staffed_all_beds_per_1000", float),
            ),
            (
                "Staffed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 People]",
                ("summary_clinical", "staffed_icu_beds_per_1000", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_20", float),
            ),
            (
                "Staffed ICU Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "staffed_icu_beds_per_1000_gtr_65", float),
            ),
            (
                "Licensed All Beds [Per 1000 People]",
                ("summary_clinical", "licensed_all_beds_per_1000", float),
            ),
            (
                "Licensed All Beds [Per 1000 Adults (20+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_20", float),
            ),
            (
                "Licensed All Beds [Per 1000 Elderly (65+)]",
                ("summary_clinical", "licensed_all_beds_per_1000_gtr_65", float),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical", "estimated_full_mech_ventilators", int),
            ),
            (
                "Estimated No. Full-Featured Mechanical Ventilators per 100,000 Population (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_ventilators_per_100000",
                    float,
                ),
            ),
            (
                "Estimated No. Pediatrics-Capable Full-Feature Mechanical Ventilators (2010 study estimate)",
                ("summary_clinical", "estimated_full_mech_pediatric_ventilators", int),
            ),
            (
                "Estimated No. Full-Feature Mechanical Ventilators, Pediatrics Capable per 100,000 Population <14 y (2010 study estimate)",
                (
                    "summary_clinical",
                    "estimated_full_mech_pediatric_ventilators_per_100000",
                    float,
                ),
            ),
        ]

        self.headers_mapping = {
            "county": {field: mapping for field, mapping in county_fields},
            "state": {field: mapping for field, mapping in state_fields},
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        repo = "covidcaremap/covid19-healthsystemcapacity"
        branch = "master"
        files = {
            "county": "data/published/us_healthcare_capacity-county-CovidCareMap.csv",
            "state": "data/published/us_healthcare_capacity-state-CovidCareMap.csv",
        }

        for k, url in files.items():
            self.parse_file(repo, branch, url, csv_type=k)

    def get_last_update_date_file(self, repo, url):
        """
        Gets latest update time for specific file in the repository

        :param repo: "user/repository" for Github repository
        :param url: path to file
        :return: last update (commit) datetime for the file
        """
        api_url = "https://api.github.com/repos"
        commit_info_url = "{}/{}/commits?path={}&page=1&per_page=1".format(
            api_url, repo, url
        )

        with closing(requests.get(commit_info_url, stream=True)) as r:
            commit_info = r.json()
            last_update_date = commit_info[0]["commit"]["committer"]["date"]

        return datetime.datetime.strptime(last_update_date, "%Y-%m-%dT%H:%M:%SZ")

    def parse_file(self, repo, branch, file_url, csv_type):
        last_update_date = self.get_last_update_date_file(repo, file_url)

        raw_url = "https://raw.githubusercontent.com"
        url = "{}/{}/{}/{}".format(raw_url, repo, branch, file_url)

        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            assert (
                headers[0] != "404: Not Found"
            ), "Unable to get file contents, received {}.".format(headers)

            expected_h = list(self.headers_mapping[csv_type].keys())
            assert set(expected_h).issubset(
                set(headers)
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, headers
            )

            for i, f in enumerate(headers):
                if f in self.headers_mapping[csv_type]:
                    old_value = self.headers_mapping[csv_type][f]
                    self.headers_mapping[csv_type][f] = (i, old_value)

            for row in reader:
                (
                    summary_location,
                    summary_clinical,
                    summary_socio_demographic,
                ) = self.parse_row(
                    row, self.headers_mapping[csv_type], last_update_date
                )

                self.summary_locations.append(summary_location)
                self.summary_clinicals.append(summary_clinical)
                self.summary_socio_demographics.append(summary_socio_demographic)

    def parse_row(self, row, mapping, last_update_date):
        summary_location = {"country_region": "US"}
        summary_clinical = {}
        summary_socio_demographic = {}

        for k, (i, (node_type, node_field, type_conv)) in mapping.items():
            try:
                if node_field:
                    value = row[i]
                    if value:
                        if node_type == "summary_location":
                            summary_location[node_field] = type_conv(value)
                        if node_type == "summary_clinical":
                            if type_conv == int:
                                summary_clinical[node_field] = type_conv(float(value))
                            else:
                                summary_clinical[node_field] = type_conv(value)
                        if node_type == "summary_socio_demographic":
                            if type_conv == int:
                                summary_socio_demographic[node_field] = type_conv(
                                    float(value)
                                )
                            else:
                                summary_socio_demographic[node_field] = type_conv(value)
                            # TODO: remove when the properties are removed from dictionary
                            summary_clinical[node_field] = None
            except Exception as ex:
                print(
                    "Error with field: {}, problematic value: {}: {}".format(
                        node_field, row[i], ex
                    )
                )

        summary_location_submitter_id = format_location_submitter_id(summary_location)

        summary_location["submitter_id"] = summary_location_submitter_id
        summary_location["projects"] = [{"code": self.project_code}]

        state = summary_location["province_state"]
        if len(state) == 2:
            summary_location["province_state"] = state_to_long(state)

        summary_clinical["submitter_id"] = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date=last_update_date.strftime("%Y-%m-%d")
        )
        summary_clinical["summary_locations"] = [
            {"submitter_id": summary_location_submitter_id}
        ]

        summary_socio_demographic["submitter_id"] = format_summary_socio_demographic_id(
            summary_location_submitter_id, date=last_update_date.strftime("%Y-%m-%d")
        )
        summary_socio_demographic["summary_locations"] = [
            {"submitter_id": summary_location_submitter_id}
        ]

        return summary_location, summary_clinical, summary_socio_demographic

    def submit_metadata(self):
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_socio_demographic data")
        for sc in self.summary_socio_demographics:
            sc_record = {"type": "summary_socio_demographic"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
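
A subtle step in CCMap's `parse_file` is the in-place rebinding of `self.headers_mapping`: once the real CSV header row is read, each `(node, field, type)` tuple is replaced by `(column_index, (node, field, type))`, which is the shape `parse_row` expects. A standalone sketch of that binding on a toy header:

headers_mapping = {
    "State": ("summary_location", "province_state", str),
    "Population": ("summary_clinical", "population", int),
}

csv_headers = ["County Name", "State", "Population"]

# bind each mapped CSV column to its position in this particular file
for i, name in enumerate(csv_headers):
    if name in headers_mapping:
        headers_mapping[name] = (i, headers_mapping[name])

print(headers_mapping["State"])
# -> (1, ('summary_location', 'province_state', <class 'str'>))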
Example #19
class DS4C(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "DS4C"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.subjects = []
        self.demographics = []
        self.observations = []

    def files_to_submissions(self):
        with open(
            os.path.join(CURRENT_DIR, "data/ds4c_PatientInfo.csv"), newline=""
        ) as csvfile:
            reader = csv.reader(csvfile, delimiter=",")
            header = next(reader)
            print("Headers:", header)
            header = {k: v for v, k in enumerate(header)}
            n_1200012238 = 1

            for row in reader:
                patient_id = row[header["patient_id"]].strip()
                if patient_id == "1200012238":
                    # there are 2 rows for the same ID
                    patient_id = f"{patient_id}_{n_1200012238}"
                    n_1200012238 += 1

                # generate subject record
                subject = {
                    "submitter_id": patient_id,
                    "projects": [{"code": self.project_code}],
                }

                confirmed_date = row[header["confirmed_date"]].strip()
                if confirmed_date:
                    check_date_format(confirmed_date)
                    subject["date_confirmation"] = confirmed_date
                    subject["covid_19_status"] = "Positive"

                infected_by = row[header["infected_by"]].strip()
                if infected_by:
                    subject["infected_by"] = list(
                        map(lambda v: v.strip(), infected_by.split(","))
                    )

                deceased_date = row[header["deceased_date"]].strip()
                if deceased_date:
                    check_date_format(deceased_date)
                    subject["deceased_date"] = deceased_date

                # generate demographic record
                demographic = {
                    "submitter_id": f"demographic_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                    "age_decade": row[header["age"]].strip(),
                    "province_state": row[header["province"]].strip(),
                    "city": row[header["city"]].strip(),
                }

                country = row[header["country"]].strip()
                if country == "Korea":
                    demographic["country_region"] = "South Korea"
                elif country == "United States":
                    demographic["country_region"] = "USA"
                else:
                    demographic["country_region"] = country

                gender = row[header["sex"]].strip()
                demographic["gender"] = harmonize_gender(gender)

                demographic["year_of_birth"] = None

                # generate observation record
                observation = {
                    "submitter_id": f"observation_{patient_id}",
                    "subjects": {"submitter_id": patient_id},
                    "exposure": row[header["infection_case"]].strip(),
                }
                date_onset_symptoms = row[header["symptom_onset_date"]].strip()
                if date_onset_symptoms:
                    check_date_format(row[header["symptom_onset_date"]])
                    observation["date_onset_symptoms"] = date_onset_symptoms

                state = row[header["state"]].strip()
                if state == "deceased":
                    subject["vital_status"] = "Dead"
                elif state == "isolated":
                    observation["isolation_status"] = "Isolated"
                elif state == "released":
                    observation["treatment_status"] = "Released"
                elif state:
                    raise Exception('State "{}" is unknown'.format(state))

                released_date = row[header["released_date"]].strip()
                if released_date:
                    check_date_format(released_date)
                    observation["released_date"] = released_date

                subject = {k: v if v else None for k, v in subject.items()}
                self.subjects.append(subject)

                demographic = {k: v for k, v in demographic.items() if v}
                self.demographics.append(demographic)

                observation = {k: v for k, v in observation.items() if v}
                self.observations.append(observation)

    def submit_metadata(self):
        print("Submitting data")
        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
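
`check_date_format` is imported from elsewhere in the repository; the DS4C code calls it purely for validation and ignores any return value. A minimal sketch of the assumed contract:

import datetime

def check_date_format(date):
    # Sketch: raise if `date` is not YYYY-MM-DD
    # (assumed contract; the repository's helper may differ)
    try:
        datetime.datetime.strptime(date, "%Y-%m-%d")
    except ValueError:
        raise ValueError("Date '{}' is not in YYYY-MM-DD format".format(date))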
Example #20
class DSCI(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "DSCI"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.subjects = []
        self.demographics = []
        self.observations = []

    def files_to_submissions(self):
        with open(
            os.path.join(CURRENT_DIR, "data/dsci_patient.csv"), newline=""
        ) as csvfile:
            reader = csv.reader(csvfile, delimiter=",", quotechar="|")
            header = next(reader)
            print("Headers:", header)
            header = {k: v for v, k in enumerate(header)}

            for row in reader:
                patient_id = row[header["patient_id"]].strip()

                # generate subject record
                subject = {
                    "submitter_id": patient_id,
                    "projects": [{"code": self.project_code}],
                }

                infected_by = row[header["contacted_with"]].strip()
                if infected_by:
                    subject["infected_by"] = list(
                        map(lambda v: v.strip(), infected_by.split(","))
                    )

                confirmed_date = row[header["confirmed_date"]].strip()
                if confirmed_date:
                    confirmed_date = format_date(confirmed_date)
                    check_date_format(confirmed_date)
                    subject["date_confirmation"] = confirmed_date
                    subject["covid_19_status"] = "Positive"

                deceased_date = row[header["deceased_date"]].strip()
                if deceased_date:
                    deceased_date = format_date(deceased_date)
                    check_date_format(deceased_date)
                    subject["deceased_date"] = deceased_date

                # generate demographic record
                demographic = {
                    "submitter_id": f"demographic_{patient_id}",
                    "subjects": {"submitter_id": f"{patient_id}"},
                }

                cols = {"age": "age", "province": "province_state"}

                for k, v in cols.items():
                    value = row[header[k]].strip()
                    if value:
                        demographic[v] = value

                if "age" in demographic:
                    demographic["age"] = int(demographic["age"])

                gender = row[header["gender"]].strip()
                demographic["gender"] = harmonize_gender(gender)

                nationality = row[header["nationality"]].strip()
                if nationality == "indonesia":
                    demographic["country_region"] = "Indonesia"
                elif nationality == "foreigner":
                    pass
                elif nationality:
                    raise Exception('Nationality "{}" is unknown'.format(nationality))

                # generate observation record
                observation = {
                    "submitter_id": f"observation_{patient_id}",
                    "subjects": {"submitter_id": f"{patient_id}"},
                }

                hospital = row[header["hospital"]].strip()
                if hospital:
                    observation["hospital"] = hospital

                state = row[header["current_state"]].strip()
                if state == "deceased":
                    subject["vital_status"] = "Dead"
                elif state == "isolated":
                    observation["isolation_status"] = "Isolated"
                elif state == "released":
                    observation["treatment_status"] = "Released"
                elif state:
                    raise Exception('State "{}" is unknown'.format(state))

                released_date = row[header["released_date"]].strip()
                if released_date:
                    released_date = format_date(released_date)
                    check_date_format(released_date)
                    observation["released_date"] = released_date

                self.subjects.append(subject)
                self.demographics.append(demographic)
                self.observations.append(observation)

    def submit_metadata(self):
        print("Submitting data")
        print("Submitting subject data")
        for loc in self.subjects:
            loc_record = {"type": "subject"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting demographic data")
        for dem in self.demographics:
            dem_record = {"type": "demographic"}
            dem_record.update(dem)
            self.metadata_helper.add_record_to_submit(dem_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting observation data")
        for obs in self.observations:
            obs_record = {"type": "observation"}
            obs_record.update(obs)
            self.metadata_helper.add_record_to_submit(obs_record)
        self.metadata_helper.batch_submit_records()
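
Unlike DS4C, the DSCI dates pass through `format_date` before `check_date_format`, which suggests the source CSV stores dates in a different layout. A hedged guess at such a converter, assuming day-first input (the actual source format is not shown here):

import datetime

def format_date(value):
    # Sketch: convert an assumed day-first date such as "18/3/2020"
    # into the "2020-03-18" form that check_date_format expects
    return datetime.datetime.strptime(value, "%d/%m/%Y").strftime("%Y-%m-%d")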
Example #21
class IDPH_FACILITY(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)

        self.program_name = "open"
        self.project_code = "IDPH-Facility"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.country = "US"
        self.state = "IL"

        self.summary_locations = {}
        self.summary_clinicals = {}

    def files_to_submissions(self):
        """
        Reads the JSON file and converts the data to Sheepdog records
        """
        latest_submitted_date = self.metadata_helper.get_latest_submitted_date_idph()
        today = datetime.date.today()
        if latest_submitted_date == today:
            print("Nothing to submit: today and latest submitted date are the same.")
            return
        today_str = today.strftime("%Y%m%d")

        print(f"Getting data for date: {today_str}")
        url = "https://dph.illinois.gov/sitefiles/COVIDLTC.json"
        self.parse_file(latest_submitted_date, url)

    def parse_file(self, latest_submitted_date, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations` and `self.summary_clinicals`.

        Args:
            latest_submitted_date (date): the date of latest available "summary_clinical" for project
            url (str): URL at which the JSON file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            date = idph_get_date(data["LastUpdateDate"])

            if latest_submitted_date and date == latest_submitted_date.strftime(
                "%Y-%m-%d"
            ):
                print(
                    "Nothing to submit: latest submitted date and date from data are the same."
                )
                return

            if "LTC_Reported_Cases" in data:
                summary_location_submitter_id = format_submitter_id(
                    "summary_location", {"country": self.country, "state": self.state}
                )

                summary_location = {
                    "country_region": self.country,
                    "submitter_id": summary_location_submitter_id,
                    "projects": [{"code": self.project_code}],
                    "province_state": self.state,
                }

                summary_clinical_submitter_id = derived_submitter_id(
                    summary_location_submitter_id,
                    "summary_location",
                    "summary_clinical",
                    {"date": date},
                )
                summary_clinical = {
                    "confirmed": data["LTC_Reported_Cases"]["confirmed_cases"],
                    "deaths": data["LTC_Reported_Cases"]["deaths"],
                    "submitter_id": summary_clinical_submitter_id,
                    "lastUpdateEt": date,
                    "date": date,
                    "summary_locations": [
                        {"submitter_id": summary_location_submitter_id}
                    ],
                }
                self.summary_locations[summary_location_submitter_id] = summary_location
                self.summary_clinicals[summary_clinical_submitter_id] = summary_clinical

            for facility in data["FacilityValues"]:
                summary_location, summary_clinical = self.parse_facility(
                    date, facility
                )
                summary_location_submitter_id = summary_location["submitter_id"]
                summary_clinical_submitter_id = summary_clinical["submitter_id"]

                self.summary_locations[summary_location_submitter_id] = summary_location

                if summary_clinical_submitter_id in self.summary_clinicals:
                    existed = self.summary_clinicals[summary_clinical_submitter_id]
                    summary_clinical["confirmed"] = max(
                        summary_clinical["confirmed"], existed["confirmed"]
                    )
                    summary_clinical["deaths"] = max(
                        summary_clinical["deaths"], existed["deaths"]
                    )

                self.summary_clinicals[summary_clinical_submitter_id] = summary_clinical

    def parse_facility(self, date, facility):
        """
        From facility-level data, generates the data we can submit via Sheepdog
        """
        county = facility["County"]
        facility_name = facility["FacilityName"]
        confirmed_cases = facility["confirmed_cases"]
        deaths = facility["deaths"]
        status = facility.get("status", None)

        summary_location_submitter_id = format_submitter_id(
            "summary_location",
            {
                "country": self.country,
                "state": self.state,
                "facility_name": facility_name,
                "reporting_org_status": status,
            },
        )

        summary_location = {
            "country_region": self.country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{"code": self.project_code}],
            "province_state": self.state,
            "county": county,
            "reporting_org": facility_name,
            "reporting_org_status": status,
        }

        summary_clinical_submitter_id = derived_submitter_id(
            summary_location_submitter_id,
            "summary_location",
            "summary_clinical",
            {"date": date},
        )

        summary_clinical = {
            "confirmed": confirmed_cases,
            "deaths": deaths,
            "submitter_id": summary_clinical_submitter_id,
            "lastUpdateEt": date,
            "date": date,
            "summary_locations": [{"submitter_id": summary_location_submitter_id}],
        }

        return summary_location, summary_clinical

    def submit_metadata(self):
        print("Submitting data...")
        print("Submitting summary_location data")
        for sl in self.summary_locations.values():
            sl_record = {"type": "summary_location"}
            sl_record.update(sl)
            self.metadata_helper.add_record_to_submit(sl_record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals.values():
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
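
`derived_submitter_id` ties a child node's ID to its parent's, so the `summary_clinical` record for a given date always points at the same `summary_location` across runs. Its implementation lives elsewhere in the repository; a plausible sketch, assuming the child ID is the parent ID with the node prefix swapped and extra identifiers appended:

def derived_submitter_id(submitter_id, old_node, new_node, args):
    # Hypothetical sketch: swap the node prefix in the parent's ID and
    # append the extra identifying values (real helper not shown here)
    derived = submitter_id.replace(old_node, new_node, 1)
    for key in sorted(args):
        derived += "_{}".format(args[key])
    return derived

# derived_submitter_id(
#     "summary_location_us_il", "summary_location", "summary_clinical",
#     {"date": "2020-06-01"},
# ) -> "summary_clinical_us_il_2020-06-01"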
Example #22
class JHU(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.location_data = {}
        self.time_series_data = defaultdict(lambda: defaultdict(dict))
        self.program_name = "open"
        self.project_code = "JHU"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )
        self.expected_csv_headers = {
            "global": ["Province/State", "Country/Region", "Lat", "Long", "1/22/20"],
            "US_counties": {
                "confirmed": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "1/22/20",
                ],
                "deaths": [
                    "UID",
                    "iso2",
                    "iso3",
                    "code3",
                    "FIPS",
                    "Admin2",
                    "Province_State",
                    "Country_Region",
                    "Lat",
                    "Long_",
                    "Combined_Key",
                    "Population",  # TODO use this
                    "1/22/20",
                ],
            },
        }
        self.header_to_column = {
            "global": {
                "province": 0,
                "country": 1,
                "latitude": 2,
                "longitude": 3,
                "dates_start": 4,
            },
            "US_counties": {
                "confirmed": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 11,
                },
                "deaths": {
                    "iso2": 1,
                    "iso3": 2,
                    "code3": 3,
                    "FIPS": 4,
                    "county": 5,
                    "province": 6,
                    "country": 7,
                    "latitude": 8,
                    "longitude": 9,
                    "dates_start": 12,
                },
            },
        }
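        # (the US deaths file has an extra "Population" column before the date
        # columns, which is why its "dates_start" index is 12 instead of 11)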
        self.existing_summary_locations = []
        self.last_date = ""

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        urls = {
            "global": {
                "confirmed":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
                "deaths":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                "recovered":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
                # "testing": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_testing_global.csv",
            },
            "US_counties": {
                "confirmed":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv",
                "deaths":
                "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv",
            },
        }

        (
            self.existing_summary_locations,
            self.last_date,
        ) = self.metadata_helper.get_existing_data_jhu()

        for file_type in ["global", "US_counties"]:
            for data_type, url in urls[file_type].items():
                self.parse_file(file_type, data_type, url)

    def parse_file(self, file_type, data_type, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.location_data` and `self.time_series_data`.
        Ignores any records that are already in Sheepdog (relies on unique
        `submitter_id` to check)

        Args:
            file_type (str): type of this file - one
                of ["global", "US_counties"]
            data_type (str): type of the data in this file - one
                of ["confirmed", "deaths", "recovered"]
            url (str): URL at which the CSV file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            if headers[0] == "404: Not Found":
                print("  Unable to get file contents, received {}.".format(
                    headers))
                return

            expected_h = self.expected_csv_headers[file_type]
            if isinstance(expected_h, dict):
                expected_h = expected_h[data_type]
            obtained_h = headers[:len(expected_h)]
            assert (
                obtained_h == expected_h
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, obtained_h)

            first_date_i = [
                i for i, h in enumerate(headers) if h.endswith("/20")
            ][0]
            last_date = headers[-1]
            print("  First date: {}; last date: {}".format(
                headers[first_date_i], last_date))

            for row in reader:
                if not row:  # ignore empty rows
                    continue
                location, date_to_value = self.parse_row(
                    file_type, data_type, headers, row)
                if not location:
                    # We are using US data by state instead of global
                    continue

                location_submitter_id = location["submitter_id"]
                if (location_submitter_id not in self.location_data
                        # do not re-submit location data that already exist
                        and location_submitter_id
                        not in self.existing_summary_locations):
                    self.location_data[location_submitter_id] = location

                for date, value in date_to_value.items():
                    # do not re-submit summary_clinical data that
                    # already exist. Assume anything older than the last
                    # submitted date has already been submitted
                    if (time_series_date_to_string(date) >
                            time_series_date_to_string(self.last_date)
                            or LAST_DATE_ONLY):
                        self.time_series_data[location_submitter_id][date][
                            data_type] = value
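
    # Note: parse_file streams the CSV (requests' stream=True with
    # iter_lines()) rather than loading it all into memory; parse_row below
    # pivots each one-column-per-date row into a {date: value} mapping.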

    def parse_row(self, file_type, data_type, headers, row):
        """
        Converts a row of a CSV file to data we can submit via Sheepdog

        Args:
            file_type (str): type of this file - one
                of ["global", "US_counties"]
            data_type (str): type of the data in this file - one
                of ["confirmed", "deaths", "recovered"]
            headers (list(str)): CSV file headers (first row of the file)
            row (list(str)): row of data

        Returns:
            (dict, dict) tuple:
                - location data, in a format ready to be submitted to Sheepdog
                - { "date1": <value>, "date2": <value> } from the row data
        """
        header_to_column = self.header_to_column[file_type]
        if "country" not in header_to_column:
            header_to_column = header_to_column[data_type]

        country = row[header_to_column["country"]]
        province = row[header_to_column["province"]]
        latitude = row[header_to_column["latitude"]] or "0"
        longitude = row[header_to_column["longitude"]] or "0"

        if country == "US" and province == "":
            # We are using US data by state instead of global
            return None, None

        if int(float(latitude)) == 0 and int(float(longitude)) == 0:
            # Data with "Out of <state>" or "Unassigned" county value have
            # unknown coordinates of (0,0). We don't submit them for now
            return None, None

        submitter_id = format_location_submitter_id(country, province)
        location = {
            "country_region": country,
            "latitude": latitude,
            "longitude": longitude,
            "projects": [{
                "code": self.project_code
            }],
        }
        if province:
            location["province_state"] = province
        if file_type == "US_counties":
            county = row[header_to_column["county"]]
            iso2 = row[header_to_column["iso2"]]
            iso3 = row[header_to_column["iso3"]]
            code3 = row[header_to_column["code3"]]
            fips = row[header_to_column["FIPS"]]
            if county:
                location["county"] = county
                submitter_id = format_location_submitter_id(
                    country, province, county)
            if iso2:
                location["iso2"] = iso2
            if iso3:
                location["iso3"] = iso3
            if code3:
                location["code3"] = int(code3)
            if fips:
                location["FIPS"] = int(float(fips))
        location["submitter_id"] = submitter_id

        date_to_value = {}
        dates_start = header_to_column["dates_start"]
        dates_indices = range(dates_start, len(headers))
        if LAST_DATE_ONLY:
            dates_indices = [len(headers) - 1]
        for i in dates_indices:
            date = headers[i]
            date = get_unified_date_format(date)

            if row[i] == "":  # ignore empty values
                continue
            try:
                val = int(float(row[i]))
            except ValueError:
                print(
                    'Unable to convert {} to int for "{}", "{}" at {}'.format(
                        row[i], province, country, date))
                raise
            date_to_value[date] = val

        return location, date_to_value

    def submit_metadata(self):
        """
        Converts the data in `self.time_series_data` to Sheepdog records.
        `self.location_data` already contains Sheepdog records. Batch submits
        all records in `self.location_data` and `self.time_series_data`.
        """
        if LAST_DATE_ONLY:
            # delete the old data from the Sheepdog DB
            print("Deleting old summary_clinical data")
            self.metadata_helper.delete_nodes(["summary_clinical"])

        print("Submitting summary_location data")
        for location in self.location_data.values():
            record = {"type": "summary_location"}
            record.update(location)
            self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()

        print("Submitting summary_clinical data")
        for location_submitter_id, time_series in self.time_series_data.items():
            for date, data in time_series.items():
                submitter_id = format_summary_clinical_submitter_id(
                    location_submitter_id, date)
                record = {
                    "type": "summary_clinical",
                    "submitter_id": submitter_id,
                    "summary_locations": [{
                        "submitter_id": location_submitter_id
                    }],
                    "date": date,
                }
                for data_type, value in data.items():
                    record[data_type] = value
                self.metadata_helper.add_record_to_submit(record)
        self.metadata_helper.batch_submit_records()
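
JHU stores time-series values in a `defaultdict(lambda: defaultdict(dict))`, so counts arriving from the separate "confirmed", "deaths" and "recovered" files merge into one record per (location, date) without explicit key checks. A standalone illustration (the key below is a placeholder, not the real submitter_id format):

from collections import defaultdict

time_series_data = defaultdict(lambda: defaultdict(dict))
# values from different source files land in the same inner dict
time_series_data["some_location_id"]["2020-04-01"]["confirmed"] = 100
time_series_data["some_location_id"]["2020-04-01"]["deaths"] = 3
assert time_series_data["some_location_id"]["2020-04-01"] == {
    "confirmed": 100,
    "deaths": 3,
}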
Example #23
class VAC_TRACKER(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.clinical_trials = []
        self.program_name = "open"
        self.project_code = "VacTracker"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

    def files_to_submissions(self):
        """
        Reads json files and converts the data to Sheepdog records
        """
        url = "https://biorender.com/page-data/covid-vaccine-tracker/page-data.json"
        self.parse_file(url)

    def parse_file(self, url):
        """
        Converts a JSON file to data we can submit via Sheepdog. Stores the
        records to submit in `self.clinical_trials`.

        Args:
            url (str): URL at which the file is available
        """
        print("Getting data from {}".format(url))
        with closing(requests.get(url, stream=True)) as r:
            data = r.json()
            try:
                for treatment in data["result"]["pageContext"]["treatments"]:
                    node = treatment["node"]
                    clinical_trial = self.parse_node(node)
                    self.clinical_trials.append(clinical_trial)
            except ValueError as e:
                print(f"ERROR: value error. Detail {e}")

    def parse_node(self, node):
        """
        Converts an element of a JSON file to data we can submit via Sheepdog

        Args:
            node (dict): node data

        Returns:
            dict:
                - clinical trial data, in a format ready to be submitted to Sheepdog
        """
        clinical_trial = {
            "projects": [{"code": self.project_code}],
            "type": "clinical_trials",
        }

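        # map each source key to its Gen3 field via MAP_FIELDS, type-check
        # the value, then normalize free-text variants to the controlled
        # vocabulary expected by the data dictionary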
        for key, value in node.items():
            if key not in MAP_FIELDS:
                continue
            gen3_field = MAP_FIELDS.get(key)[0]
            gen3_field_type = MAP_FIELDS.get(key)[1]
            if type(value) != gen3_field_type:
                print(
                    f"ERROR: the type of {key} does not match the type in Gen3. Skipping it"
                )
                continue
            if key == "fdaApproved":
                if "FDA-approved" in value:
                    value = "Yes"
                elif value == "":
                    value = "Unknown"
                elif value in ["N/A", "N//A", "N/A*"]:
                    value = "NA"
                elif value not in ["Yes", "No", "Unknown", "NA", None]:
                    value = "Unknown"
            if key == "customClinicalPhase":
                if value.lower() == "phase na":
                    value = "Phase N/A"
                elif value.lower() in ["preclinical", "pre-clinical"]:
                    value = "Preclinical Phase"
                elif value not in [
                    "Preclinical Phase",
                    "Phase I",
                    "Phase I/II",
                    "Phase II",
                    "Phase I/II/III",
                    "Phase III",
                    "Phase III/IV",
                    "Phase IV",
                    "Phase I/III/IV",
                    "Phase I/IV",
                    "Phase II/IV",
                    "Phase II/III/IV",
                    "Phase I/II/III/IV",
                    "Phase II/III",
                    "Phase N/A",
                    None,
                ]:
                    value = None
            if key == "technology":
                value = value.replace("*", "")
                if "to repurpose" in value.lower():
                    value = "Repurposed"
                if value not in [
                    "Antibodies",
                    "Antivirals",
                    "Cell-based therapies",
                    "Device",
                    "DNA-based",
                    "Inactivated virus",
                    "Modified APC",
                    "Non-replicating viral vector",
                    "Protein subunit",
                    "RNA-based treatments",
                    "RNA-based vaccine",
                    "Repurposed",
                    "Virus Like Particle",
                    "Other",
                    None,
                ]:
                    value = "Other"
            if key == "developmentStage":
                if value.lower() in ["preclinical", "pre-clinical"]:
                    value = "Preclinical Phase"
                elif value not in ["Preclinical Phase", "Clinical", "Withdrawn", None]:
                    value = "Other"

            if gen3_field_type == list:
                value = [str(v) for v in value]
            clinical_trial[gen3_field] = value
        return clinical_trial

    def submit_metadata(self):
        """
        Batch submits all records in `self.clinical_trials`.
        """

        print("Submitting clinical_trial data")
        for clinical_trial in self.clinical_trials:
            self.metadata_helper.add_record_to_submit(clinical_trial)
        self.metadata_helper.batch_submit_records()
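
`parse_node` assumes a module-level `MAP_FIELDS` mapping each source JSON key to a `(gen3_field, expected_type)` pair. The real mapping is defined elsewhere in the repository; a plausible shape, with made-up Gen3 field names, would be:

# Illustrative only: the Gen3 field names below are hypothetical, not the
# repo's actual data dictionary properties.
MAP_FIELDS = {
    "fdaApproved": ("fda_approved", str),
    "customClinicalPhase": ("clinical_phase", str),
    "technology": ("technology", str),
    "developmentStage": ("development_stage", str),
    "sponsors": ("sponsors", list),
}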
Example #24
class OWID2(base.BaseETL):
    def __init__(self, base_url, access_token, s3_bucket):
        super().__init__(base_url, access_token, s3_bucket)
        self.summary_locations = []
        self.summary_clinicals = []
        self.summary_socio_demographics = []

        self.program_name = "open"
        self.project_code = "OWID"
        self.metadata_helper = MetadataHelper(
            base_url=self.base_url,
            program_name=self.program_name,
            project_code=self.project_code,
            access_token=access_token,
        )

        self.expected_csv_headers = [
            "iso_code",
            "continent",
            "location",
            "date",
            "total_cases",
            "new_cases",
            "new_cases_smoothed",
            "new_deaths",
            "new_deaths_smoothed",
            "total_cases_per_million",
            "new_cases_per_million",
            "new_cases_smoothed_per_million",
            "total_deaths_per_million",
            "new_deaths_per_million",
            "new_deaths_smoothed_per_million",
            "new_tests",
            "total_tests",
            "total_tests_per_thousand",
            "new_tests_per_thousand",
            "new_tests_smoothed",
            "new_tests_smoothed_per_thousand",
            "tests_per_case",
            "positive_rate",
            "tests_units",
            "stringency_index",
            "population",
            "population_density",
            "median_age",
            "aged_65_older",
            "aged_70_older",
            "gdp_per_capita",
            "extreme_poverty",
            "cardiovasc_death_rate",
            "diabetes_prevalence",
            "female_smokers",
            "male_smokers",
            "handwashing_facilities",
            "hospital_beds_per_thousand",
            "life_expectancy",
        ]

        self.header_to_column = {
            k: self.expected_csv_headers.index(k)
            for k in self.expected_csv_headers
        }

    def files_to_submissions(self):
        """
        Reads CSV files and converts the data to Sheepdog records
        """
        url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
        self.parse_file(url)

    def insert_row_value(self, row_value):
        (
            summary_location,
            summary_clinical,
            summary_socio_demographic,
        ) = row_value
        summary_location_submitter_id = summary_location["submitter_id"]
        # only add each summary_location once. (The original dedup list was
        # recreated on every call, so it never actually filtered anything.)
        if not any(
                sl["submitter_id"] == summary_location_submitter_id
                for sl in self.summary_locations):
            self.summary_locations.append(summary_location)

        self.summary_clinicals.append(summary_clinical)
        self.summary_socio_demographics.append(summary_socio_demographic)

    def parse_file(self, url):
        """
        Converts a CSV file to data we can submit via Sheepdog. Stores the
        records to submit in `self.summary_locations`,
        `self.summary_clinicals` and `self.summary_socio_demographics`.

        Args:
            url (str): URL at which the CSV file is available
        """
        print("Getting data from {}".format(url))

        with closing(requests.get(url, stream=True)) as r:
            f = (line.decode("utf-8") for line in r.iter_lines())
            reader = csv.reader(f, delimiter=",", quotechar='"')

            headers = next(reader)

            if headers[0] == "404: Not Found":
                print("  Unable to get file contents, received {}.".format(
                    headers))
                return

            expected_h = self.expected_csv_headers
            assert set(expected_h).issubset(
                headers
            ), "CSV headers have changed (expected {}, got {}). We may need to update the ETL code".format(
                expected_h, headers)

            pre_row = None

            for row in reader:
                res = self.parse_row(pre_row, row)
                if res is not None:
                    self.insert_row_value(res)
                pre_row = row
            if pre_row is not None:
                res = self.parse_row(pre_row, None)
                if res is not None:
                    self.insert_row_value(res)

    def create_clinical(self, row, date, summary_location_submitter_id):
        summary_clinical_submitter_id = format_summary_clinical_submitter_id(
            summary_location_submitter_id, date)

        summary_clinical = {
            "date": date,
            "submitter_id": summary_clinical_submitter_id,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        map_csv_fields = {
            # "iso_code": "iso_code",
            # "continent": "continent",
            # "location": "location",
            # "date": "date",
            "confirmed": ("total_cases", int),
            "new_cases": ("new_cases", int),
            "new_cases_smoothed": ("new_cases_smoothed", float),
            # "total_deaths": ("total_deaths", int),
            "new_deaths": ("new_deaths", int),
            "new_deaths_smoothed": ("new_deaths_smoothed", float),
            "total_cases_per_million": ("total_cases_per_million", float),
            "new_cases_per_million": ("new_cases_per_million", float),
            "new_cases_smoothed_per_million":
            ("new_cases_smoothed_per_million", float),
            "total_deaths_per_million": ("total_deaths_per_million", float),
            "new_deaths_per_million": ("new_deaths_per_million", float),
            "new_deaths_smoothed_per_million": (
                "new_deaths_smoothed_per_million",
                float,
            ),
            "new_tests": ("new_tests", int),
            "testing": ("total_tests", int),
            "total_tests_per_thousand": ("total_tests_per_thousand", float),
            "new_tests_per_thousand": ("new_tests_per_thousand", float),
            "new_tests_smoothed": ("new_tests_smoothed", float),
            "new_tests_smoothed_per_thousand": (
                "new_tests_smoothed_per_thousand",
                float,
            ),
            "tests_per_case": ("tests_per_case", float),
            "positive_rate": ("positive_rate", float),
            "tests_units": ("tests_units", str),
            "cardiovasc_death_rate": ("cardiovasc_death_rate", float),
            "diabetes_prevalence": ("diabetes_prevalence", float)
            # "hospital_beds_per_thousand": ("hospital_beds_per_thousand", float)
            # "human_development_index": ("human_development_index", float),
        }

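        # coerce each mapped CSV value to the declared type: skip "nan"
        # markers, strip thousands separators, and silently ignore values
        # that fail to parse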
        for k, (v, dtype) in map_csv_fields.items():
            value = row[self.header_to_column[v]]
            if value and value.lower() != "nan":
                try:
                    if dtype == int:
                        summary_clinical[k] = int(float(value.replace(",", "")))
                    elif dtype == float:
                        summary_clinical[k] = float(value.replace(",", ""))
                except Exception:
                    pass

        return summary_clinical

    def create_summary_socio_demographic(self, row, date,
                                         summary_location_submitter_id):
        summary_socio_demographic_submitter_id = (
            format_summary_summary_socio_demographic(
                summary_location_submitter_id, date))

        summary_socio_demographic = {
            "submitter_id": summary_socio_demographic_submitter_id,
            "summary_locations": [{
                "submitter_id": summary_location_submitter_id
            }],
        }

        map_csv_socio_fields = {
            "stringency_index": ("stringency_index", float),
            "population": ("population", int),
            "population_density": ("population_density", float),
            "median_age": ("median_age", float),
            "aged_65_older": ("aged_65_older", float),
            "aged_70_older": ("aged_70_older", float),
            "gdp_per_capita": ("gdp_per_capita", float),
            "extreme_poverty": ("extreme_poverty", float),
            "female_smokers": ("female_smokers", float),
            "male_smokers": ("male_smokers", float),
            "handwashing_facilities": ("handwashing_facilities", float),
            "life_expectancy": ("life_expectancy", float),
        }

        for k, (v, dtype) in map_csv_socio_fields.items():
            value = row[self.header_to_column[v]]
            if value and value.lower() != "nan":
                try:
                    if dtype == int:
                        summary_socio_demographic[k] = int(
                            float(value.replace(",", "")))
                    elif dtype == float:
                        summary_socio_demographic[k] = float(
                            value.replace(",", ""))
                except Exception:
                    pass

        return summary_socio_demographic

    def parse_row(self, pre_row, row):
        """
        Converts a row of a CSV file to data we can submit via Sheepdog.
        A row is only emitted once the next row belongs to a different
        `iso_code`, so only the most recent date per location is kept.

        Args:
            pre_row (list(str)): previous row of data
            row (list(str)): current row of data, or None after the last
                row of the file

        Returns:
            (dict, dict, dict) tuple, or None if there is nothing to emit:
                - summary_location data, ready to be submitted to Sheepdog
                - summary_clinical data for this location and date
                - summary_socio_demographic data for this location and date
        """
        if pre_row is None:
            return None

        pre_date = pre_row[self.header_to_column["date"]]
        pre_country = pre_row[self.header_to_column["location"]]
        pre_iso_code = pre_row[self.header_to_column["iso_code"]]

        if row is not None:
            iso_code = row[self.header_to_column["iso_code"]]
            if pre_iso_code == iso_code:
                return None

        summary_location_submitter_id = format_location_submitter_id(
            pre_country)
        summary_location = {
            "country_region": pre_country,
            "submitter_id": summary_location_submitter_id,
            "projects": [{
                "code": self.project_code
            }],
        }

        return (
            summary_location,
            self.create_clinical(pre_row, pre_date,
                                 summary_location_submitter_id),
            self.create_summary_socio_demographic(
                pre_row, pre_date, summary_location_submitter_id),
        )

    def submit_metadata(self):
        """
        Batch submits all records in `self.summary_locations`,
        `self.summary_clinicals` and `self.summary_socio_demographics`.
        """

        # only required for the initial submission of summary_location records
        print("Submitting summary_location data")
        for loc in self.summary_locations:
            loc_record = {"type": "summary_location"}
            loc_record.update(loc)
            self.metadata_helper.add_record_to_submit(loc_record)
        self.metadata_helper.batch_submit_records()
        print("Submitting summary_clinical data")
        for sc in self.summary_clinicals:
            sc_record = {"type": "summary_clinical"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
        print("Submitting summary_socio_demographic data")
        for sc in self.summary_socio_demographics:
            sc_record = {"type": "summary_socio_demographic"}
            sc_record.update(sc)
            self.metadata_helper.add_record_to_submit(sc_record)
        self.metadata_helper.batch_submit_records()
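
The `pre_row`/`row` pairing in `parse_file` implements a "last row per group" scan: a row is only emitted when the next row switches to a different `iso_code` (or when the file ends), so only each country's most recent date survives. This relies on the OWID CSV being sorted by `iso_code` and then by `date`. A minimal standalone sketch of the pattern:

rows = [
    ("AFG", "2020-12-30"),
    ("AFG", "2020-12-31"),  # kept: last AFG row
    ("ALB", "2020-12-31"),  # kept: last ALB row
]

kept = []
pre_row = None
for row in rows + [None]:  # a trailing None flushes the final group
    if pre_row is not None and (row is None or row[0] != pre_row[0]):
        kept.append(pre_row)
    pre_row = row

assert kept == [("AFG", "2020-12-31"), ("ALB", "2020-12-31")]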