def get_digits(name, canvas, driver):
    """Read the leading run of digits from an OCR'd canvas element.

    Args:
        name: Label for the value; only used in the warning message.
        canvas: XPath of the canvas element to read.
        driver: Selenium WebDriver used to locate the element; it is also
            passed through to ``getCanvas``.

    Returns:
        The leading digits as a string (commas and spaces are removed before
        matching), or ``None`` when the cleaned text does not start with a
        digit.
    """
    element = driver.find_element_by_xpath(canvas)
    raw_text = getCanvas(element, driver)
    # Drop thousands separators and stray spaces so "1, 234" reads as "1234".
    cleaned = raw_text.replace(",", "").replace(" ", "")
    found = re.match(r"(\d+)", cleaned.strip().lower())
    if not found:
        print(f"Warning: no {name} extracted; got string", cleaned)
        return None
    return found.group(1)
# Pieces of the Louisiana "Combined_COVID_Reporting" ArcGIS query URL that are
# shared by every request against that service.
_LA_COMBINED_QUERY = (
    "https://services5.arcgis.com/O5K6bb5dZVZcTo5M/arcgis/rest/services/"
    "Combined_COVID_Reporting/FeatureServer/0/query?f=json&where=")
_LA_QUERY_COMMON = (
    "&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*")
_LA_SUM_VALUE_TAIL = (
    "&outStatistics=%5B%7B%22statisticType%22%3A%22sum%22%2C"
    "%22onStatisticField%22%3A%22Value%22%2C"
    "%22outStatisticFieldName%22%3A%22value%22%7D%5D"
    "&outSR=102100&resultType=standard&cacheHint=true")
_LA_RACE_PARISH_URL = (
    "https://services5.arcgis.com/O5K6bb5dZVZcTo5M/arcgis/rest/services/"
    "Cases_and_Deaths_by_Race_by_Parish_and_Region/FeatureServer/0/query"
    "?f=json&where=1%3D1"
    "&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*"
    "&resultOffset=0&resultRecordCount=65&resultType=standard&cacheHint=true")
_LA_EXPECTED_REGIONS = [
    "Region 1", "Region 2", "Region 3", "Region 4", "Region 5", "Region 6",
    "Region 7", "Region 8", "Region 9"
]
_LA_EXPECTED_RACES = [
    "White", "Black", "Unknown", "Asian",
    "Native Hawaiian/Other Pacific Islander",
    "American Indian/Alaskan Native", "Other"
]


def _la_combined_url(where, group_by=None):
    """Build a Combined_COVID_Reporting query URL that sums ``Value``.

    ``where`` and ``group_by`` must already be URL-encoded.
    """
    url = _LA_COMBINED_QUERY + where + _LA_QUERY_COMMON
    if group_by is not None:
        url += "&groupByFieldsForStatistics=" + group_by
    return url + _LA_SUM_VALUE_TAIL


def _la_race_region_url(stat_field):
    """Build the race-by-region query URL summing ``stat_field``."""
    return (
        "https://services5.arcgis.com/O5K6bb5dZVZcTo5M/arcgis/rest/services/"
        "Case_Deaths_Race_Region_new/FeatureServer/0/query?f=json&where=1%3D1"
        "&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*"
        "&groupByFieldsForStatistics=LDH_Region%2CRace"
        "&outStatistics=%5B%7B%22statisticType%22%3A%22sum%22%2C"
        "%22onStatisticField%22%3A%22" + stat_field + "%22%2C"
        "%22outStatisticFieldName%22%3A%22value%22%7D%5D"
        "&outSR=102100&resultType=standard&cacheHint=true")


def _la_fetch_json(url, raw_name, stem, now):
    """GET ``url``, archive the decoded JSON as ``<raw_name>/<stem>_<now>.json``
    and return it."""
    payload = requests.get(url).json()
    with open("%s/%s_%s.json" % (raw_name, stem, now), "w") as fp:
        json.dump(payload, fp)
    return payload


def _la_store_regional_capacity(fulldat, features, key_prefix, label):
    """Store in-use / still-available counts for LDH regions 1-9.

    Raises ``Exception`` unless there are exactly 18 features and exactly one
    feature per (region, Group_Num) pair.
    """
    if len(features) != 18:
        raise Exception("Unexpected number of %s/regions in LA" % label)
    for region in range(1, 10):
        for gnum, cat in [(1, "InUse"), (2, "StillAvailable")]:
            dat = [
                x["attributes"] for x in features
                if x["attributes"]["Geography"] == "LDH Region %d" % region
                and x["attributes"]["Group_Num"] == gnum
            ]
            if len(dat) != 1:
                raise Exception("Bad %s/region" % label)
            fulldat[key_prefix + cat + "Region" + str(region)] = dat[0]["value"]


def _la_store_race_by_region(fulldat, features, key_prefix):
    """Validate and store one summed value per (region, race) pair."""
    if len(features) != 63:
        raise Exception("Unexpected number of regions/races")
    for feature in features:
        attrs = feature["attributes"]
        if attrs["LDH_Region"] not in _LA_EXPECTED_REGIONS:
            raise Exception("Unexpeted region " + attrs["LDH_Region"])
        if attrs["Race"] not in _LA_EXPECTED_RACES:
            raise Exception("Unexpected race " + attrs["Race"])
        key = key_prefix + attrs["LDH_Region"].strip() + "_race_" + attrs["Race"]
        fulldat[key] = attrs["value"]


def _la_ocr_table(driver, header_xpath, values_xpath):
    """OCR a label canvas and its value canvas; return both as lists of lines."""
    header_canvas = driver.find_element_by_xpath(header_xpath)
    values_canvas = driver.find_element_by_xpath(values_xpath)
    head = getCanvas(header_canvas, driver).replace("\n\n", "\n")
    val = getCanvas(values_canvas, driver).replace("\n\n", "\n")
    return head.splitlines(), val.splitlines()


def _la_scrape_probable_deaths(driver, fulldat):
    """Read the probable-deaths tables from the Tableau frame into ``fulldat``.

    Expects ``driver`` to have already switched into the dashboard iframe.
    Raises ``Exception`` when an OCR'd label is not in the expected set.
    """
    # Probable Deaths by Race
    head, val = _la_ocr_table(
        driver,
        '//*[@id="tabZoneId3"]/div/div/div/div[1]/div[5]/div[1]/canvas',
        '//*[@id="view13678703414402932068_2418008377866606056"]/div[1]/div[2]/canvas[1]'
    )
    expected_race = [
        "American Indian/Alaska N..", "Asian", "Black", "Native Hawaiian/Pl",
        "Other", "Unknown", "White"
    ]
    # Skip empty header lines.  The original condition used ``or`` and was
    # therefore always true, so nothing was ever filtered.
    extracted_races = [line for line in head if line not in ("", "\n")]
    for race, pct in zip(extracted_races, val):
        percent = pct.replace("%", "")
        # Normalise one known OCR spelling of the label to the expected one.
        if race.strip() == "Native Hawaiian/PI":
            race = "Native Hawaiian/Pl"
        if race.strip() not in expected_race:
            raise Exception("Unexpected race in Probable Deaths " + race)
        fulldat["% Probable Deaths by Race: " + race] = percent

    # Ethnicity, gender and underlying conditions share one layout:
    # (label, header canvas xpath, values canvas xpath, expected labels).
    percentage_tables = [
        ("Ethnicity",
         '//*[@id="tabZoneId10"]/div/div/div/div[1]/div[5]/div[1]/canvas',
         '//*[@id="view13678703414402932068_2377024103324179123"]/div[1]/div[2]/canvas[1]',
         ["Hispanic/Latino", "Non-Hispanic/Latino", "Unknown"]),
        ("Gender",
         '//*[@id="tabZoneId18"]/div/div/div/div[1]/div[5]/div[1]/canvas',
         '//*[@id="view13678703414402932068_1339666610323305087"]/div[1]/div[2]/canvas[1]',
         ["Female", "Male", "Unknown/Other"]),
        ("Underlying Condition",
         '//*[@id="tabZoneId5"]/div/div/div/div[1]/div[5]/div[1]/canvas',
         '//*[@id="view13678703414402932068_5659047270258252395"]/div[1]/div[2]/canvas[1]',
         [
             "Asthma", "Cancer", "Cardiac Disease", "Chronic Kidney Disease",
             "Congestive Heart Failure", "Diabetes", "Hypertension",
             "Neurological", "Obesity", "Pulmonary", "None"
         ]),
    ]
    for label, header_xpath, values_xpath, expected in percentage_tables:
        head, val = _la_ocr_table(driver, header_xpath, values_xpath)
        for category, pct in zip(head, val):
            if category.strip() not in expected:
                raise Exception("Unexpected " + label + " " + category)
            fulldat["% Probable Deaths by " + label + ": " + category] = \
                pct.replace("%", "")

    # Probable Deaths - Avg and Median Ages
    head, val = _la_ocr_table(
        driver,
        '//*[@id="tabZoneId21"]/div/div/div/div[1]/div[5]/div[1]/canvas',
        '//*[@id="view13678703414402932068_10330976522668559202"]/div[1]/div[2]/canvas[1]'
    )
    expected_metrics = ["Average", "Median"]
    for metric, age in zip(head, val):
        if metric.strip() not in expected_metrics:
            # Report the offending metric (the original concatenated a stale
            # variable left over from the conditions loop).
            raise Exception("Unexpected Age Metric " + metric)
        fulldat["Probable Deaths Age: " + metric] = age


def _la_append_csv_row(path, row):
    """Append ``row`` (a dict) to the CSV at ``path``.

    Columns are the sorted keys; a header row is written only when the file
    does not exist yet.
    """
    fields = sorted(row)
    exists = os.path.exists(path)
    # newline="" is what the csv module expects of the file object; without it
    # Windows would emit "\r\r\n" line endings.
    with open(path, "a", newline="") as fp:
        writer = csv.writer(fp)
        if not exists:
            writer.writerow(fields)
        writer.writerow([row[x] for x in fields])


def run_LA(args):
    """Scrape Louisiana COVID-19 data and append it to the LA CSV files.

    Pulls age, gender, testing, bed/ICU/ventilator and race-by-region figures
    from ArcGIS REST queries (archiving each raw JSON response under
    ``../LA/raw``), reads the probable-deaths tables from a Tableau dashboard
    through Selenium + OCR, appends one row to ``../LA/data/data.csv`` and one
    row per parish to ``../LA/data/parish_race_data.csv``, then calls
    ``merge_parish()``.

    Args:
        args: Unused by this function.

    Raises:
        Exception: When any response or OCR result does not have the expected
            shape or labels.
    """
    # Parameters
    raw_name = '../LA/raw'
    data_name = '../LA/data/data.csv'
    parish_race_name = '../LA/data/parish_race_data.csv'
    now = str(datetime.now())
    fulldat = {}

    # Age: 8 case groups + 7 death groups.
    raw = _la_fetch_json(
        _la_combined_url("Measure%3D%27Age%27", "Group_Num%2CValueType"),
        raw_name, "age", now)
    groups_death = [
        "%s_0_17", "%s_18_29", "%s_30_39", "%s_40_49", "%s_50_59", "%s_60_69",
        "%s_70_plus"
    ]
    groups_case = [
        "%s_0_4", "%s_5_17", "%s_18_29", "%s_30_39", "%s_40_49", "%s_50_59",
        "%s_60_69", "%s_70_plus"
    ]
    if len(raw["features"]) != 15:
        raise Exception("Unexpected number of ages in LA: " +
                        str(len(raw["features"])))
    raw_cases = []
    raw_deaths = []
    for entry in raw["features"]:
        if entry["attributes"]["ValueType"] == "case":
            raw_cases.append(entry["attributes"])
        else:
            raw_deaths.append(entry["attributes"])
    if len(raw_cases) != 8:
        raise Exception("Unexpected number of entries for age cases: " +
                        str(len(raw_cases)))
    if len(raw_deaths) != 7:
        raise Exception("Unexpected number of entries for age deaths: " +
                        str(len(raw_deaths)))
    # NOTE(review): groups are matched to rows by position, so this presumably
    # relies on the service returning rows in age-group order - confirm.
    for template, attrs in zip(groups_case, raw_cases):
        fulldat[template % "Case"] = attrs["value"]
    for template, attrs in zip(groups_death, raw_deaths):
        fulldat[template % "Deaths"] = attrs["value"]

    # Gender (cases only)
    raw = _la_fetch_json(
        _la_combined_url(
            "Measure%3D%27Gender%27%20AND%20ValueType%3D%27case%27",
            "Group_Num"),
        raw_name, "gender", now)
    if len(raw["features"]) != 3:
        raise Exception("Unexpected number of genders in LA")
    groups = [(1, "Case_Pct_Male"), (2, "Case_Pct_Female"),
              (3, "Case_Pct_Other")]
    for gnum, name in groups:
        dat = [
            x["attributes"] for x in raw["features"]
            if x["attributes"]["Group_Num"] == gnum
        ]
        if len(dat) != 1:
            raise Exception("Missing some gender data")
        fulldat[name] = dat[0]["value"]

    # Tests by lab type
    raw = _la_fetch_json(_la_combined_url("Measure%3D%27State%20Tests%27"),
                         raw_name, "statelab", now)
    fulldat["TestsByStateLab"] = int(raw["features"][0]["attributes"]["value"])
    raw = _la_fetch_json(
        _la_combined_url("Measure%3D%27Commercial%20Tests%27"),
        raw_name, "commercial", now)
    fulldat["CommercialTests"] = int(raw["features"][0]["attributes"]["value"])

    fulldat["Scrape_Time"] = now

    # Beds, ICU beds and ventilators by LDH region:
    # (encoded measure, raw file stem, output key prefix, error label).
    capacity_measures = [
        ("Measure%3D%27Beds%27", "bedsbyregion", "Beds_", "bed"),
        ("Measure%3D%27ICU%20Beds%27", "ICUbedsbyregion", "ICUBeds_",
         "ICU bed"),
        ("Measure%3D%27Hospital%20Vents%27", "ventbyregion", "Vent_",
         "ventilator"),
    ]
    for where, stem, key_prefix, label in capacity_measures:
        raw = _la_fetch_json(
            _la_combined_url(where, "Geography%2CGroup_Num"),
            raw_name, stem, now)
        _la_store_regional_capacity(fulldat, raw["features"], key_prefix,
                                    label)

    # New data - Race by region (deaths, then cases)
    raw = _la_fetch_json(_la_race_region_url("Deaths"), raw_name,
                         "DeathRacebyRegion", now)
    _la_store_race_by_region(fulldat, raw["features"], "Deaths_")
    raw = _la_fetch_json(_la_race_region_url("Cases"), raw_name,
                         "CaseRacebyRegion", now)
    # NOTE(review): "Casess_" looks like a typo, but it is an output column
    # name, so it is kept as-is to stay aligned with the existing CSV header.
    _la_store_race_by_region(fulldat, raw["features"], "Casess_")

    # New data - Race by parish
    out_parish = []
    raw = _la_fetch_json(_LA_RACE_PARISH_URL, raw_name, "RacebyParish", now)
    for row in raw["features"]:
        parish_data = row["attributes"]
        parish_dict = {"Parish": parish_data["Parish"], "Scrape Time": now}
        for key in parish_data:
            if "Deaths_" in key or "Cases_" in key:
                val = parish_data[key]
                # Empty strings are recorded as zero.
                if val == "":
                    val = 0
                parish_dict[key + "_race"] = val
            elif "LDHH" in key:
                parish_dict[key] = parish_data[key]
        out_parish.append(parish_dict)

    # Tableau - Probable Deaths (Selenium + OCR)
    driver = webdriver.Chrome(
        executable_path="andrew/ChromeDriver/chromedriver.exe")
    try:
        driver.maximize_window()
        driver.get(
            "https://public.tableau.com/profile/lee.mendoza#!/vizhome/COVID19demog/DataonCOVIN-19RelatedDeathsToDate"
        )
        time.sleep(5)
        # Get raw
        driver.save_screenshot(raw_name + "/probable_deaths_pt1_" + now +
                               ".png")
        driver.execute_script("window.scrollTo(0, 400)")
        driver.save_screenshot(raw_name + "/probable_deaths_pt2_" + now +
                               ".png")
        frame = driver.find_element_by_xpath(
            '//*[@id="ng-app"]/body/div[1]/div[2]/section/div/div[2]/section[2]/figure/js-api-viz/div/iframe'
        )
        driver.switch_to.frame(frame)
        _la_scrape_probable_deaths(driver, fulldat)
    finally:
        # Always shut the browser down; the original never released it.
        driver.quit()

    # Output
    _la_append_csv_row(data_name, fulldat)

    # Output - Parish
    for parish in out_parish:
        _la_append_csv_row(parish_race_name, parish)

    # Merge Parish Race data
    merge_parish()
def _id_confirm_layout(question):
    """Ask the operator a Y/N layout question; raise unless the answer is yes."""
    answer = input(question).lower()
    if answer not in ["y", "n"]:
        raise Exception("Invalid input")
    if answer == "n":
        raise Exception("Invalid races and/or ethnicities")


def _id_parse_total_deaths(text):
    """Fallback parse of the total-deaths text by whitespace token position.

    Returns ``(total_deaths, death_rate_per_100k)``; raises ``Exception`` when
    the text does not split into exactly 8 tokens and re-raises ``ValueError``
    when a token is not numeric.
    """
    list_deaths = text.split()
    if len(list_deaths) != 8:
        raise Exception("Check Total Deaths")
    try:
        total_deaths = int(list_deaths[2])
    except ValueError:
        print("Total Deaths not Int - Check!")
        raise
    try:
        death_rate_100k = float(list_deaths[7])
    except ValueError:
        print("Death Rate not Number - Check!")
        raise
    return total_deaths, death_rate_100k


def run_ID(args):
    """Scrape Idaho COVID-19 data and append one row to the ID CSV.

    Headline counts and gender/age percentages are read from the Tableau
    dashboard through Selenium plus the file's OCR/graph helpers; race and
    ethnicity percentages are typed in by the operator at interactive
    prompts.  Finally a static PNG of the dashboard is saved under
    ``../ID/raw``.

    Args:
        args: Unused by this function.

    Raises:
        Exception: When a value cannot be extracted, the dashboard layout is
            not as expected, or the operator answers anything but "y".
        ValueError: When a manually entered percentage is not a number.
    """
    # Parameters
    raw_name = '../ID/raw'
    data_name = '../ID/data/data.csv'
    now = str(datetime.now())
    out = {}

    demographics_tab = '//*[@id="tabZoneId4"]/div/div/div/span[2]/div/span/span/span[2]'
    deaths_tab = '//*[@id="tabZoneId4"]/div/div/div/span[2]/div/span/span/span[7]'

    # OCR scan text info: (output field, regex, canvas xpath, flipBW flag
    # handed to getCanvas).
    texts = [
        ("TotalCases", r"(\d+)\s+\(\d+\s+new\)\s+statewide\s+cases",
         '//*[@id="view11831741491762752444_11141899506553115835"]/div[1]/div[2]/canvas[1]',
         False),
        ("TotalHospitalizations", r"(\d+)\s+cases hospitalized",
         '//*[@id="view11831741491762752444_14784563920108749745"]/div[1]/div[2]/canvas[1]',
         True),
        ("ICUAdmissions", r"(\d+)\s+cases admitted to icu",
         '//*[@id="view11831741491762752444_8851338240052320464"]/div[1]/div[2]/canvas[1]',
         True),
        ("CasesAmongHCW", r"(\d+)\s+cases among health care workers",
         '//*[@id="view11831741491762752444_378066509776727316"]/div[1]/div[2]/canvas[1]',
         False),
        ("CasesRecovered", r"(\d+)\s+cases estimated recovered",
         '//*[@id="view11831741491762752444_15348675858672874598"]/div[1]/div[2]/canvas[1]',
         True),
        # TotalDeaths must stay last: reaching it switches to the Deaths tab.
        ("TotalDeaths",
         r"total deaths:+\s*(\d+)\s*\(\s*\d+\s*confirmed\s+\d+\s+probable\)\s+rate per 100000 population:+\s+(\S+)\s*",
         '//*[@id="view13810090252421852225_17430862024409208946"]/div[1]/div[2]/canvas[1]',
         False),
    ]

    driver = webdriver.Chrome(
        executable_path="andrew/ChromeDriver/chromedriver.exe")
    try:
        driver.maximize_window()
        driver.get(
            "https://public.tableau.com/profile/idaho.division.of.public.health#!/vizhome/DPHIdahoCOVID-19Dashboard_V2/Story1"
        )
        time.sleep(10)  # More robust to wait for elements to appear...
        driver.switch_to.frame(
            driver.find_element_by_xpath(
                '//*[@id="ng-app"]/body/div[1]/div[2]/section/div/div[2]/section[2]/figure/js-api-viz/div/iframe'
            ))

        # Click Demographics tab
        driver.find_element_by_xpath(demographics_tab).click()
        time.sleep(10)

        for field, regex, xpath, flipBW in texts:
            if field == "TotalDeaths":
                # Click Deaths Tab
                driver.find_element_by_xpath(deaths_tab).click()
                time.sleep(10)
            text = getCanvas(driver.find_element_by_xpath(xpath), driver,
                             flipBW).replace(",", "")
            match = re.compile(regex).search(text.strip().lower())
            if match:
                out[field] = match.group(1).strip()
            elif field == "TotalDeaths":
                # Positional fallback.  The original printed debug output and
                # called exit() here, which made this parsing unreachable.
                # NOTE(review): only this path records Death_Rate_Per_100000,
                # so the CSV columns differ from the regex path - confirm
                # which column set is intended.
                total_deaths, death_rate_100k = _id_parse_total_deaths(text)
                out["TotalDeaths"] = total_deaths
                out["Death_Rate_Per_100000"] = death_rate_100k
            else:
                out[field] = None
                raise Exception("Warning: No " + field +
                                " extracted for Idaho; got string " + text)

        # Click Demographics tab
        driver.find_element_by_xpath(demographics_tab).click()
        time.sleep(10)

        # Grab graphs for demographics
        genders = getGraph(
            driver.find_element_by_xpath(
                '//*[@id="view11831741491762752444_4953159310065112757"]/div[1]/div[2]/canvas[1]'
            ), (78, 121, 167, 255), driver)
        genderLabels = [
            x.title() for x in getCanvas(
                driver.find_element_by_xpath(
                    '//*[@id="tabZoneId65"]/div/div/div/div[1]/div[5]/div[1]/canvas'
                ), driver).strip().split()
        ]
        if len(genders) != 2 or len(genderLabels) != 2 or " ".join(
                sorted(genderLabels)) != "Female Male":
            raise Exception("Wrong gender vals for ID")
        for gender, val in zip(genderLabels, genders):
            out["Pct_Gender_" + gender] = round(val, 1)

        ages = getGraph(
            driver.find_element_by_xpath(
                '//*[@id="view11831741491762752444_7110063204799374782"]/div[1]/div[2]/canvas[1]'
            ), (78, 121, 167, 255), driver)
        ageLabels = getCanvas(
            driver.find_element_by_xpath(
                '//*[@id="tabZoneId77"]/div/div/div/div[1]/div[5]/div[1]/canvas'
            ), driver).strip().split()
        ageExpect = [
            "<18", "18-29", "30-39", "40-49", "50-59", "60-69", "70-79",
            "80-89", "90-99", "100+"
        ]
        if len(ages) != 10 or sorted(ageLabels) != sorted(ageExpect):
            raise Exception("Wrong age groups for ID")
        for age, val in zip(ageLabels, ages):
            # "<18" -> "0_17", "18-29" -> "18_29", "100+" -> "100_plus"
            age_key = age.replace("-", "_").replace("<18", "0_17").replace(
                "+", "_plus")
            out["Pct_Age_" + age_key] = round(val, 1)
    finally:
        # Release the browser even when scraping fails part-way.
        driver.quit()

    # Manually collect race and ethn data
    print(
        "Please load https://public.tableau.com/profile/idaho.division.of.public.health#!/vizhome/DPHIdahoCOVID-19Dashboard_V2/Story1 and click on the COVID-19 Demographics Tab"
    )
    # Cases
    _id_confirm_layout(
        "Are there exactly 7 races in the dashboard: White, Asian, Black, Other Race, Multiple Race, American Indian, Native Hawaiian? Are there exactly two ethnicities: Non-hispanic and hispanic? (Y/N) "
    )
    case_prompts = [
        ("Case_Pct_Race_White", "Case Pct White? "),
        ("Case_Pct_Race_Other", "Case Pct Other? "),
        ("Case_Pct_Race_NativeHawaiian", "Case Pct Native Hawaiian? "),
        ("Case_Pct_Race_MultipleRaces", "Case Pct Multiple Races? "),
        ("Case_Pct_Race_Black", "Case Pct Black? "),
        ("Case_Pct_Race_Asian", "Case Pct Asian? "),
        ("Case_Pct_Race_AmericanIndian", "Case Pct American Indian? "),
        ("Case_Pct_Ethn_NonHispanic", "Case Pct Non-Hispanic? "),
        ("Case_Pct_Ethn_Hispanic", "Case Pct Hispanic? "),
    ]
    for key, prompt in case_prompts:
        out[key] = float(input(prompt))

    # Death
    print("Now click on the COVID-19 Related Deaths Demographics Tab")
    _id_confirm_layout(
        "Are there exactly 6 races in the dashboard: White, Asian, Black, American Indian, Native Hawaiian and Other? Are there exactly 2 ethnicities: Non-hispanic and hispanic? (Y/N) "
    )
    death_prompts = [
        ("Death_Pct_Race_White", "Death Pct White? "),
        ("Death_Pct_Race_Other", "Death Pct Other? "),
        ("Death_Pct_Race_NativeHawaiian", "Death Pct Native Hawaiian? "),
        ("Death_Pct_Race_Black", "Death Pct Black? "),
        ("Death_Pct_Race_Asian", "Death Pct Asian? "),
        ("Death_Pct_Race_AmericanIndian", "Death Pct American Indian? "),
        ("Death_Pct_Ethn_NonHispanic", "Death Pct Non-Hispanic? "),
        ("Death_Pct_Ethn_Hispanic", "Death Pct Hispanic? "),
    ]
    for key, prompt in death_prompts:
        out[key] = float(input(prompt))

    out["Scrape_Time"] = now
    fields = sorted(out)
    exists = os.path.exists(data_name)
    # newline="" is what the csv module expects of the file object; without it
    # Windows would emit "\r\r\n" line endings.
    with open(data_name, "a", newline="") as fp:
        writer = csv.writer(fp)
        # Header only when the file is being created.
        if not exists:
            writer.writerow(fields)
        writer.writerow([out[x] for x in fields])

    # Let's make a best effort to get the raw data...
    img = requests.get(
        "https://public.tableau.com/static/images/DP/DPHIdahoCOVID-19Dashboard_V2/Story1/1.png"
    )
    with open("%s/%s.png" % (raw_name, now), "wb") as fp:
        fp.write(img.content)
def _ca_append_row(path, row):
    """Append ``row`` (a dict) to the CSV file at ``path``.

    Columns are the sorted keys of ``row``; a header row is written first
    when the file does not exist yet.
    """
    fields = sorted(row)
    exists = os.path.exists(path)
    # newline="" is what the csv module asks for; without it every record is
    # followed by an empty line on Windows.  Output on POSIX is unchanged.
    with open(path, "a", newline="") as fp:
        writer = csv.writer(fp)
        if not exists:
            writer.writerow(fields)
        writer.writerow([row[x] for x in fields])


def _ca_canvas_text(driver, xpath):
    """Return the string ``getCanvas`` yields for the element at ``xpath``."""
    return getCanvas(driver.find_element_by_xpath(xpath), driver)


def _ca_percent_or_nan(text):
    """Convert a string such as ``"12%"`` to ``12``; NaN when not an integer."""
    try:
        return int(text.replace("%", ""))
    except ValueError:
        return np.nan


def _ca_parse_age_text(age_text):
    """Map ``Age_<label>`` to its percentage from the combined age text.

    Two layouts are understood, both with all labels first and all
    percentages afterwards:

    * four numeric ranges plus one capitalised word, followed by exactly
      five percentages;
    * the same five labels followed by one or more percentages.  When only
      three percentages are present they are padded with NaN on both sides.

    A layout in which labels and percentages alternate is not supported.

    Raises:
        KeyError: if neither layout matches.
    """
    match = re.search(
        r"([\d,-]+)[ ]+([\d,-]+)[ ]+([\d,-]+)[ ]+([\d,+]+)[ ]+[‘]*([A-Z][a-z]+)[ ]+([\d]+)%[ ]+([\d]+)%[ ]+([\d]+)%[ ]+([\d]+)%[ ]+([\d]+)%",
        age_text)
    five_cats = False
    if match is None:
        match = re.search(
            r"([\d,-]+)[ ]+([\d,-]+)[ ]+([\d,-]+)[ ]+([\d,-]+)[ ]+([A-Z][a-z]+)[ ]+([\d]+%[ ]*)+",
            age_text)
        five_cats = match is not None
    if match is None:
        raise KeyError("Failed at finding age groups")

    groups = match.groups()
    if not five_cats:
        # First half of the captures are labels, second half percentages.
        half = len(groups) // 2
        age_groups = [f"Age_{i}" for i in groups[:half]]
        age_percentages = [int(i) for i in groups[half:]]
    else:
        age_groups = [f"Age_{i}" for i in groups[:5]]
        age_percentages = [
            int(i.replace("%", "")) for i in re.findall(r"[\d]+%", age_text)
        ]
        if len(age_percentages) == 3:
            age_percentages = [np.nan] + age_percentages + [np.nan]
    return dict(zip(age_groups, age_percentages))


def _ca_parse_x_of_y(text, name):
    """Parse ``"<X> of <Y>"`` (thousands separators allowed) out of ``text``.

    Returns ``(X, Y)`` as ints, or ``None`` after printing a warning when the
    pattern is absent.
    """
    match = re.search(r"([\d,]+) of ([\d,]+)", text)
    if match is None:
        print(f"Warning: no {name} extracted; got string", text)
        return None
    first, second = match.groups()
    return int(first.replace(",", "")), int(second.replace(",", ""))


def _ca_scrape_public_dashboard(driver):
    """Read totals, age, sex and race tiles from the loaded public dashboard.

    Returns:
        ``(out, out_race)``: ``out`` holds the three totals, the ``Age_*``
        percentages and the sex percentages; ``out_race`` maps each
        race/ethnicity label to its percentage (NaN when not an integer).

    Raises:
        KeyError: if the age text matches no supported layout.
        ValueError: if the race tiles do not yield exactly 16 non-empty
            lines (8 labels followed by 8 percentages).
    """
    out = {}
    out["TotalCases"] = get_digits(
        "TotalCases",
        '//*[@id="view8860806102834544352_2954032034214900649"]/div[1]/div[2]/canvas[1]',
        driver)
    out["TotalFatalities"] = get_digits(
        "TotalFatalities",
        '//*[@id="view8860806102834544352_10936283936734129650"]/div[1]/div[2]/canvas[1]',
        driver)
    out["TotalTested"] = get_digits(
        "TotalTested",
        '//*[@id="view8860806102834544352_12188172174700680575"]/div[1]/div[2]/canvas[1]',
        driver)

    # Age: labels come from one canvas, percentages from another.
    age_groups = _ca_canvas_text(
        driver,
        '//*[@id="tabZoneId257"]/div/div/div/div/div[5]/div[1]/canvas'
    ).replace("\n", " ")
    age_text = age_groups.replace(".", "-")
    try:
        age_perc = _ca_canvas_text(
            driver,
            '//*[@id="view8860806102834544352_11651535759491462876"]/div[1]/div[2]/canvas[1]'
        ).replace("\n", " ")
        age_text = f"{age_text} {age_perc}"
    except Exception as err:
        # Best effort: continue with the label text alone, but say so and
        # let KeyboardInterrupt/SystemExit propagate.
        print("Warning: could not read CA age percentages:", err)
    out.update(_ca_parse_age_text(age_text))

    # Sex
    sex = _ca_canvas_text(
        driver,
        '//*[@id="tabZoneId247"]/div/div/div/div/div[5]/div[1]/canvas')
    sex = sex.replace("‘", "").replace("\n\n", " ")
    sex_perc = _ca_canvas_text(
        driver,
        '//*[@id="view8860806102834544352_15384157321978781716"]/div[1]/div[2]/canvas[1]'
    ).replace("\n\n", " ")
    sex = f"{sex}\n\n{sex_perc}"
    match = re.search(
        r"Female\nMale Unknown\n\n([\d]+)% ([\d]+)% ([\d]+)%", sex)
    if match is not None:
        female, male, unknown = (int(g) for g in match.groups())
    else:
        female = male = unknown = np.nan
    out["male_pos"] = male
    out["female_pos"] = female
    out["sex_unknown_pos"] = unknown

    # Race/ethnicity
    race = _ca_canvas_text(
        driver,
        '//*[@id="tabZoneId246"]/div/div/div/div[1]/div[5]/div[1]/canvas')
    race_perc = _ca_canvas_text(
        driver,
        '//*[@id="view8860806102834544352_2377024103324179123"]/div[1]/div[2]/canvas[1]')
    lines = [x for x in race.split("\n") + race_perc.split("\n") if x]
    if len(lines) != 16:
        raise ValueError("incorrect number of races")
    race_cats = [x.replace('‘', "").replace(".", "") for x in lines[:8]]
    race_vals = [_ca_percent_or_nan(x) for x in lines[8:]]
    out_race = dict(zip(race_cats, race_vals))
    return out, out_race


def _ca_scrape_hospital_dashboard(driver):
    """Read patient counts and response figures from the hospital dashboard.

    Returns a dict with the four ``get_digits`` counts plus the figures
    derived from the two "X of Y" tiles.  A tile whose text does not contain
    "X of Y" produces a warning and NaN values instead of an error.
    """
    out = {}
    out["posPatients"] = get_digits(
        "PosPatients",
        '//*[@id="view11327846829742299964_9307352602670595869"]/div[1]/div[2]/canvas[1]',
        driver)
    out["posICU"] = get_digits(
        "PosICU",
        '//*[@id="view11327846829742299964_7957542083138737667"]/div[1]/div[2]/canvas[1]',
        driver)
    out["suspectedICU"] = get_digits(
        "suspectedICU",
        '//*[@id="view11327846829742299964_12557426117314542746"]/div[1]/div[2]/canvas[1]',
        driver)
    out["suspectedPatients"] = get_digits(
        "suspectedPatients",
        '//*[@id="view11327846829742299964_7498269305876793953"]/div[1]/div[2]/canvas[1]',
        driver)

    responding_facilities = _ca_canvas_text(
        driver,
        '//*[@id="view11327846829742299964_17652355579425549403"]/div[1]/div[2]/canvas[1]')
    responding_beds = _ca_canvas_text(
        driver,
        '//*[@id="view11327846829742299964_12615353459920747640"]/div[1]/div[2]/canvas[1]')

    # NOTE(review): both "*_yes_num" columns store the SECOND number of
    # "X of Y".  That is what the previous code wrote, so it is kept, but it
    # looks like it may have been meant to be X -- confirm against the data.
    facilities = _ca_parse_x_of_y(responding_facilities,
                                  "responding facilities")
    if facilities is None:
        out["responding_facilities_yes_percent"] = np.nan
        out["responding_fasilities_yes_num"] = np.nan
    else:
        part, total = facilities
        out["responding_facilities_yes_percent"] = (
            part / total if total else np.nan)
        out["responding_fasilities_yes_num"] = total

    beds = _ca_parse_x_of_y(responding_beds, "responding beds")
    if beds is None:
        out["responding_beds_no_num"] = np.nan
        out["responding_beds_yes_num"] = np.nan
    else:
        part, total = beds
        out["responding_beds_no_num"] = total - part
        out["responding_beds_yes_num"] = total
    return out


def run_CA(args):
    """Collect California COVID-19 data and append it to the CA data files.

    While the local ``new`` switch is True this only delegates to
    ``run_new_CA()``.  Otherwise the legacy flow drives Chrome against two
    Tableau dashboards, appends one row each to the general, race and
    hospital CSV files, and saves a PNG snapshot of each dashboard under the
    raw directory.

    NOTE(review): the original indentation of this function was lost; the
    whole legacy flow (including the hospital section) is assumed to belong
    to the ``else`` branch of ``if new:`` -- confirm.

    Args:
        args: not read by this function.

    Raises:
        KeyError: if the age tile matches no supported layout.
        ValueError: if the race tiles have an unexpected number of entries.
    """
    # Parameters
    raw_name = '../CA/raw'
    data_name = '../CA/data/data.csv'
    race_data_name = '../CA/data/race_data.csv'
    hospital_data_name = '../CA/data/hospital_data.csv'
    now = str(datetime.now())

    new = True
    if new:
        run_new_CA()
        return

    # driver = webdriver.Safari()
    driver = webdriver.Chrome(
        executable_path="andrew/ChromeDriver/chromedriver.exe")
    try:
        driver.maximize_window()
        driver.get("https://public.tableau.com/views/COVID-19PublicDashboard/Covid-19Public?:embed=y&:display_count=no&:showVizHome=no")
        time.sleep(10)  # More robust to wait for elements to appear...
        out, out_race = _ca_scrape_public_dashboard(driver)
    finally:
        # Close the browser even when scraping raised.
        driver.close()

    out["Scrape_Time"] = now
    _ca_append_row(data_name, out)

    # Let's make a best effort to get the raw data...
    img = requests.get("https://public.tableau.com/static/images/CO/COVID-19PublicDashboard/Covid-19Public/1_rss.png")
    out_race["Scrape_Time"] = now
    _ca_append_row(race_data_name, out_race)
    with open(f"{raw_name}/{now}.png", "wb") as fp:
        fp.write(img.content)

    # California hospital situation
    # driver = webdriver.Safari()
    driver = webdriver.Chrome(
        executable_path="andrew/ChromeDriver/chromedriver.exe")
    try:
        driver.get("https://public.tableau.com/views/COVID-19PublicDashboard/Covid-19Hospitals?%3Aembed=y&%3Adisplay_count=no&%3AshowVizHome=no")
        time.sleep(10)  # More robust to wait for elements to appear...
        driver.implicitly_wait(5)
        out = _ca_scrape_hospital_dashboard(driver)
    finally:
        driver.close()

    out["Scrape_Time"] = now
    _ca_append_row(hospital_data_name, out)
    img = requests.get("https://public.tableau.com/static/images/CO/COVID-19PublicDashboard/Covid-19Hospitals/1_rss.png")
    with open(f"{raw_name}/{now}_hospital.png", "wb") as fp:
        fp.write(img.content)
def _wy_canvas_text(driver, xpath):
    """Return the string ``getCanvas`` yields for the element at ``xpath``."""
    return getCanvas(driver.find_element_by_xpath(xpath), driver)


def _wy_graph(driver, xpath, color):
    """Return what ``getGraph`` yields for the element at ``xpath`` and ``color``."""
    return getGraph(driver.find_element_by_xpath(xpath), color, driver)


def _wy_title_labels(text):
    """Split ``text`` on newlines into title-cased labels without spaces.

    Empty lines are dropped; every remaining line is stripped first.
    """
    return [
        x.strip().title().replace(" ", "") for x in text.split("\n")
        if x != ""
    ]


def _wy_rounded(prefix, labels, values):
    """Pair ``labels`` with ``values`` as ``{prefix + label: round(value, 1)}``."""
    return {prefix + label: round(val, 1) for label, val in zip(labels, values)}


def _wy_enter_only_iframe(driver, error):
    """Switch into the page's single iframe; raise ``Exception(error)`` otherwise."""
    frames = driver.find_elements_by_tag_name('iframe')
    if len(frames) != 1:
        raise Exception(error)
    driver.switch_to.frame(frames[0])


def _wy_rightmost_column_with_color(img, color):
    """Return the largest x at which ``img`` has a pixel equal to ``color``.

    Returns ``None`` when no pixel matches.  Columns are scanned from the
    right so the scan stops at the first hit.
    """
    pix = img.load()
    cols, rows = img.size  # indexing is backward...
    for c in reversed(range(cols)):
        if any(pix[c, r] == color for r in range(rows)):
            return c
    return None


def _wy_scrape_case_dashboard(driver):
    """Read counts and percentage breakdowns from the loaded case dashboard.

    Returns a dict with the confirmed/probable/recovered counts and deaths
    (as strings, ``Deaths`` may be ``None``) plus ``Pct_*`` entries rounded
    to one decimal.

    Raises:
        Exception: if a count tile cannot be parsed, or if any breakdown has
            an unexpected number of values or unexpected labels.
    """
    out = {}
    blue = (78, 121, 167, 255)

    cases = _wy_canvas_text(
        driver,
        '//*[@id="view3855800012607193825_5172391045180469540"]/div[1]/div[2]/canvas[1]'
    ).replace(",", "").replace("/", "")
    match = re.match(r"(\d+)\s+lab\s+confirmed\s+cases\s+(\d+)\s+recovered",
                     cases.strip().lower())
    if not match:
        raise Exception(
            "Warning: no total cases extracted for Wyoming; got string" +
            cases)
    out["TotalConfirmedCases"] = match.group(1)
    out["RecoveredConfirmedCases"] = match.group(2)

    pcases = _wy_canvas_text(
        driver,
        '//*[@id="view3855800012607193825_2191712128240212356"]/div[1]/div[2]/canvas[1]'
    ).replace(",", "").replace("/", "")
    match = re.match(r"(\d+)\s+probable\s+cases\s+(\d+)\s+recovered",
                     pcases.strip().lower())
    if not match:
        # Report the probable-cases text; the message used to show the
        # confirmed-cases wording and string here.
        raise Exception(
            "Warning: no probable cases extracted for Wyoming; got string" +
            pcases)
    out["TotalProbableCases"] = match.group(1)
    out["RecoveredProbableCases"] = match.group(2)

    deaths = _wy_canvas_text(
        driver,
        '//*[@id="view3855800012607193825_11972683903544902318"]/div[1]/div[2]/canvas[1]'
    ).replace(",", "").replace("/", "")
    match = re.match(r"(\d+)\s+death", deaths.strip().lower())
    if match:
        out["Deaths"] = match.group(1)
    else:
        out["Deaths"] = None
        print("Warning: no death count for Wyoming; got string", deaths)

    # Age
    ages = _wy_graph(
        driver,
        '//*[@id="view3855800012607193825_719033729591027206"]/div[1]/div[2]/canvas[1]',
        blue)
    if len(ages) != 8:
        raise Exception("Wrong age count for WY")
    out.update(_wy_rounded("Pct_Age_", [
        "0_17", "18_29", "30_39", "40_49", "50_59", "60_69", "70_79",
        "80_plus"
    ], ages))

    # Gender
    genders = _wy_graph(
        driver,
        '//*[@id="view3855800012607193825_14275175901841894353"]/div[1]/div[2]/canvas[1]',
        blue)
    gender_labels = [
        x.title() for x in _wy_canvas_text(
            driver,
            '//*[@id="tabZoneId32"]/div/div/div/div[1]/div[5]/div[1]/canvas'
        ).strip().split()
    ]
    if len(genders) != 4 or len(gender_labels) != 4 or " ".join(
            sorted(gender_labels)) != "Female Male Other Unknown":
        raise Exception("Wrong gender vals for WY")
    out.update(_wy_rounded("Pct_Gender_", gender_labels, genders))

    # Symptoms
    symptoms = _wy_graph(
        driver,
        '//*[@id="view3855800012607193825_13010788587209822541"]/div[1]/div[2]/canvas[1]',
        blue)
    symptom_labels = _wy_title_labels(
        _wy_canvas_text(
            driver,
            '//*[@id="tabZoneId49"]/div/div/div/div[1]/div[5]/div[1]/canvas'
        ).strip())
    if len(symptoms) != 14 or len(symptom_labels) != 14 or " ".join(
            sorted(symptom_labels)
    ) != 'AbdominalPain Chills Cough Diarrhea Fatigue Fever Headache LossOfSmell/Taste MuscleAches NauseaOrVomiting None RunnyNose ShortnessOfBreath SoreThroat':
        print(sorted(symptom_labels))
        raise Exception("Unexpected symptoms in WY")
    out.update(_wy_rounded("Pct_Symptom_", symptom_labels, symptoms))

    # Exposure
    exposures = _wy_graph(
        driver,
        '//*[@id="view3855800012607193825_11422738650703355835"]/div[1]/div[2]/canvas[1]',
        blue)
    exposure_labels = [
        x.replace("CommunitySpread", "CommunityAcquired")
        for x in _wy_title_labels(
            _wy_canvas_text(
                driver,
                '//*[@id="tabZoneId52"]/div/div/div/div/div[5]/div[1]/canvas'
            ).strip())
    ]
    expected = 'CommunalLiving CommunityAcquired ContactWithAKnownCase DomesticTravel InternationalTravel Other PendingInvestigation Unknown'
    if len(exposures) != 8 or len(exposure_labels) != 8 or " ".join(
            sorted(exposure_labels)) != expected:
        print(" ".join(sorted(exposure_labels)))
        print(expected)
        raise Exception("Unexpected exposures in WY")
    out.update(_wy_rounded("Pct_Exposure_", exposure_labels, exposures))

    # Underlying conditions
    underlying = _wy_graph(
        driver,
        '//*[@id="view3855800012607193825_1672645675164053982"]/div[1]/div[2]/canvas[1]',
        blue)
    underlying_labels = _wy_title_labels(
        _wy_canvas_text(
            driver,
            '//*[@id="tabZoneId53"]/div/div/div/div[1]/div[5]/div[1]/canvas'
        ).strip())
    if len(underlying_labels) != 3 or len(underlying) != 3 or " ".join(
            sorted(underlying_labels)) != "No Unknown Yes":
        raise Exception("Unexpected underlying conditions in WY")
    out.update(
        _wy_rounded("Pct_UnderlyingCond_", underlying_labels, underlying))

    # Hospitalization
    hosp = _wy_graph(
        driver,
        '//*[@id="view3855800012607193825_16872468006943659536"]/div[1]/div[2]/canvas[1]',
        blue)
    # "NoHospitalization" must be rewritten before "Hospitalization".
    hosp_labels = [
        x.replace("NoHospitalization", "No").replace("Hospitalization", "Yes")
        for x in _wy_title_labels(
            _wy_canvas_text(
                driver,
                '//*[@id="tabZoneId54"]/div/div/div/div[1]/div[5]/div[1]/canvas'
            ).strip())
    ]
    if len(hosp_labels) != 3 or len(hosp) != 3 or " ".join(
            sorted(hosp_labels)) != "No Unknown Yes":
        raise Exception("Unexpected hospitalization data in WY")
    out.update(_wy_rounded("Pct_Hospitalized_", hosp_labels, hosp))

    # Race
    race = _wy_graph(
        driver,
        '//*[@id="view3855800012607193825_4426486129312330342"]/div[1]/div[2]/canvas[1]',
        blue)
    race_labels = [
        "PacificIslander" if "Hawaii" in x else x
        for x in _wy_title_labels(
            _wy_canvas_text(
                driver,
                '//*[@id="tabZoneId60"]/div/div/div/div[1]/div[5]/div[1]/canvas'
            ).strip())
    ]
    if len(race) != len(race_labels) or sorted(race_labels) != [
            'AmericanIndian', 'Asian', 'Black', 'Hispanic', 'Other',
            'PacificIslander', 'Unknown', 'White'
    ]:
        raise Exception("Unexpected race data in WY")
    out.update(_wy_rounded("Pct_Race_", race_labels, race))
    return out


def _wy_scrape_testing_dashboard(driver):
    """Read lab totals, positive share and tests-by-age from the testing page.

    Returns a dict with ``WPHLTotalTest`` / ``CommercialLabTotalTest``
    (strings, or ``None`` when the totals cannot be parsed),
    ``TestPositivePercentage`` and the ``Test_Pct_Age_*`` values.

    Raises:
        Exception: on unexpected lab labels, a missing positive-colour
            region, or an unexpected age layout.
    """
    out = {}

    testing = _wy_canvas_text(
        driver,
        '//*[@id="view4597669659173455094_6899958757650081769"]/div[1]/div[2]/canvas[1]'
    ).replace(",", "")
    testing_labels = [
        x.replace("_", "-") for x in _wy_title_labels(
            _wy_canvas_text(
                driver,
                '//*[@id="tabZoneId10"]/div/div/div/div[1]/div[5]/div[1]/canvas'
            ))
    ]
    if sorted(testing_labels) != ['GrandTotal', 'Non-Wphl', 'Wphl']:
        print(testing_labels)
        raise Exception("Unexpected testing lab labels in WY")
    match = re.match(r"(\d+)\s+(\d+)\s+(\d+)", testing.strip())
    if match:
        # The numbers are taken in the same order as the labels.
        out["WPHLTotalTest"] = match.group(testing_labels.index("Wphl") + 1)
        out["CommercialLabTotalTest"] = match.group(
            testing_labels.index("Non-Wphl") + 1)
    else:
        print(testing.strip())
        print(
            "Warning: unexpected testing lab results in WY; skipping extraction"
        )
        out["WPHLTotalTest"] = None
        out["CommercialLabTotalTest"] = None

    # Find width of the testing positive region within the whole image
    test_pos = driver.find_element_by_xpath(
        '//*[@id="view4597669659173455094_10103530389136289716"]/div[1]/div[2]/canvas[1]'
    )
    b64 = driver.execute_script(
        "return arguments[0].toDataURL('image/png').substring(21);", test_pos)
    img = Image.open(BytesIO(base64.b64decode(b64)))
    max_col = _wy_rightmost_column_with_color(img, (252, 141, 98, 255))
    if max_col is None:
        print(getColors(test_pos, driver))
        raise Exception("Could not find testing positive color in WY")
    out["TestPositivePercentage"] = (max_col + 1) / img.size[0] * 100

    test_by_age = _wy_graph(
        driver,
        '//*[@id="view4597669659173455094_719033729591027206"]/div[1]/div[2]/canvas[1]',
        (191, 198, 212, 255))
    age_labels = _wy_title_labels(
        _wy_canvas_text(
            driver,
            '//*[@id="tabZoneId12"]/div/div/div/div[1]/div[5]/div[1]/canvas'))
    age_map = {
        '<18Years': "0_18",
        '19-29Years': "19_29",
        '30-39Years': "30_39",
        '40-49Years': "40_49",
        '50-59Years': "50_59",
        '60-69Years': "60_69",
        '70-79Years': "70_79",
        '80+Years': "80_plus"
    }
    if len(age_labels) != 8 or len(test_by_age) != 8 or not all(
            x in age_map for x in age_labels):
        raise Exception("Unexpected test age layout")
    for dat, lab in zip(test_by_age, age_labels):
        out["Test_Pct_Age_" + age_map[lab]] = dat
    return out


def run_WY(args):
    """Scrape Wyoming's two Tableau dashboards and append one CSV row.

    Drives Chrome through the case dashboard and the testing dashboard,
    appends the combined values (plus ``Scrape_Time``) to the WY data CSV,
    and saves a PNG snapshot of each dashboard under the raw directory.
    The browser is closed even when scraping fails.

    Args:
        args: not read by this function.

    Raises:
        Exception: if a page does not contain exactly one iframe, or if any
            scraped tile has an unexpected shape or labels.
    """
    # Parameters
    raw_name = '../WY/raw'
    data_name = '../WY/data/data.csv'
    now = str(datetime.now())

    out = {}
    # driver = webdriver.Safari()
    driver = webdriver.Chrome(
        executable_path="andrew/ChromeDriver/chromedriver.exe")
    try:
        driver.maximize_window()
        driver.get(
            "https://public.tableau.com/profile/melissa.taylor#!/vizhome/EpiCOVIDtest/Dashboard"
        )
        time.sleep(10)  # More robust to wait for elements to appear...
        _wy_enter_only_iframe(driver, "Could not find iframe")
        out.update(_wy_scrape_case_dashboard(driver))

        driver.get(
            "https://public.tableau.com/profile/melissa.taylor#!/vizhome/shared/8BBTPD39D"
        )
        time.sleep(10)  # More robust to wait for elements to appear...
        _wy_enter_only_iframe(driver, "Could not find iframe on second page")
        out.update(_wy_scrape_testing_dashboard(driver))
    finally:
        driver.close()

    out["Scrape_Time"] = now
    fields = sorted(out)
    exists = os.path.exists(data_name)
    # newline="" is what the csv module asks for; without it every record is
    # followed by an empty line on Windows.  Output on POSIX is unchanged.
    with open(data_name, "a", newline="") as fp:
        writer = csv.writer(fp)
        if not exists:
            writer.writerow(fields)
        writer.writerow([out[x] for x in fields])

    # Let's make a best effort to get the raw data...
    img = requests.get(
        "https://public.tableau.com/static/images/Ep/EpiCOVIDtest/Dashboard/1.png"
    )
    with open(f"{raw_name}/{now}.png", "wb") as fp:
        fp.write(img.content)
    img = requests.get(
        "https://public.tableau.com/static/images/Wy/WyomingCOVID-19TestingDataDashboard/Dashboard1/1.png"
    )
    with open(f"{raw_name}/testing_{now}.png", "wb") as fp:
        fp.write(img.content)