import csv
import os
import re

import pandas as pd

# Assumed project helpers; the import path below is a guess, not the
# project's actual layout:
# from utils import (zipContextManager, build_leveled_mapping,
#                    extract_arcgis_attributes, map_attributes, atoi,
#                    Fields, DATE, TS, DATE_USED)


def handle_ut(res, mapping):
    tagged = []
    prefix = ("Overview_Total People Tested Seven-Day Rolling Average "
              "Percent Positive Rates by Specimen Collection")
    with zipContextManager(res[-1]) as zipdir:
        with os.scandir(zipdir) as it:
            for entry in it:
                # NOTE: is_file is a method; the original tested the bound
                # method itself, which is always truthy
                if entry.is_file() and entry.name.startswith(prefix):
                    df = pd.read_csv(os.path.join(zipdir, entry.name),
                                     parse_dates=['Collection Date'])
                    df = df.rename(columns=mapping)
                    df['UNITS'] = 'People'
                    # weekly rolling percent-positive rows
                    ppr = df.loc[:, ['TIMESTAMP', 'PPR', 'UNITS']]
                    ppr['WINDOW'] = 'Week'
                    ppr['SID'] = 'ut-1'
                    tagged.extend(ppr.to_dict(orient='records'))
                    # add the daily values
                    totals = df.loc[:, ['TIMESTAMP', 'UNITS', 'POSITIVE']]
                    totals['TOTAL'] = df['POSITIVE'] + df['NEGATIVE']
                    totals['WINDOW'] = 'Day'
                    totals['SID'] = 'ut-2'
                    tagged.extend(totals.to_dict(orient='records'))
                    break
    return tagged
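# All of these handlers lean on zipContextManager, whose source isn't
# shown here. The sketch below is only an assumption of its contract:
# given a URL to a zip archive (res[-1] above), it downloads and extracts
# the archive, yields the temp directory, and cleans up on exit.
import tempfile
import urllib.request
import zipfile
from contextlib import contextmanager


@contextmanager
def zipContextManager_sketch(zipurl):
    # hypothetical stand-in, not the project's actual helper
    with tempfile.TemporaryDirectory() as tmpdir:
        zippath, _ = urllib.request.urlretrieve(zipurl)
        with zipfile.ZipFile(zippath) as zf:
            zf.extractall(tmpdir)
        yield tmpdir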
def handle_ga(res, mapping):
    tagged = []
    filename = "pcr_positives.csv"
    with zipContextManager(res[-1]) as zipdir:
        # read_csv accepts a path directly; no need to open the file ourselves
        df = pd.read_csv(os.path.join(zipdir, filename),
                         parse_dates=['report_date'])
        df = df[df['county'] == 'Georgia']

        sid = 1

        def get_sid():
            return "ga-{}".format(sid)

        # alltime/daily
        latest = df.sort_values('report_date').iloc[-1]
        # daily
        tagged.append({
            'TOTAL': latest['ALL PCR tests performed'],
            'POSITIVE': latest['All PCR positive tests'],
            'TIMESTAMP': latest['report_date'],
            'WINDOW': 'Day',
            'UNITS': 'Tests',
            'SID': get_sid(),
        })
        # all time
        sid += 1
        tagged.append({
            'TOTAL': latest['Running total of all PCR tests'],
            # pandas appends ".1" when two source columns share a name
            'POSITIVE': latest['Running total of all PCR tests.1'],
            'TIMESTAMP': latest['report_date'],
            'WINDOW': 'Alltime',
            'UNITS': 'Tests',
            'SID': get_sid(),
        })

        # separate it into 7 & 14 day rates
        windows = {
            'Week': '7 day percent positive',
            '14Days': '14 day percent positive',
        }
        for window, column in windows.items():
            sid += 1
            pct = (df.filter(mapping.keys())
                   .rename(columns=mapping)
                   .drop(columns='PPR'))
            pct['PPR'] = pd.to_numeric(df[column], errors='coerce')
            pct['WINDOW'] = window
            pct['UNITS'] = 'Tests'
            pct['SID'] = get_sid()
            # extend, not append: to_dict(orient='records') returns a list
            tagged.extend(pct.to_dict(orient='records'))
    return tagged
def handle_ga(res, mapping):
    tagged = []
    file_mapping = build_leveled_mapping(mapping)
    with zipContextManager(res[0]) as zipdir:
        for filename in file_mapping.keys():
            date_fields = [
                k for k, v in file_mapping[filename].items()
                if v == 'TIMESTAMP'
            ]
            df = pd.read_csv(os.path.join(zipdir, filename),
                             parse_dates=date_fields)
            df = df[df['county'] == 'Georgia']
            by_date = file_mapping[filename].pop(DATE_USED)
            df = df.rename(columns=file_mapping[filename])
            df[DATE_USED] = by_date
            tagged.extend(df.to_dict(orient='records'))
    return tagged
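# build_leveled_mapping is another unshown project helper. One plausible
# reading, used as an assumption here: the flat mapping's keys embed the
# file (or tab) name as a prefix, e.g.
# "pcr_positives.csv:report_date" -> "TIMESTAMP", and the helper groups
# them into {filename: {column: tagged_name}}.
def build_leveled_mapping_sketch(mapping, sep=':'):
    # hypothetical stand-in; the key format is an assumption
    leveled = {}
    for key, value in mapping.items():
        filename, column = key.split(sep, 1)
        leveled.setdefault(filename, {})[column] = value
    return leveled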
def handle_ga(res, mapping):
    tagged = {}
    for result in res[:-1]:
        partial = extract_arcgis_attributes(result, mapping, debug_state='GA')
        tagged.update(partial)
    tagged[Fields.CURR_HOSP.name] += tagged.pop('CURR_HOSP_PUI')

    # last item is a zip of CSV summaries
    files = ["total_testing.csv", "summary_totals.csv"]
    with zipContextManager(res[-1]) as zipdir:
        for filename in files:
            # use a with-block so the file handle is closed;
            # only the last row of each summary file is of interest
            with open(os.path.join(zipdir, filename)) as f:
                summary = list(csv.DictReader(f))[-1]
            partial = map_attributes(summary, mapping, 'GA')
            tagged.update(partial)
    return tagged
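# map_attributes is also project code that isn't shown. A minimal sketch,
# assuming it renames the keys of one raw record according to the mapping
# and drops everything unmapped (the state argument presumably only
# labels debug output):
def map_attributes_sketch(record, mapping, debug_state=None):
    # hypothetical stand-in for the project's helper
    return {mapping[k]: v for k, v in record.items() if k in mapping}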
def handle_ut(res, mapping):
    zipurl = res[-1]
    mapped = []
    tab_mapping = build_leveled_mapping(mapping)

    def find_entry_mapping(name, tab_mappings):
        for x in tab_mappings.keys():
            if name.startswith(x):
                return tab_mappings[x]
        return None

    with zipContextManager(zipurl) as zipdir:
        with os.scandir(zipdir) as it:
            for entry in it:
                entry_mapping = find_entry_mapping(entry.name, tab_mapping)
                if not entry_mapping:
                    continue
                df = pd.read_csv(os.path.join(zipdir, entry.name))
                # column names such as "Cumulative Cases" mark running totals
                cumulative = any(x.find('umulative') > 0 for x in df.columns)
                df = df.rename(columns=entry_mapping).set_index(DATE)
                df.index = pd.to_datetime(df.index)

                # 1. Special handling for testing files
                if 'Test Type' in df.columns:
                    df = df.pivot(columns=['Test Type', 'Result'],
                                  values='Count')
                    df.columns = df.columns.map("-".join)
                    df = df.rename(columns=entry_mapping).sort_index()
                    # sum columns that mapped to the same name
                    df = df.groupby(df.columns.values, axis=1).sum()

                # 2. Decide whether cumulative or not
                if not cumulative:
                    df = df.sort_index().cumsum()

                # 3. Add DATE_USED + administrivia
                df[TS] = df.index
                df[DATE_USED] = entry_mapping[DATE_USED]
                mapped.extend(df.to_dict(orient='records'))
    return mapped
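# A toy illustration (made-up data, not project code) of the
# pivot-then-flatten step above: rows of (date, Test Type, Result, Count)
# become one column per "Test Type-Result" pair, which entry_mapping can
# then rename.
_demo = pd.DataFrame({
    'Date': ['2021-01-01', '2021-01-01'],
    'Test Type': ['PCR/amplification', 'PCR/amplification'],
    'Result': ['POSITIVE', 'NEGATIVE'],
    'Count': [5, 95],
}).set_index('Date')
_wide = _demo.pivot(columns=['Test Type', 'Result'], values='Count')
_wide.columns = _wide.columns.map('-'.join)
# _wide now has one row per date with columns
# 'PCR/amplification-POSITIVE' and 'PCR/amplification-NEGATIVE'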
def handle_ga(res, mapping):
    tagged = []
    file_mapping = build_leveled_mapping(mapping)
    with zipContextManager(res[0]) as zipdir:
        for filename in file_mapping.keys():
            date_fields = [
                k for k, v in file_mapping[filename].items()
                if v == 'TIMESTAMP'
            ]
            df = pd.read_csv(os.path.join(zipdir, filename),
                             parse_dates=date_fields)
            # funny stuff:
            if filename.startswith('pcr_positive'):
                # the columns have the same name #facepalm
                df.columns = [
                    'county', 'TIMESTAMP', '_', 'SPECIMENS', '_',
                    'SPECIMENS_POS', '_', '_'
                ]
            df = df[df['county'] == 'Georgia']
            by_date = file_mapping[filename].pop(DATE_USED)
            df = df.rename(columns=file_mapping[filename])
            df[DATE_USED] = by_date
            tagged.extend(df.to_dict(orient='records'))
    return tagged
def handle_ut(res, mapping):
    tagged = {}
    soup_start = 1
    for result in res[:soup_start]:
        partial = extract_arcgis_attributes(result, mapping, 'UT')
        tagged.update(partial)

    stats = res[1]
    for k, v in mapping.items():
        x = stats.find(id=k)
        if x:
            value_item = x.find(class_='value')
            if not value_item:
                value_item = x.find(class_='value-output')
            if not value_item:
                continue
            value = atoi(value_item.get_text(strip=True))
            tagged[v] = value

    # inverse mapping
    revmap = {v: k for k, v in mapping.items()}
    hosp = res[2]
    tables = hosp.find_all('table')
    curr_hosp_table = tables[0]
    tds = curr_hosp_table.find_all(
        'td', string=re.compile(revmap[Fields.CURR_HOSP.name]))
    curr_hosp = 0
    for td in tds:
        for x in td.next_siblings:
            if x.name == 'td':
                curr_hosp += atoi(x.get_text(strip=True))
    tagged[Fields.CURR_HOSP.name] = curr_hosp

    # TODO: code here can be improved, combined with top part
    td = curr_hosp_table.find(
        'td', string=re.compile(revmap[Fields.CURR_ICU.name]))
    for x in td.next_siblings:
        if x.name == 'td':
            val = atoi(x.get_text(strip=True))
            tagged[Fields.CURR_ICU.name] = val

    for t in tables[1:]:
        if t.caption.get_text(strip=True) in mapping:
            td = t.find_all('td', limit=2)[1]
            tagged[mapping[t.caption.get_text(strip=True)]] = atoi(
                td.get_text(strip=True))

    # Downloadable file
    zipurl = res[-1]

    # Sometimes there are files for multiple dates; we need the most recent
    specimens_file_prefix = 'Overview_Total Tests by'
    specimens_file_latest = specimens_file_prefix
    recovered_file = 'Overview_Cumulative COVID-19 Cases'
    recovered_file_latest = recovered_file
    people_tested_file = 'Overview_Number of People Tested by'
    people_tested_latest = people_tested_file

    # the fields lists below are ordered to match the test_type x result loop
    test_type = ['PCR/amplification', 'Antigen by DFA/IF']
    result = ['POSITIVE', 'NEGATIVE']

    with zipContextManager(zipurl) as zipdir:
        with os.scandir(zipdir) as it:
            for entry in it:
                df = None
                fields = []
                # is_file must be called; the bare method is always truthy
                if not entry.is_file():  # just in case
                    continue
                if entry.name.startswith(specimens_file_prefix):
                    if entry.name < specimens_file_latest:
                        continue
                    # specimens
                    fields = [
                        Fields.SPECIMENS_POS, Fields.SPECIMENS_NEG,
                        Fields.ANTIGEN_POS, Fields.ANTIGEN_NEG
                    ]
                    specimens_file_latest = entry.name
                elif entry.name.startswith(people_tested_file):
                    if entry.name < people_tested_latest:
                        continue
                    # people tested
                    fields = [
                        Fields.CONFIRMED, Fields.NEGATIVE,
                        Fields.ANTIGEN_POS_PEOPLE, Fields.ANTIGEN_NEG_PEOPLE,
                        Fields.TOTAL, Fields.ANTIGEN_TOTAL_PEOPLE
                    ]
                    people_tested_latest = entry.name
                elif entry.name.startswith(recovered_file):
                    if entry.name < recovered_file_latest:
                        continue
                    # recoveries
                    fields = [Fields.RECOVERED]
                    recovered_file_latest = entry.name

                if fields and entry.name.startswith(recovered_file):
                    df = pd.read_csv(os.path.join(zipdir, entry.name))
                    last = df['Estimated Recovered *'].iloc[-1]
                    if Fields.RECOVERED in fields:
                        tagged[Fields.RECOVERED.name] = last
                elif fields and not entry.name.startswith(recovered_file):
                    df = pd.read_csv(os.path.join(zipdir, entry.name))
                    summed = df.groupby(['Test Type', 'Result']).sum()
                    i = 0
                    for tt in test_type:
                        for rr in result:
                            tag = fields[i]
                            tag = tag if isinstance(tag, str) else tag.name
                            value = summed.loc[tt, rr]['Count']
                            tagged[tag] = value
                            i += 1
                    # handle totals
                    if Fields.CONFIRMED in fields:
                        tagged[Fields.TOTAL.name] = sum(
                            summed.loc[test_type[0], rr]['Count']
                            for rr in result)
    return tagged
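# atoi is the last unshown helper; the handlers feed it strings scraped
# from HTML, such as "1,234". A minimal sketch under that assumption:
def atoi_sketch(value):
    # hypothetical stand-in: strip thousands separators, then convert
    return int(value.replace(',', '').strip())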