def _scrape(self, **kwargs):
    """Scrape totals and Black/AA case and death counts via a session id.

    Returns a single-element list built by ``self._make_series``.
    """
    results = self.setup_session()
    assert results.x_session_id, 'No X-Session-Id found'
    demographic_cases_df = self.get_demographic_cases_df(
        results.x_session_id)
    demographic_deaths_df = self.get_demographic_deaths_df(
        results.x_session_id)
    date = self.get_date()
    cases = self.get_total_cases(results.x_session_id)
    # Total deaths are not fetched directly; they are the sum of the
    # demographic deaths breakdown.
    deaths = int(demographic_deaths_df['Deaths'].sum())
    aa_cases = int(
        demographic_cases_df.loc['Black or African American', 'Count'])
    aa_deaths = int(
        demographic_deaths_df.loc['Black or African American', 'Deaths'])
    # Percentages are computed over ALL cases/deaths (unknown race is not
    # excluded here, per the flags passed below).
    pct_aa_cases = to_percentage(aa_cases, cases)
    pct_aa_deaths = to_percentage(aa_deaths, deaths)
    return [self._make_series(
        date=date,
        cases=cases,
        deaths=deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=pct_aa_cases,
        pct_aa_deaths=pct_aa_deaths,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
    )]
def test_california_sf():
    """Run the CaliforniaSanFrancisco scraper against canned responses."""
    # Map each request key to its recorded response blob.
    blob_names = {
        'date': 'california_san_francisco_date.txt',
        'cases_by_race': 'california_san_francisco_cases.txt',
        'deaths_by_race': 'california_san_francisco_deaths.txt',
    }
    mocked_requests = {
        key: util.MockSeleniumWireRequest(
            response_body=loader.get_blob(blob))
        for key, blob in blob_names.items()
    }
    mocked_webdriver = util.mocked_webdriver_runner(requests=mocked_requests)
    expected = {
        'Date Published': datetime(2020, 7, 17).date(),
        'Total Cases': 5100,
        'Total Deaths': 130,
        'Count Cases Black/AA': 300,
        'Count Deaths Black/AA': 5,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': False,
        'Pct Cases Black/AA': to_percentage(300, 4400),
        'Pct Deaths Black/AA': to_percentage(5, 30),
        'Count Cases Known Race': 4400,
        'Count Deaths Known Race': 30,
    }
    with mock.patch('covid19_scrapers.states.california_san_francisco.WebdriverRunner',
                    mocked_webdriver):
        util.run_scraper_and_assert(
            scraper_cls=CaliforniaSanFrancisco, assertions=expected)
def _scrape(self, **kwargs):
    """Scrape totals and Black/AA demographics from two Excel downloads."""
    cases_df = pd.read_excel(
        get_content_as_file(self.BASE_URL.format(self.CASES_SUFFIX)))
    # The last row of the totals sheet holds the most recent day.
    most_recent_cases = cases_df.iloc[-1]
    date = most_recent_cases['DATE'].to_pydatetime().date()
    cases = int(most_recent_cases['TOTAL_CASES'])
    deaths = int(most_recent_cases['TOTAL_DEATHS'])
    demographic_df = pd.read_excel(
        get_content_as_file(self.BASE_URL.format(self.DEMOGRAPHIC_SUFFIX)))
    # Select the Black or African American RACE row matching the same
    # date as the totals sheet (the Date column holds date strings).
    most_recent_aa_cases = demographic_df[
        (demographic_df['Category'] == 'RACE')
        & (demographic_df['CAT_DETAIL'] == 'Black or African American')
        & (demographic_df['Date'] == str(date))].iloc[0]
    aa_cases = int(most_recent_aa_cases['Cat_CaseCount'])
    aa_deaths = int(most_recent_aa_cases['CAT_DEATHCOUNT'])
    pct_aa_cases = to_percentage(aa_cases, cases)
    pct_aa_deaths = to_percentage(aa_deaths, deaths)
    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=True,
                          pct_includes_hispanic_black=True)
    ]
def test_delaware():
    """Run the Delaware scraper and check the published values."""
    expected = {
        'Date Published': datetime(2020, 7, 16).date(),
        'Total Cases': 13000,
        'Total Deaths': 500,
        'Count Cases Black/AA': 3000,
        'Count Deaths Black/AA': 100,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': False,
        'Pct Cases Black/AA': to_percentage(3000, 12000),
        'Pct Deaths Black/AA': to_percentage(100, 480),
        'Count Cases Known Race': 12000,
        'Count Deaths Known Race': 480,
    }
    util.run_scraper_and_assert(scraper_cls=Delaware, assertions=expected)
def _scrape(self, **kwargs):
    """Scrape two Tableau dashboards (cases and deaths) by race."""
    runner = WebdriverRunner()
    # Wait until the expected number of <canvas> elements has rendered so
    # the Tableau data requests have been issued before capturing them.
    results = runner.run(
        WebdriverSteps().go_to_url(self.CASES_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 58)
        .find_request('cases', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEATHS_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 29)
        .find_request('deaths', find_by=tableau.find_tableau_request))
    parser = tableau.TableauParser(request=results.requests['cases'])
    raw_date_str = pydash.head(
        parser.extract_data_from_key('cases')['ATTR(Date Updated)'])
    date = datetime.strptime(raw_date_str, '%A, %B %d, %Y').date()
    # Total cases = lab-confirmed + probable.
    confirmed_cases = pydash.head(
        parser.extract_data_from_key('cases')
        ['SUM(# Lab Confirmed Cases)'])
    probable_cases = pydash.head(
        parser.extract_data_from_key('probable cases')['SUM(# probable)'])
    cases = confirmed_cases + probable_cases
    cases_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('raceth')).set_index('sub-category')
    aa_cases = cases_df.loc['Black']['SUM(count)']
    known_race_cases = cases - cases_df.loc['unknown']['SUM(count)']
    parser = tableau.TableauParser(request=results.requests['deaths'])
    deaths = pydash.head(
        parser.extract_data_from_key('death (2)')
        ['SUM(# lab confirmed deaths)'])
    deaths_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('raceth (death)')).set_index(
            'sub-category')
    # Death demographics are published as fractions; convert to counts.
    deaths_df = deaths_df.assign(Count=[
        round(v * deaths) for v in deaths_df['SUM(% of deaths)'].values
    ])
    aa_deaths = deaths_df.loc['Black']['Count']
    known_race_deaths = deaths - deaths_df.loc['unknown']['Count']
    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)
    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_race_cases,
            known_race_deaths=known_race_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """POST the dashboard form, then scrape race/ethnicity tables."""
    # HACK ALERT:
    # The main page dynamically adds JavaScript to insert and
    # submit (POST) a form with the field "_pd" set. The POST body
    # must be mime type multipart/form-data rather than the
    # requests default application/x-www-form-urlencoded. We can
    # make requests generate this by using the files argument
    # instead of data for the form data. Using a file name key of
    # None prevents the extraneous name from being included in the
    # call.
    soup = url_to_soup(self.DATA_URL, method='POST', files={None: b'_pd'})
    # Find the update date
    last_updated_text = soup.find('strong',
                                  string=re.compile('Last Updated'))
    month, day, year = map(
        int,
        re.search(r'(\d)/(\d\d)/(\d\d\d\d)',
                  last_updated_text.parent.text).groups())
    date = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date}')
    # Load the cases by race/ethnicity table
    cases_div = soup.find(id='pnlConfirmedCasesByRaceTbl')
    cases = table_to_dataframe(
        cases_div.find('table')).set_index('Race/Ethnicity')
    # Fix column names: drop the non-breaking space and anything after it.
    cases.columns = cases.columns.str.replace('\xa0.*', '')
    # Extract the data
    total_cases = cases.loc['Total Number of Cases', 'Confirmed Cases']
    known_cases = cases.loc['Total with Race/Ethnicity Available',
                            'Confirmed Cases']
    aa_cases = cases.loc['Non-Hispanic Black', 'Confirmed Cases']
    aa_cases_pct = to_percentage(aa_cases, known_cases)
    # Repeat for the deaths table.
    deaths_div = soup.find(id='pnlDeathsByRaceTbl')
    deaths = table_to_dataframe(
        deaths_div.find('table')).set_index('Race/Ethnicity')
    deaths.columns = deaths.columns.str.replace('\xa0.*', '')
    total_deaths = deaths.loc['Total Number of Deaths', 'Deaths']
    known_deaths = deaths.loc['Total with Race/Ethnicity Available',
                              'Deaths']
    aa_deaths = deaths.loc['Non-Hispanic Black', 'Deaths']
    aa_deaths_pct = to_percentage(aa_deaths, known_deaths)
    return [
        self._make_series(
            date=date,
            cases=_maybe_int(total_cases),
            deaths=_maybe_int(total_deaths),
            aa_cases=_maybe_int(aa_cases),
            aa_deaths=_maybe_int(aa_deaths),
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape statewide totals and Black demographics from a JSON feed.

    Returns a single-element list built by ``self._make_series``.
    Raises AssertionError when any expected field is missing or zero.
    """
    json = get_json(self.DATA_URL)
    # The '-1' path component selects the last (most recent) entry.
    state_info = pydash.get(json, 'state_testing_results.values.-1')
    demographics_data = pydash.get(json, 'demographics.race')
    # BUG FIX: removed the duplicated `aa_data = aa_data = ...` assignment.
    aa_data = pydash.find(
        demographics_data,
        lambda data: data['description'] == 'Black')
    date = datetime.strptime(state_info['testDate'], '%m/%d/%Y').date()
    cases = state_info.get('confirmed_cases')
    deaths = state_info.get('deaths')
    aa_cases = aa_data.get('count')
    aa_deaths = aa_data.get('deaths')
    # dict.get yields None for missing keys; fail loudly rather than emit
    # partial data.
    assert cases, 'Could not find number of confirmed cases'
    assert deaths, 'Could not find number of deaths'
    assert aa_cases, 'Could not find number of AA cases'
    assert aa_deaths, 'Could not find number of AA deaths'
    pct_aa_cases = to_percentage(aa_cases, cases)
    pct_aa_deaths = to_percentage(aa_deaths, deaths)
    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=True,
                          pct_includes_hispanic_black=False)
    ]
def test_california_los_angeles():
    """Run the CaliforniaLosAngeles scraper and check published values."""
    expected = {
        'Date Published': datetime(2020, 7, 16).date(),
        'Total Cases': 150000,
        'Total Deaths': 4000,
        'Count Cases Black/AA': 4000,
        'Count Deaths Black/AA': 400,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': False,
        'Pct Cases Black/AA': to_percentage(4000, 78500),
        'Pct Deaths Black/AA': to_percentage(400, 3650),
        'Count Cases Known Race': 78500,
        'Count Deaths Known Race': 3650,
    }
    util.run_scraper_and_assert(scraper_cls=CaliforniaLosAngeles,
                                assertions=expected)
def _scrape(self, **kwargs):
    """Scrape race demographics from the covid data zip archive."""
    _logger.debug('Download covid data zip file')
    z = get_zip(self.ZIP_URL)
    _logger.debug(
        'Get the last update of the demographics.csv file in archive')
    # The zip member's modification time serves as the report date.
    date = get_zip_member_update_date(z, 'demographics.csv')
    _logger.info(f'Processing data for {date}')
    _logger.debug('Load demographics CSV')
    data = pd.read_csv(get_zip_member_as_file(z, 'demographics.csv'))
    by_race = data[['race', 'Confirmed_Cases', 'Deaths']
                   ].groupby('race').sum()
    totals = by_race.sum(axis=0)
    total_cases = totals['Confirmed_Cases']
    total_deaths = totals['Deaths']
    _logger.debug('African American cases and deaths')
    # Use the first race index entry starting with 'African-American';
    # raises StopIteration if no such row exists.
    aa_key = next(filter(lambda x: x.startswith('African-American'),
                         by_race.index))
    aa_cases = by_race.loc[aa_key, 'Confirmed_Cases']
    aa_cases_pct = to_percentage(aa_cases, total_cases)
    aa_deaths = by_race.loc[aa_key, 'Deaths']
    aa_deaths_pct = to_percentage(aa_deaths, total_deaths)
    return [self._make_series(
        date=date,
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=aa_cases_pct,
        pct_aa_deaths=aa_deaths_pct,
        pct_includes_unknown_race=True,
        pct_includes_hispanic_black=True,
    )]
def _scrape(self, **kwargs):
    """Scrape the dashboard page plus a separate demographic table."""
    soup = url_to_soup(self.URL, local_file_name='nc_soup')
    demographic_df = get_demographic_dataframe()
    date = self.get_date(soup)
    _logger.info(f'Processing data for {date}')
    cases = self.get_total_cases(soup)
    deaths = self.get_total_deaths(soup)
    # Known-race denominators exclude records with missing race info.
    known_cases = cases - self.get_missing_cases(demographic_df)
    known_deaths = deaths - self.get_missing_deaths(demographic_df)
    aa_cases = self.get_aa_cases(demographic_df)
    aa_deaths = self.get_aa_deaths(demographic_df)
    pct_aa_cases = to_percentage(aa_cases, known_cases)
    pct_aa_deaths = to_percentage(aa_deaths, known_deaths)
    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape totals and race breakdowns via four geoservice queries."""
    date, cases = query_geoservice(**self.CASES)
    total_cases = cases.iloc[0, 0]
    _, cases_by_race = query_geoservice(**self.CASES_BY_RACE)
    cases_by_race = cases_by_race.set_index('Race')
    known_cases = total_cases - cases_by_race.loc['Not Reported', 'value']
    aa_cases = cases_by_race.loc['African American/Black', 'value']
    pct_aa_cases = to_percentage(aa_cases, known_cases)
    _, deaths = query_geoservice(**self.DEATHS)
    total_deaths = deaths.iloc[0, 0]
    _, deaths_by_race = query_geoservice(**self.DEATHS_BY_RACE)
    deaths_by_race = deaths_by_race.set_index('Race')
    # NOTE(review): deaths use a different known-race computation (sum of
    # reported rows, tolerating a missing 'Not Reported' row) and a
    # different Black label than cases do — presumably this mirrors the
    # source's two schemas; confirm against the service.
    known_deaths = deaths_by_race.drop('Not Reported',
                                       errors='ignore').sum()['value']
    aa_deaths = deaths_by_race.loc['African American', 'value']
    pct_aa_deaths = to_percentage(aa_deaths, known_deaths)
    return [self._make_series(
        date=date,
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=pct_aa_cases,
        pct_aa_deaths=pct_aa_deaths,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_cases,
        known_race_deaths=known_deaths,
    )]
def _scrape(self, **kwargs):
    """Query the geoservice once and build the output series."""
    date, data = query_geoservice(**self.DATA)
    _logger.info(f'Processing data for {date}')
    # All values live on the single returned row.
    row = data.loc[0]
    total_cases = row['POSITIVE']
    total_deaths = row['DEATHS']
    # Known-race denominators exclude the unknown-race counts.
    known_cases = total_cases - row['POS_UNK']
    known_deaths = total_deaths - row['DTH_UNK']
    aa_cases = row['POS_BLK']
    aa_deaths = row['DTH_BLK']
    aa_cases_pct = to_percentage(aa_cases, known_cases)
    aa_deaths_pct = to_percentage(aa_deaths, known_deaths)
    series = self._make_series(
        date=date,
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=aa_cases_pct,
        pct_aa_deaths=aa_deaths_pct,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_cases,
        known_race_deaths=known_deaths,
    )
    return [series]
def _scrape(self, **kwargs):
    """Query the demographic geoservice and build the output series."""
    date, demog = query_geoservice(**self.DEMOG)
    _logger.info(f'Processing data for {date}')
    # All values live on the single returned row.
    row = demog.loc[0]
    total_cases = row['positives']
    known_cases = total_cases - row['unk_race']
    aa_cases = row['black']
    pct_aa_cases = to_percentage(aa_cases, known_cases)
    total_deaths = row['deaths']
    known_deaths = total_deaths - row['d_unk_race']
    aa_deaths = row['d_black']
    pct_aa_deaths = to_percentage(aa_deaths, known_deaths)
    series = self._make_series(
        date=date,
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=pct_aa_cases,
        pct_aa_deaths=pct_aa_deaths,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_cases,
        known_race_deaths=known_deaths,
    )
    return [series]
def _scrape(self, **kwargs):
    """Scrape date, case, and death race breakdowns from three pages."""
    runner = WebdriverRunner()
    # Get date
    cases_results = runner.run(
        WebdriverSteps()
        .go_to_url(self.SUMMARY_URL)
        .wait_for_presence_of_elements(
            (By.XPATH, "//span[contains(text(),'Last updated')]"))
        .get_page_source())
    date = self.get_date(cases_results.page_source)
    # Cases for Race
    cases_by_race_results = runner.run(
        WebdriverSteps()
        .go_to_url(self.RACE_CASES_URL)
        .find_request('race_cases', find_by=find_tableau_request))
    assert cases_by_race_results.requests['race_cases'], \
        'No results for race_cases found'
    resp_body = cases_by_race_results.requests[
        'race_cases'].response.body.decode('utf8')
    cases_for_race_json = TableauParser(resp_body).extract_data_from_key(
        key='Rates by Race for All Cases')
    cases_df = self.to_df(cases_for_race_json)
    cases = cases_df['Measure Values'].sum()
    # Known-race cases exclude the 'Not Reported/Missing' row.
    known_race_cases = cases_df.drop(
        'Not Reported/Missing')['Measure Values'].sum()
    aa_cases = cases_df.loc['Black or African American',
                            'Measure Values'].sum()
    # Deaths for Race
    deaths_by_race_results = runner.run(
        WebdriverSteps()
        .go_to_url(self.RACE_DEATHS_URL)
        .find_request('race_deaths', find_by=find_tableau_request))
    assert deaths_by_race_results.requests['race_deaths'], \
        'No results for race_deaths found'
    resp_body = deaths_by_race_results.requests[
        'race_deaths'].response.body.decode('utf8')
    deaths_for_race_json = TableauParser(resp_body).extract_data_from_key(
        key='Mortality by Race')
    deaths_df = self.to_df(deaths_for_race_json)
    deaths = deaths_df['Measure Values'].sum()
    known_race_deaths = deaths_df.drop(
        'Not Reported/Missing')['Measure Values'].sum()
    aa_deaths = deaths_df.loc['Black or African American',
                              'Measure Values'].sum()
    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)
    return [self._make_series(
        date=date,
        cases=cases,
        deaths=deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=pct_aa_cases,
        pct_aa_deaths=pct_aa_deaths,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_race_cases,
        known_race_deaths=known_race_deaths
    )]
def _scrape(self, **kwargs):
    """Scrape totals from an embedded JS data blob and races from HTML.

    Returns a single-element list built by ``self._make_series``.
    """
    r = get_cached_url(self.JS_URL)
    json_str = re.search(r'data = (([^;]|\n)*)', r.text,
                         re.MULTILINE).group(1).strip()
    # Commas on the last item in a list or object are valid in
    # JavaScript, but not in JSON, so strip them.
    # BUG FIX: re.sub's fourth positional argument is `count`, not
    # `flags`; passing re.MULTILINE positionally capped the number of
    # substitutions at 8 and left `$` anchored to end-of-string only.
    json_str = re.sub(r',(\s|\n)*([]}]|$)', r'\2', json_str,
                      flags=re.MULTILINE)
    _logger.debug(f'Extracted JSON: {json_str}')
    data = json.loads(json_str)['content']
    # Find the update date
    month, day, year = map(
        int,
        re.search(r'(\d{2})/(\d{2})/(\d{4})', data['info']).groups())
    date = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date}')
    # Extract the total counts
    total_cases = raw_string_to_int(data['count'])
    total_deaths = raw_string_to_int(data['death'])
    # Fetch the HTML page
    soup = url_to_soup(self.DATA_URL)
    # Extract the Black/AA counts
    cases = self._extract_by_race_table(soup.find(id='race'))
    deaths = self._extract_by_race_table(soup.find(id='race-d'))
    _logger.debug(f'cases: {cases}')
    _logger.debug(f'deaths: {deaths}')
    # Known-race denominators exclude rows still under investigation.
    known_cases = cases.drop('Under Investigation')['count'].sum()
    known_deaths = deaths.drop('Under Investigation')['count'].sum()
    aa_cases = cases.loc['Black', 'count'].sum()
    aa_deaths = deaths.loc['Black', 'count'].sum()
    aa_cases_pct = to_percentage(aa_cases, known_cases)
    aa_deaths_pct = to_percentage(aa_deaths, known_deaths)
    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape two Tableau dashboards; convert race fractions to counts."""
    runner = WebdriverRunner()
    # Wait until the expected number of <canvas> elements has rendered so
    # the Tableau data requests have been issued before capturing them.
    results = runner.run(
        WebdriverSteps().go_to_url(self.CASES_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 38)
        .find_request('cases', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEATHS_URL)
        .wait_for_number_of_elements((By.XPATH, '//canvas'), 20)
        .find_request('deaths', find_by=tableau.find_tableau_request))
    parser = tableau.TableauParser(request=results.requests['cases'])
    date_str = pydash.head(
        parser.extract_data_from_key('Footer')['AGG(Today)'])
    date = datetime.strptime(date_str, '%m-%d-%y').date()
    cases = pydash.head(
        parser.extract_data_from_key('Total Cases')['AGG(Total Cases)'])
    deaths = pydash.head(
        parser.extract_data_from_key('Total Deaths')
        ['SUM(Count Of Deaths)'])
    # Case demographics arrive as fractions; convert to counts.
    # Note the trailing space in 'Race Breakdown ' matches the source key.
    cases_pct_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('Race Breakdown ')).set_index('Race')
    cases_df = cases_pct_df.assign(Count=[
        round(v * cases) for v in cases_pct_df['CNTD(Caseid 1)'].values
    ])
    aa_cases = cases_df.loc['Black']['Count']
    known_race_cases = cases - cases_df.loc['Unknown']['Count']
    parser = tableau.TableauParser(request=results.requests['deaths'])
    deaths_pct_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('Bar | Race')).set_index('Race')
    deaths_df = deaths_pct_df.assign(Count=[
        round(v * deaths) for v in deaths_pct_df['SUM(Death Count)'].values
    ])
    aa_deaths = deaths_df.loc['Black']['Count']
    known_race_deaths = deaths - deaths_df.loc['Unknown']['Count']
    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)
    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=False,
                          pct_includes_hispanic_black=True,
                          known_race_cases=known_race_cases,
                          known_race_deaths=known_race_deaths)
    ]
def _scrape(self, **kwargs):
    """Click through the acceptable-use gate, then scrape the dashboard."""
    runner = WebdriverRunner()
    # Accept the acceptable-use agreement, open the count charts, and
    # wait for the race/ethnicity table to render before grabbing the
    # page source.
    results = runner.run(
        WebdriverSteps()
        .go_to_url(self.ACCEPTABLE_USE_URL)
        .find_element_by_xpath("//input[@class='form-check-input']")
        .click_on_last_element_found()
        .find_element_by_xpath('//form/button')
        .click_on_last_element_found()
        .wait_for_presence_of_elements(
            (By.XPATH, "//a[@data-chart-id='count-charts']"))
        .find_element_by_xpath("//a[@data-chart-id='count-charts']")
        .click_on_last_element_found()
        .wait_for_presence_of_elements(
            (By.XPATH,
             "//*[contains(text(), 'Total Cases by Race/Ethnicity & County')]"))
        .get_page_source())
    soup = results.page_source
    date = self.get_last_updated_date(soup)
    _logger.info(f'Processing data for {date}')
    cases = self.get_total_cases(soup)
    deaths = self.get_total_deaths(soup)
    cases_df = self.get_race_cases_df(soup)
    aa_cases = cases_df.loc['Non-Hispanic Black']['State of Delaware']
    # The 'Unknown' row may be absent; treat that as zero unknown cases.
    try:
        unknown_race_cases = cases_df.loc['Unknown']['State of Delaware']
    except KeyError:
        unknown_race_cases = 0
    known_race_cases = cases - unknown_race_cases
    deaths_df = self.get_race_deaths_df(soup)
    aa_deaths = deaths_df.loc['Non-Hispanic Black']['State of Delaware']
    try:
        unknown_race_deaths = deaths_df.loc['Unknown']['State of Delaware']
    except KeyError:
        unknown_race_deaths = 0
    known_race_deaths = deaths - unknown_race_deaths
    pct_aa_cases = to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = to_percentage(aa_deaths, known_race_deaths)
    # FIX: reuse the already-computed `date` instead of re-parsing the
    # page with a second get_last_updated_date call.
    return [
        self._make_series(date=date,
                          cases=cases,
                          deaths=deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=False,
                          pct_includes_hispanic_black=False,
                          known_race_cases=known_race_cases,
                          known_race_deaths=known_race_deaths)
    ]
def _scrape(self, *, validation=False, **kwargs):
    """Scrape Massachusetts race/ethnicity data from the raw-data zip.

    When ``validation`` is True, recompute as of D4BL's last refresh
    (2020-04-09) instead of the most recent published date.
    """
    urls = find_all_links(url=self.REPORTING_URL,
                          search_string='covid-19-raw-data')
    _logger.debug(f'Fetching links from {urls}')
    url_fragment = urls[0].split('/')[2]
    url = self.DOWNLOAD_URL_TEMPLATE.format(url_fragment)
    _logger.debug(f'Current COVID-19 data: {url}')
    # Cumulative number of cases / deaths
    ma_zip = get_zip(url)
    _logger.debug('Get the race/ethnicity breakdown')
    df_raw = pd.read_csv(
        get_zip_member_as_file(ma_zip, 'RaceEthnicity.csv'),
        parse_dates=['Date'])
    _logger.debug('Get date of most recent data published')
    # If desired (validation=True), verify that calculations as of
    # D4BL's last refresh match these calculations.
    if validation is True:
        # BUG FIX: use a pandas Timestamp rather than datetime.date so
        # that the `.date()` call below works (datetime.date has no
        # .date() method) and the comparison against the parsed Date
        # column is exact.
        max_date = pd.Timestamp(year=2020, month=4, day=9)
    else:
        max_date = max(df_raw.Date)
    _logger.info(f'Processing data for {max_date}')
    df_mass = df_raw[df_raw.Date == max_date]
    # Intermediate calculations
    total_cases = df_mass['All Cases'].sum()
    total_deaths = df_mass['Deaths'].sum()
    aa_rows = df_mass[
        df_mass['Race/Ethnicity'] == 'Non-Hispanic Black/African American'
    ]
    aa_cases = aa_rows['All Cases'].tolist()[0]
    aa_cases_pct = to_percentage(aa_cases, total_cases)
    aa_deaths = aa_rows['Deaths'].tolist()[0]
    aa_deaths_pct = to_percentage(aa_deaths, total_deaths)
    return [self._make_series(
        date=max_date.date(),
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=aa_cases_pct,
        pct_aa_deaths=aa_deaths_pct,
        pct_includes_unknown_race=True,
        pct_includes_hispanic_black=False,
    )]
def test_alabama(patched_geoservice):
    """Run the Alabama scraper against mocked geoservice case/death data."""
    # Index positions of the Black and Unknown rows in the fixtures below.
    aa_idx, unknown_idx = 1, 3
    cases = {
        'Racecat': ['Asian', 'Black', 'Other', 'Unknown', 'White'],
        'value': [200, 19000, 3000, 18000, 20000],
    }
    deaths = {
        'Racecat': ['Asian', 'Black', 'Other', 'Unknown', 'White'],
        'value': [10, 500, 25, 50, 600],
    }
    known_cases_by_race = sum(cases['value']) - cases['value'][unknown_idx]
    known_deaths_by_race = sum(deaths['value']) - deaths['value'][unknown_idx]
    # First geoservice call returns cases, second returns deaths.
    patched_geoservice.side_effect = [
        util.make_query_geoservice_data(data=cases),
        util.make_query_geoservice_data(data=deaths),
    ]
    expected = {
        'Date Published': date.today(),
        'Total Cases': sum(cases['value']),
        'Total Deaths': sum(deaths['value']),
        'Count Cases Black/AA': cases['value'][aa_idx],
        'Count Deaths Black/AA': deaths['value'][aa_idx],
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': False,
        'Pct Cases Black/AA': to_percentage(cases['value'][aa_idx],
                                            known_cases_by_race),
        'Pct Deaths Black/AA': to_percentage(deaths['value'][aa_idx],
                                             known_deaths_by_race),
        'Count Cases Known Race': known_cases_by_race,
        'Count Deaths Known Race': known_deaths_by_race,
    }
    util.run_scraper_and_assert(scraper_cls=Alabama, assertions=expected)
def _scrape(self, **kwargs): soup = url_to_soup(self.REPORTING_URL) # find date strong = soup.find('strong', string=re.compile('Updated ')) date_text = re.search(r'[A-Z][a-z][a-z]+ \d(\d)?, 20\d\d', strong.text).group() # find total number of confirmed cases strong = soup.find('strong', string=re.compile(r'Total positive( cases)?:')) num_cases = raw_string_to_int(str(strong.next_sibling)) # find total number of deaths strong = soup.find('strong', string=re.compile('(Total )?[Dd]eaths:')) num_deaths = raw_string_to_int(strong.next_sibling) date_obj = datetime.datetime.strptime(date_text, '%B %d, %Y').date() _logger.info(f'Processing data for {date_obj}') _logger.debug(f'Number Cases: {num_cases}') _logger.debug(f'Number Deaths: {num_deaths}') # find number of Black/AA cases and deaths table = soup.find('table', attrs={'id': 'raceethtable'}) if not table: raise ValueError('Unable to locate race/ethnicity table') th = table.find( lambda elt: elt.name == 'th' and elt.text.find('Black') >= 0) if not th: raise ValueError('Unable to locate Black/AA data row') tds = th.find_next_siblings('td') cnt_aa_cases = raw_string_to_int(tds[0].text) cnt_aa_deaths = raw_string_to_int(tds[1].text) pct_aa_cases = to_percentage(cnt_aa_cases, num_cases) pct_aa_deaths = to_percentage(cnt_aa_deaths, num_deaths) _logger.debug(f'Number Black/AA Cases: {cnt_aa_cases}') _logger.debug(f'Number Black/AA Deaths: {cnt_aa_deaths}') return [ self._make_series( date=date_obj, cases=num_cases, deaths=num_deaths, aa_cases=cnt_aa_cases, aa_deaths=cnt_aa_deaths, pct_aa_cases=pct_aa_cases, pct_aa_deaths=pct_aa_deaths, pct_includes_unknown_race=True, pct_includes_hispanic_black=True, ) ]
def _scrape(self, **kwargs):
    """Scrape the home page for the date plus two Tableau dashboards."""
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps().go_to_url(self.HOME_PAGE_URL)
        .get_page_source()
        .go_to_url(self.DEMOGRAPHIC_CASES_URL)
        .find_request(key='cases', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEMOGRAPHIC_DEATHS_URL)
        .find_request('deaths', find_by=tableau.find_tableau_request))
    # The report date is embedded in a <strong> containing 'current'.
    date_str_element = results.page_source.find(
        'strong', string=re.compile('current'))
    assert date_str_element, 'No date element found'
    date_str = date_str_element.get_text()
    pattern = re.compile(r'(\d{1,2}\/\d{1,2}\/\d{4})')
    matches = pattern.search(date_str)
    assert matches, 'Date not found.'
    date = datetime.strptime(matches.group(), '%m/%d/%Y').date()
    parser = tableau.TableauParser(request=results.requests['cases'])
    cases_df = parser.get_dataframe_from_key('CaseRace').set_index(
        'Measure Status')
    # NOTE(review): 'AGG(Calculation1)' read at the Black row appears to
    # hold the overall case total repeated per row — TODO confirm
    # against the dashboard.
    cases = cases_df.loc['Black or African American']['AGG(Calculation1)']
    # 'SUM(Count)' is a fraction of total cases; convert to a count.
    aa_cases = round(
        cases_df.loc['Black or African American']['SUM(Count)'] * cases)
    parser = tableau.TableauParser(request=results.requests['deaths'])
    deaths = parser.get_dataframe_from_key(
        'Total Deaths (2)')['SUM(Deaths)'].sum()
    deaths_df = parser.get_dataframe_from_key('Race').set_index(
        'Measure Status11')
    # 'SUM(Deaths)' here is a fraction of total deaths.
    aa_deaths = round(
        deaths_df.loc['Black or African American']['SUM(Deaths)'] * deaths)
    pct_aa_cases = to_percentage(aa_cases, cases)
    pct_aa_deaths = to_percentage(aa_deaths, deaths)
    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
        )
    ]
def _scrape(self, refresh=False, **kwargs):
    """Scrape totals and NH Black demographics from two JSON endpoints.

    Raises AssertionError if either endpoint is empty or any expected
    field is missing/zero.
    """
    _logger.debug('Get case totals data')
    totals_json = get_json(self.CASE_DATA_URL)
    assert totals_json, 'Error finding total cases and deaths'
    # The first entry is the most recent report.
    most_recent_totals = totals_json[0]
    # dict.get sets value to None if key not available
    report_date = datetime.strptime(most_recent_totals.get('date'),
                                    '%Y-%m-%dT%H:%M:%S.%f').date()
    total_cases = most_recent_totals.get('confirmedcases')
    total_deaths = most_recent_totals.get('confirmeddeaths')
    assert total_cases, 'Error finding total cases'
    assert total_deaths, 'Error finding total deaths'
    # convert from string to int
    total_cases = int(total_cases)
    total_deaths = int(total_deaths)
    _logger.debug('Get race data')
    race_json = get_json(self.RACE_DATA_URL)
    assert race_json, 'Error getting race cases and deaths json'
    most_recent_nh_black_data = pydash.find(
        race_json, lambda data: data['hisp_race'] == 'NH Black')
    assert most_recent_nh_black_data, 'Error finding total NH Black entry'
    aa_cases = most_recent_nh_black_data.get('case_tot')
    aa_deaths = most_recent_nh_black_data.get('deaths')
    assert aa_cases, 'Error finding total NH Black cases'
    assert aa_deaths, 'Error finding total NH Black deaths'
    # convert from string to int
    aa_cases = int(aa_cases)
    aa_deaths = int(aa_deaths)
    pct_aa_cases = to_percentage(aa_cases, total_cases)
    pct_aa_deaths = to_percentage(aa_deaths, total_deaths)
    return [
        self._make_series(date=report_date,
                          cases=total_cases,
                          deaths=total_deaths,
                          aa_cases=aa_cases,
                          aa_deaths=aa_deaths,
                          pct_aa_cases=pct_aa_cases,
                          pct_aa_deaths=pct_aa_deaths,
                          pct_includes_unknown_race=True,
                          pct_includes_hispanic_black=False)
    ]
def _scrape(self, **kwargs):
    """Scrape two Tableau dashboards for cases and deaths by race."""
    runner = WebdriverRunner()
    results = runner.run(
        WebdriverSteps().go_to_url(self.CASES_URL)
        .find_request('cases', find_by=tableau.find_tableau_request)
        .clear_request_history()
        .go_to_url(self.DEATHS_URL)
        .find_request('deaths', find_by=tableau.find_tableau_request))
    parser = tableau.TableauParser(request=results.requests['cases'])
    raw_date_str = pydash.head(
        parser.extract_data_from_key('cases')['ATTR(dateupdated)'])
    date = datetime.strptime(raw_date_str, '%m/%d/%Y').date()
    cases = pydash.head(
        parser.extract_data_from_key('cases')
        ['SUM(Laboratory Confirmed Cases)'])
    cases_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('raceth')).set_index('subcategory')
    aa_cases = cases_df.loc['Black']['SUM(count)']
    # Known-race cases exclude the 'Unknown' race bucket.
    known_race_cases = cases - cases_df.loc['Unknown']['SUM(count)']
    parser = tableau.TableauParser(request=results.requests['deaths'])
    deaths = pydash.head(
        parser.extract_data_from_key('death (2)')['SUM(Deaths)'])
    deaths_df = pd.DataFrame.from_dict(
        parser.extract_data_from_key('raceth (death)')).set_index(
            'subcategory')
    aa_deaths = deaths_df.loc['Black']['SUM(count)']
    known_race_deaths = deaths - deaths_df.loc['Unknown']['SUM(count)']
    pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
    pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)
    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_race_cases,
            known_race_deaths=known_race_deaths,
        )
    ]
def _scrape(self, **kwargs): data = pd.read_excel(self.DATA_URL, sheet_name='Demographics') # Get totals date month, day, year = map( int, re.search(r'(\d+)/(\d+)/(\d\d\d\d)', data.columns[0]).groups()) date = datetime.date(year, month, day) _logger.info(f'Processing data for {date}') # Get totals data total_cases = int(re.match(r'N=(\d+)', data.loc[0, 'Cases']).group(1)) total_deaths = int( re.match(r'N=(\d+)', data.loc[0, 'Deaths']).group(1)) data = data.set_index(data.columns[0]) data = data.rename( columns={ 'Unnamed: 2': '% Cases', 'Unnamed: 4': '% Hosp', 'Unnamed: 6': '% Deaths', }) for idx in data.index: str_idx = str(idx) if str_idx.startswith('Black'): aa_cases = int(data.loc[idx, 'Cases']) aa_deaths = int(data.loc[idx, 'Deaths']) elif str_idx.startswith('Unknown'): known_cases = total_cases - int(data.loc[idx, 'Cases']) known_deaths = total_deaths - int(data.loc[idx, 'Deaths']) # Compute the percentages as the provided ones are excessively rounded. aa_cases_pct = to_percentage(aa_cases, known_cases) aa_deaths_pct = to_percentage(aa_deaths, known_deaths) return [ self._make_series( date=date, cases=total_cases, deaths=total_deaths, aa_cases=aa_cases, aa_deaths=aa_deaths, pct_aa_cases=aa_cases_pct, pct_aa_deaths=aa_deaths_pct, pct_includes_unknown_race=False, pct_includes_hispanic_black=False, known_race_cases=known_cases, known_race_deaths=known_deaths, ) ]
def _scrape(self, **kwargs):
    """Scrape the statewide demographics workbook linked from the report page.

    Finds the 'Cases by Demographics Statewide' link, reads its date from
    the file path, downloads the workbook, and aggregates confirmed cases
    and deaths by race.  Percentages include unknown-race records.
    """
    # Find latest report
    soup = url_to_soup(self.REPORTING_URL)
    by_dem_path = soup.find('a', text='Cases by Demographics Statewide')['href']

    # The linked file path embeds the report date as YYYY-MM-DD.
    year, month, day = map(
        int, re.search(r'(\d{4})-(\d{2})-(\d{2})', by_dem_path).groups())
    date_published = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date_published}')

    # Load the data
    by_dem = pd.read_excel(urljoin(self.REPORTING_URL, by_dem_path))

    # Drop probable cases; treat suppressed counts as zero.
    by_dem = by_dem[by_dem['CASE_STATUS'] == 'Confirmed']
    for column in ('Cases', 'Deaths'):
        by_dem[column] = by_dem[column].str.replace(
            'Suppressed', '0').astype(int)

    by_race = by_dem[['RaceCat', 'Cases', 'Deaths']].groupby('RaceCat').sum()
    totals = by_race.sum(axis=0)
    total_cases = totals['Cases']
    total_deaths = totals['Deaths']

    aa_cases = by_race.loc['Black/African American', 'Cases']
    aa_deaths = by_race.loc['Black/African American', 'Deaths']

    return [
        self._make_series(
            date=date_published,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=to_percentage(aa_cases, total_cases),
            pct_aa_deaths=to_percentage(aa_deaths, total_deaths),
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )
    ]
def _scrape(self, **kwargs):
    """Scrape case counts by race from the configured geoservice.

    The service publishes a demographic breakdown for cases only, so all
    death-by-race figures are reported as NaN.
    """
    date, demog = query_geoservice(**self.DEMOG)
    _logger.info(f'Processing data for {date}')

    row = demog.loc[0]
    total_cases = row['CasesAll']
    known_cases = total_cases - row['C_RaceUnknown']
    aa_cases = row['C_RaceBlack']
    pct_aa_cases = to_percentage(aa_cases, known_cases)

    total_deaths = row['Deaths']
    # Does not include demographic breakdown of deaths
    known_deaths = aa_deaths = pct_aa_deaths = nan

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
def test_wisconsin():
    """Check the Wisconsin scraper output against the expected fixture values."""
    # Known-race denominators: totals minus unknown-race records.
    known_cases = 40000 - 3800
    known_deaths = 800 - 10
    expected = {
        'Date Published': date.today(),
        'Total Cases': 40000,
        'Total Deaths': 800,
        'Count Cases Black/AA': 6600,
        'Count Deaths Black/AA': 200,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': True,
        'Pct Cases Black/AA': to_percentage(6600, known_cases),
        'Pct Deaths Black/AA': to_percentage(200, known_deaths),
        'Count Cases Known Race': known_cases,
        'Count Deaths Known Race': known_deaths,
    }
    util.run_scraper_and_assert(scraper_cls=Wisconsin, assertions=expected)
def test_arkansas():
    """Check the Arkansas scraper output against the expected fixture values."""
    expected = {
        'Date Published': date.today(),
        'Total Cases': 31000,
        'Total Deaths': 300,
        'Count Cases Black/AA': 6000,
        'Count Deaths Black/AA': 90,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': True,
        'Pct Cases Black/AA': to_percentage(6000, 27000),
        'Pct Deaths Black/AA': to_percentage(90, 297),
        'Count Cases Known Race': 27000,
        'Count Deaths Known Race': 297,
    }
    util.run_scraper_and_assert(scraper_cls=Arkansas, assertions=expected)
def test_alaska():
    """Check the Alaska scraper output against the expected fixture values."""
    expected = {
        'Date Published': date.today(),
        'Total Cases': 1733,
        'Total Deaths': 17,
        'Count Cases Black/AA': 41,
        'Count Deaths Black/AA': 0,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': True,
        'Pct Cases Black/AA': to_percentage(41, 1171),
        'Pct Deaths Black/AA': to_percentage(0, 17),
        'Count Cases Known Race': 1171,
        'Count Deaths Known Race': 17,
    }
    util.run_scraper_and_assert(scraper_cls=Alaska, assertions=expected)
def _scrape(self, **kwargs):
    """Scrape totals from the county CSV and race breakdown from the race CSV.

    Deaths come from the per-county file; cases and the race breakdown come
    from the by-race file.  No race breakdown is published for deaths, so
    those fields are NaN.
    """
    county_df = pd.read_csv(self.CASES_BY_COUNTY_URL)
    total_deaths = county_df['DEATHS'].sum()

    race_df = pd.read_csv(
        get_content_as_file(self.CASES_BY_RACE_URL),
        index_col=0,
        parse_dates=['DATA_REFRESH_DT'])
    total_cases = race_df['CASES'].sum()
    # Exclude the 'Not disclosed' row from the known-race denominator.
    known_cases = race_df['CASES'].drop('Not disclosed').sum()
    date = race_df['DATA_REFRESH_DT'][0].date()
    _logger.info(f'Processing data for {date}')

    aa_cases_cnt = race_df.loc['Black or African American', 'CASES']
    aa_cases_pct = to_percentage(aa_cases_cnt, known_cases)

    # No race breakdowns for deaths
    aa_deaths_cnt = float('nan')
    aa_deaths_pct = float('nan')
    known_deaths = float('nan')

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases_cnt,
            aa_deaths=aa_deaths_cnt,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]