def _scrape(self, **kwargs):
    # HACK ALERT:
    # The main page dynamically adds JavaScript to insert and
    # submit (POST) a form with the field "_pd" set. The POST body
    # must be MIME type multipart/form-data rather than the
    # requests default application/x-www-form-urlencoded. We can
    # make requests generate this by using the files argument
    # instead of data for the form data. Using a file name key of
    # None prevents the extraneous name from being included in the
    # call.
    soup = url_to_soup(self.DATA_URL, method='POST',
                       files={None: b'_pd'})

    # Find the update date
    last_updated_text = soup.find('strong',
                                  string=re.compile('Last Updated'))
    month, day, year = map(
        int,
        re.search(r'(\d\d?)/(\d\d?)/(\d\d\d\d)',
                  last_updated_text.parent.text).groups())
    date = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date}')

    # Load the cases by race/ethnicity table
    cases_div = soup.find(id='pnlConfirmedCasesByRaceTbl')
    cases = table_to_dataframe(
        cases_div.find('table')).set_index('Race/Ethnicity')

    # Fix column names: drop footnote text after the non-breaking space
    cases.columns = cases.columns.str.replace('\xa0.*', '', regex=True)

    # Extract the data
    total_cases = cases.loc['Total Number of Cases', 'Confirmed Cases']
    known_cases = cases.loc['Total with Race/Ethnicity Available',
                            'Confirmed Cases']
    aa_cases = cases.loc['Non-Hispanic Black', 'Confirmed Cases']
    aa_cases_pct = to_percentage(aa_cases, known_cases)

    deaths_div = soup.find(id='pnlDeathsByRaceTbl')
    deaths = table_to_dataframe(
        deaths_div.find('table')).set_index('Race/Ethnicity')
    deaths.columns = deaths.columns.str.replace('\xa0.*', '', regex=True)

    total_deaths = deaths.loc['Total Number of Deaths', 'Deaths']
    known_deaths = deaths.loc['Total with Race/Ethnicity Available',
                              'Deaths']
    aa_deaths = deaths.loc['Non-Hispanic Black', 'Deaths']
    aa_deaths_pct = to_percentage(aa_deaths, known_deaths)

    return [
        self._make_series(
            date=date,
            cases=_maybe_int(total_cases),
            deaths=_maybe_int(total_deaths),
            aa_cases=_maybe_int(aa_cases),
            aa_deaths=_maybe_int(aa_deaths),
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
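
# --- Standalone example (not part of the scraper): a minimal sketch of the
# files={None: ...} multipart trick above, using requests directly. The URL
# is a placeholder; no request is sent, we only prepare one to inspect what
# requests would generate.
import requests

req = requests.Request('POST', 'https://example.com/', files={None: b'_pd'})
prepped = req.prepare()
print(prepped.headers['Content-Type'])  # multipart/form-data; boundary=...
print(prepped.body[:80])                # the b'_pd' payload, with no field name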
def _scrape(self, **kwargs):
    soup = url_to_soup(self.URL, local_file_name='nc_soup')
    demographic_df = get_demographic_dataframe()

    date = self.get_date(soup)
    _logger.info(f'Processing data for {date}')

    cases = self.get_total_cases(soup)
    deaths = self.get_total_deaths(soup)
    known_cases = cases - self.get_missing_cases(demographic_df)
    known_deaths = deaths - self.get_missing_deaths(demographic_df)

    aa_cases = self.get_aa_cases(demographic_df)
    aa_deaths = self.get_aa_deaths(demographic_df)
    pct_aa_cases = to_percentage(aa_cases, known_cases)
    pct_aa_deaths = to_percentage(aa_deaths, known_deaths)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
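
# --- Standalone example (illustrative only): the known-race denominator
# arithmetic used above, with made-up counts and a stand-in for the
# project's to_percentage helper (assumed here to round to two decimals).
def to_percentage_sketch(part, whole):
    return round(part / whole * 100, 2)

cases, missing_race_cases = 1000, 150
known_cases = cases - missing_race_cases       # 850 cases with race reported
print(to_percentage_sketch(200, known_cases))  # 23.53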
def _scrape(self, **kwargs):
    r = get_cached_url(self.JS_URL)
    json_str = re.search(r'data = (([^;]|\n)*)',
                         r.text, re.MULTILINE).group(1).strip()
    # Commas on the last item in a list or object are valid in
    # JavaScript, but not in JSON.
    json_str = re.sub(r',(\s|\n)*([]}]|$)', r'\2', json_str,
                      flags=re.MULTILINE)
    _logger.debug(f'Extracted JSON: {json_str}')
    data = json.loads(json_str)['content']

    # Find the update date
    month, day, year = map(
        int,
        re.search(r'(\d{2})/(\d{2})/(\d{4})', data['info']).groups())
    date = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date}')

    # Extract the total counts
    total_cases = raw_string_to_int(data['count'])
    total_deaths = raw_string_to_int(data['death'])

    # Fetch the HTML page
    soup = url_to_soup(self.DATA_URL)

    # Extract the Black/AA counts
    cases = self._extract_by_race_table(soup.find(id='race'))
    deaths = self._extract_by_race_table(soup.find(id='race-d'))
    _logger.debug(f'cases: {cases}')
    _logger.debug(f'deaths: {deaths}')

    known_cases = cases.drop('Under Investigation')['count'].sum()
    known_deaths = deaths.drop('Under Investigation')['count'].sum()
    aa_cases = cases.loc['Black', 'count'].sum()
    aa_deaths = deaths.loc['Black', 'count'].sum()
    aa_cases_pct = to_percentage(aa_cases, known_cases)
    aa_deaths_pct = to_percentage(aa_deaths, known_deaths)

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
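
# --- Standalone example (illustrative only): the trailing-comma cleanup
# above, applied to a made-up JavaScript object literal. JSON forbids a
# comma before ']' or '}', so the regex drops it. The comma inside "1,234"
# is untouched because it is not followed by a closing bracket.
import json
import re

js = '{"content": {"count": "1,234", "death": "56",},}'
cleaned = re.sub(r',(\s|\n)*([]}]|$)', r'\2', js, flags=re.MULTILINE)
print(json.loads(cleaned))  # {'content': {'count': '1,234', 'death': '56'}}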
def _scrape(self, **kwargs):
    soup = url_to_soup(self.DATA_URL)

    # Extract publication date
    overview = soup.find(id='overview-of-covid-19-surveillance')
    date_str = re.search(r'Report Date: ([A-Za-z]+ \d+, \d+)',
                         overview.text).group(1)
    date = datetime.datetime.strptime(date_str, '%B %d, %Y').date()
    _logger.info(f'Processing data for {date}')

    # Extract demographic and total data
    race_data = json.loads(
        soup.find(
            id='cases-hospitalizations-and-deaths-by-raceethnicity'
        ).find('script', {'type': 'application/json'}).string)
    headers = [th.string.strip() for th in BeautifulSoup(
        race_data['x']['container'], features='lxml').find_all('th')]
    race_df = pd.DataFrame(race_data['x']['data']).T
    race_df.columns = headers
    race_df = race_df.set_index('Race/Ethnicity')

    # Suppressed counts appear as HTML-escaped values such as '&lt;5';
    # decode the entity.
    race_df['Cases'] = race_df['Cases'].astype(str).str.replace('&lt;', '<')
    race_df['Deaths'] = race_df['Deaths'].astype(str).str.replace('&lt;', '<')

    cnt_cases = race_df.loc['Statewide', 'Cases']
    cnt_deaths = race_df.loc['Statewide', 'Deaths']
    cnt_cases_aa = race_df.loc['Black/African American', 'Cases']
    cnt_deaths_aa = race_df.loc['Black/African American', 'Deaths']
    pct_cases_aa = float(str(
        race_df.loc['Black/African American', '% of Cases']).replace('%', ''))
    try:
        pct_deaths_aa = to_percentage(int(cnt_deaths_aa), int(cnt_deaths))
    except ValueError:
        # Suppressed counts (e.g. '<5') cannot be converted to int.
        pct_deaths_aa = float('nan')

    return [self._make_series(
        date=date,
        cases=cnt_cases,
        deaths=cnt_deaths,
        aa_cases=cnt_cases_aa,
        aa_deaths=cnt_deaths_aa,
        pct_aa_cases=pct_cases_aa,
        pct_aa_deaths=pct_deaths_aa,
        pct_includes_unknown_race=True,
        pct_includes_hispanic_black=False,
    )]
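
# --- Standalone example (illustrative only): the .T transpose above implies
# the widget's 'data' arrives as a list of *columns* rather than rows. A
# made-up payload in that shape:
import pandas as pd

payload = {'x': {'data': [['Statewide', 'Black/African American'],
                          ['1000', '200'],
                          ['50', '10']]}}
df = pd.DataFrame(payload['x']['data']).T
df.columns = ['Race/Ethnicity', 'Cases', 'Deaths']
print(df.set_index('Race/Ethnicity').loc['Black/African American', 'Cases'])
# -> 200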
def _scrape(self, **kwargs):
    soup = url_to_soup(self.URL)
    date = self.get_date(soup)
    cases = self.get_cases(soup)
    deaths = self.get_deaths(soup)
    return [self._make_series(
        date=date,
        cases=cases,
        deaths=deaths,
    )]
def _scrape(self, **kwargs):
    soup = url_to_soup(self.REPORTING_URL)

    # Find the update date
    strong = soup.find('strong', string=re.compile('Updated '))
    date_text = re.search(r'[A-Z][a-z][a-z]+ \d(\d)?, 20\d\d',
                          strong.text).group()

    # Find the total number of confirmed cases
    strong = soup.find('strong',
                       string=re.compile(r'Total positive( cases)?:'))
    num_cases = raw_string_to_int(str(strong.next_sibling))

    # Find the total number of deaths
    strong = soup.find('strong', string=re.compile('(Total )?[Dd]eaths:'))
    num_deaths = raw_string_to_int(str(strong.next_sibling))

    date_obj = datetime.datetime.strptime(date_text, '%B %d, %Y').date()
    _logger.info(f'Processing data for {date_obj}')
    _logger.debug(f'Number Cases: {num_cases}')
    _logger.debug(f'Number Deaths: {num_deaths}')

    # Find the number of Black/AA cases and deaths
    table = soup.find('table', attrs={'id': 'raceethtable'})
    if not table:
        raise ValueError('Unable to locate race/ethnicity table')

    th = table.find(
        lambda elt: elt.name == 'th' and elt.text.find('Black') >= 0)
    if not th:
        raise ValueError('Unable to locate Black/AA data row')
    tds = th.find_next_siblings('td')
    cnt_aa_cases = raw_string_to_int(tds[0].text)
    cnt_aa_deaths = raw_string_to_int(tds[1].text)
    pct_aa_cases = to_percentage(cnt_aa_cases, num_cases)
    pct_aa_deaths = to_percentage(cnt_aa_deaths, num_deaths)
    _logger.debug(f'Number Black/AA Cases: {cnt_aa_cases}')
    _logger.debug(f'Number Black/AA Deaths: {cnt_aa_deaths}')

    return [
        self._make_series(
            date=date_obj,
            cases=num_cases,
            deaths=num_deaths,
            aa_cases=cnt_aa_cases,
            aa_deaths=cnt_aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=True,
        )
    ]
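
# --- Standalone example (illustrative only): pulling the <td> siblings of
# a row-header <th> with BeautifulSoup, on a made-up fragment shaped like
# the race/ethnicity table above.
from bs4 import BeautifulSoup

html = ('<table id="raceethtable"><tr>'
        '<th>Black or African American</th>'
        '<td>1,234</td><td>56</td></tr></table>')
soup = BeautifulSoup(html, 'html.parser')
th = soup.find(lambda elt: elt.name == 'th' and elt.text.find('Black') >= 0)
print([td.text for td in th.find_next_siblings('td')])  # ['1,234', '56']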
def get_daily_url(reporting_url):
    """Fetch the main reporting URL and search for the latest PDF."""
    disaster_covid_soup = url_to_soup(reporting_url)
    find_txt = 'COVID-19 Data - Daily Report'
    link = disaster_covid_soup.find(
        lambda tag: tag.has_attr('href') and re.search(find_txt, tag.text))
    if not link:
        raise ValueError('Unable to find Daily Report Archive link')
    # The daily report URL is often relative; urljoin resolves it
    # against the reporting page's URL.
    return urljoin(reporting_url, link.get('href'))
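
# --- Standalone example (illustrative only): why urljoin is needed. The
# URLs below are placeholders. Relative and root-relative hrefs resolve
# differently against the page URL.
from urllib.parse import urljoin

print(urljoin('https://example.gov/covid/reports.html', 'daily/report.pdf'))
# -> https://example.gov/covid/daily/report.pdf
print(urljoin('https://example.gov/covid/reports.html', '/daily/report.pdf'))
# -> https://example.gov/daily/report.pdf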
def _scrape(self, **kwargs):
    soup = url_to_soup(self.DATA_URL)

    # Find the update date.
    # The headers don't include a Last-Modified, and the page does
    # not indicate when it was updated. As a hack based on the
    # description in the page content, assume the page is modified
    # at 10 AM Mountain time.
    now = datetime.datetime.now(tz=pytz.timezone('US/Mountain'))
    if now.hour >= 10:
        date = now.date()
    else:
        date = now.date() - datetime.timedelta(days=1)
    _logger.info(f'Processing data for {date}')

    # Find the summary table and extract the death count
    total_deaths = raw_string_to_int(
        soup.find('td', string=re.compile(
            'Number of Deaths')).find_next_sibling('td').text.strip())

    # Find the demographics table and extract the data
    table = soup.find(
        'th', string=re.compile('Race and Ethnicity')).find_parent('table')
    aa_cases = raw_string_to_int(
        table.find('td', string=re.compile('Black or African American')).
        find_next_sibling('td').text.strip().split(' ')[0])
    total_cases = raw_string_to_int(
        table.find('td', string=re.compile('Total')).find_next_sibling(
            'td').text.strip().split(' ')[0])
    aa_cases_pct = to_percentage(aa_cases, total_cases)

    # Missing data
    nan = float('nan')
    aa_deaths = nan
    aa_deaths_pct = nan

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=True,
        )
    ]
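
# --- Standalone example (illustrative only): the 10 AM Mountain cutoff
# above, factored over a fixed timestamp so the rule can be tested.
import datetime
import pytz

def effective_date(now):
    if now.hour >= 10:
        return now.date()
    return now.date() - datetime.timedelta(days=1)

mountain = pytz.timezone('US/Mountain')
print(effective_date(mountain.localize(datetime.datetime(2020, 7, 4, 9, 59))))
# -> 2020-07-03
print(effective_date(mountain.localize(datetime.datetime(2020, 7, 4, 10, 0))))
# -> 2020-07-04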
def _scrape(self, **kwargs):
    # Find the latest report
    soup = url_to_soup(self.REPORTING_URL)
    by_dem_path = soup.find('a',
                            text='Cases by Demographics Statewide')['href']

    # Extract the report date
    year, month, day = map(
        int, re.search(r'(\d{4})-(\d{2})-(\d{2})', by_dem_path).groups())
    date_published = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date_published}')

    # Load the data
    by_dem_url = urljoin(self.REPORTING_URL, by_dem_path)
    by_dem = pd.read_excel(by_dem_url)

    # Drop probable cases
    by_dem = by_dem[by_dem['CASE_STATUS'] == 'Confirmed']
    by_dem['Cases'] = by_dem['Cases'].str.replace('Suppressed',
                                                  '0').astype(int)
    by_dem['Deaths'] = by_dem['Deaths'].str.replace('Suppressed',
                                                    '0').astype(int)
    by_race = by_dem[['RaceCat', 'Cases', 'Deaths']].groupby('RaceCat').sum()

    total = by_race.sum(axis=0)
    total_cases = total['Cases']
    total_deaths = total['Deaths']
    aa_cases = by_race.loc['Black/African American', 'Cases']
    aa_cases_pct = to_percentage(aa_cases, total_cases)
    aa_deaths = by_race.loc['Black/African American', 'Deaths']
    aa_deaths_pct = to_percentage(aa_deaths, total_deaths)

    return [
        self._make_series(
            date=date_published,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )
    ]
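
# --- Standalone example (illustrative only): the 'Suppressed' handling
# above on a made-up column. Suppressed cells become zero, so the
# resulting totals are a lower bound.
import pandas as pd

s = pd.Series(['12', 'Suppressed', '3'])
print(s.str.replace('Suppressed', '0').astype(int).sum())  # 15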
def _scrape(self, **kwargs):
    # Extract publication date
    soup = url_to_soup(self.METADATA_URL)
    heading = soup.find(
        'a', href='/coronavirus/TexasCOVID19Demographics.xlsx.asp').parent
    month, day, year = map(
        int,
        re.search(r'(\d\d?)/(\d\d?)/(\d\d\d\d)', heading.text).groups())
    date = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date}')

    data = get_content(self.DATA_URL)
    cases_df = pd.read_excel(BytesIO(data),
                             sheet_name='Cases by RaceEthnicity',
                             header=0, index_col=0)
    cnt_cases = cases_df.loc['Total', 'Number']
    cnt_cases_aa = cases_df.loc['Black', 'Number']
    pct_cases_aa = round(cases_df.loc['Black', '%'], 2)

    deaths_df = pd.read_excel(BytesIO(data),
                              sheet_name='Fatalities by Race-Ethnicity',
                              header=0, index_col=0)
    # Some index labels carry stray whitespace
    deaths_df.index = deaths_df.index.str.strip()
    cnt_deaths = deaths_df.loc['Total', 'Number']
    cnt_deaths_aa = deaths_df.loc['Black', 'Number']
    pct_deaths_aa = round(deaths_df.loc['Black', '%'], 2)

    return [
        self._make_series(
            date=date,
            cases=cnt_cases,
            deaths=cnt_deaths,
            aa_cases=cnt_cases_aa,
            aa_deaths=cnt_deaths_aa,
            pct_aa_cases=pct_cases_aa,
            pct_aa_deaths=pct_deaths_aa,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )
    ]
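
# --- Standalone example (illustrative only): reading one sheet with a
# labeled index and stripping stray whitespace from index labels. The
# workbook is built in memory with made-up numbers (assumes openpyxl is
# installed).
import pandas as pd
from io import BytesIO

buf = BytesIO()
with pd.ExcelWriter(buf, engine='openpyxl') as writer:
    pd.DataFrame({'Number': [100, 20], '%': [100.0, 20.0]},
                 index=pd.Index(['Total', 'Black '], name='Race/Ethnicity')
                 ).to_excel(writer, sheet_name='Cases by RaceEthnicity')
df = pd.read_excel(BytesIO(buf.getvalue()),
                   sheet_name='Cases by RaceEthnicity',
                   header=0, index_col=0)
df.index = df.index.str.strip()   # 'Black ' -> 'Black'
print(df.loc['Black', 'Number'])  # 20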
def _scrape(self, **kwargs):
    # Download the data
    soup = url_to_soup(self.REPORT_URL)

    # Find the Google Sheet and rewrite its URL into an export URL
    url = soup.find('a', string=re.compile('Google Sheet', re.I))['href']
    url = re.sub(r'(.*)/edit\b.*', r'\1/export?format=xlsx', url)
    _logger.debug(f'Sheets URL is {url}')

    counties = pd.read_excel(url, sheet_name='cases_by_county')
    total_deaths = counties['DEATHS'].sum()

    table = pd.read_excel(url, sheet_name='cases_by_race', index_col=0)
    total_cases = table['CASES'].sum()
    known_cases = table['CASES'].drop('Not disclosed').sum()

    date = table['DATA_REFRESH_DT'].max().date()
    _logger.info(f'Processing data for {date}')

    aa_cases_cnt = table.loc['Black or African American', 'CASES']
    aa_cases_pct = to_percentage(aa_cases_cnt, known_cases)

    # No race breakdowns for deaths
    aa_deaths_cnt = float('nan')
    aa_deaths_pct = float('nan')
    known_deaths = float('nan')

    return [self._make_series(
        date=date,
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases_cnt,
        aa_deaths=aa_deaths_cnt,
        pct_aa_cases=aa_cases_pct,
        pct_aa_deaths=aa_deaths_pct,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_cases,
        known_race_deaths=known_deaths,
    )]
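
# --- Standalone example (illustrative only): rewriting a Google Sheets
# 'edit' URL into an xlsx export URL, on a made-up sheet id.
import re

url = 'https://docs.google.com/spreadsheets/d/abc123/edit#gid=0'
print(re.sub(r'(.*)/edit\b.*', r'\1/export?format=xlsx', url))
# -> https://docs.google.com/spreadsheets/d/abc123/export?format=xlsx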