def _scrape(self, **kwargs):
    # HACK ALERT:
    # The main page dynamically adds JavaScript to insert and
    # submit (POST) a form with the field "_pd" set. The POST body
    # must be MIME type multipart/form-data rather than the
    # requests default application/x-www-form-urlencoded. We can
    # make requests generate this by using the files argument
    # instead of data for the form data. Using a file name key of
    # None prevents the extraneous name from being included in the
    # call.
    soup = url_to_soup(self.DATA_URL, method='POST',
                       files={None: b'_pd'})

    # Find the update date
    last_updated_text = soup.find('strong',
                                  string=re.compile('Last Updated'))
    month, day, year = map(
        int,
        re.search(r'(\d\d?)/(\d\d?)/(\d\d\d\d)',
                  last_updated_text.parent.text).groups())
    date = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date}')

    # Load the cases by race/ethnicity table
    cases_div = soup.find(id='pnlConfirmedCasesByRaceTbl')
    cases = table_to_dataframe(
        cases_div.find('table')).set_index('Race/Ethnicity')

    # Fix column names: drop footnote text after the non-breaking space
    cases.columns = cases.columns.str.replace('\xa0.*', '', regex=True)

    # Extract the data
    total_cases = cases.loc['Total Number of Cases', 'Confirmed Cases']
    known_cases = cases.loc['Total with Race/Ethnicity Available',
                            'Confirmed Cases']
    aa_cases = cases.loc['Non-Hispanic Black', 'Confirmed Cases']
    aa_cases_pct = to_percentage(aa_cases, known_cases)

    deaths_div = soup.find(id='pnlDeathsByRaceTbl')
    deaths = table_to_dataframe(
        deaths_div.find('table')).set_index('Race/Ethnicity')
    deaths.columns = deaths.columns.str.replace('\xa0.*', '', regex=True)

    total_deaths = deaths.loc['Total Number of Deaths', 'Deaths']
    known_deaths = deaths.loc['Total with Race/Ethnicity Available',
                              'Deaths']
    aa_deaths = deaths.loc['Non-Hispanic Black', 'Deaths']
    aa_deaths_pct = to_percentage(aa_deaths, known_deaths)

    return [
        self._make_series(
            date=date,
            cases=_maybe_int(total_cases),
            deaths=_maybe_int(total_deaths),
            aa_cases=_maybe_int(aa_cases),
            aa_deaths=_maybe_int(aa_deaths),
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
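
# --- Standalone example (not part of the scraper): a minimal sketch of the
# files={None: ...} multipart trick above, using requests directly. The URL
# is a placeholder; no request is sent, we only prepare one to inspect what
# requests would generate.
import requests

req = requests.Request('POST', 'https://example.com/', files={None: b'_pd'})
prepped = req.prepare()
print(prepped.headers['Content-Type'])  # multipart/form-data; boundary=...
print(prepped.body[:80])                # the b'_pd' payload, with no field name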
def _scrape(self, **kwargs):
    soup = url_to_soup(self.URL, local_file_name='nc_soup')
    demographic_df = get_demographic_dataframe()

    date = self.get_date(soup)
    _logger.info(f'Processing data for {date}')

    cases = self.get_total_cases(soup)
    deaths = self.get_total_deaths(soup)
    known_cases = cases - self.get_missing_cases(demographic_df)
    known_deaths = deaths - self.get_missing_deaths(demographic_df)

    aa_cases = self.get_aa_cases(demographic_df)
    aa_deaths = self.get_aa_deaths(demographic_df)
    pct_aa_cases = to_percentage(aa_cases, known_cases)
    pct_aa_deaths = to_percentage(aa_deaths, known_deaths)

    return [
        self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
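
# --- Standalone example (illustrative only): the known-race denominator
# arithmetic used above, with made-up counts and a stand-in for the
# project's to_percentage helper (assumed here to round to two decimals).
def to_percentage_sketch(part, whole):
    return round(part / whole * 100, 2)

cases, missing_race_cases = 1000, 150
known_cases = cases - missing_race_cases       # 850 cases with race reported
print(to_percentage_sketch(200, known_cases))  # 23.53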
def _scrape(self, **kwargs):
    r = get_cached_url(self.JS_URL)
    json_str = re.search(r'data = (([^;]|\n)*)',
                         r.text, re.MULTILINE).group(1).strip()
    # Commas on the last item in a list or object are valid in
    # JavaScript, but not in JSON.
    json_str = re.sub(r',(\s|\n)*([]}]|$)', r'\2', json_str,
                      flags=re.MULTILINE)
    _logger.debug(f'Extracted JSON: {json_str}')
    data = json.loads(json_str)['content']

    # Find the update date
    month, day, year = map(
        int,
        re.search(r'(\d{2})/(\d{2})/(\d{4})', data['info']).groups())
    date = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date}')

    # Extract the total counts
    total_cases = raw_string_to_int(data['count'])
    total_deaths = raw_string_to_int(data['death'])

    # Fetch the HTML page
    soup = url_to_soup(self.DATA_URL)

    # Extract the Black/AA counts
    cases = self._extract_by_race_table(soup.find(id='race'))
    deaths = self._extract_by_race_table(soup.find(id='race-d'))
    _logger.debug(f'cases: {cases}')
    _logger.debug(f'deaths: {deaths}')

    known_cases = cases.drop('Under Investigation')['count'].sum()
    known_deaths = deaths.drop('Under Investigation')['count'].sum()
    aa_cases = cases.loc['Black', 'count'].sum()
    aa_deaths = deaths.loc['Black', 'count'].sum()
    aa_cases_pct = to_percentage(aa_cases, known_cases)
    aa_deaths_pct = to_percentage(aa_deaths, known_deaths)

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )
    ]
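
# --- Standalone example (illustrative only): the trailing-comma cleanup
# above, applied to a made-up JavaScript object literal. JSON forbids a
# comma before ']' or '}', so the regex drops it. The comma inside "1,234"
# is untouched because it is not followed by a closing bracket.
import json
import re

js = '{"content": {"count": "1,234", "death": "56",},}'
cleaned = re.sub(r',(\s|\n)*([]}]|$)', r'\2', js, flags=re.MULTILINE)
print(json.loads(cleaned))  # {'content': {'count': '1,234', 'death': '56'}}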
def _scrape(self, **kwargs):
    soup = url_to_soup(self.DATA_URL)

    # Extract publication date
    overview = soup.find(id='overview-of-covid-19-surveillance')
    date_str = re.search(r'Report Date: ([A-Za-z]+ \d+, \d+)',
                         overview.text).group(1)
    date = datetime.datetime.strptime(date_str, '%B %d, %Y').date()
    _logger.info(f'Processing data for {date}')

    # Extract demographic and total data
    race_data = json.loads(
        soup.find(
            id='cases-hospitalizations-and-deaths-by-raceethnicity'
        ).find('script', {'type': 'application/json'}).string)
    headers = [th.string.strip() for th in BeautifulSoup(
        race_data['x']['container'], features='lxml').find_all('th')]
    race_df = pd.DataFrame(race_data['x']['data']).T
    race_df.columns = headers
    race_df = race_df.set_index('Race/Ethnicity')

    # Suppressed counts appear as HTML-escaped values such as '&lt;5';
    # decode the entity.
    race_df['Cases'] = race_df['Cases'].astype(str).str.replace('&lt;', '<')
    race_df['Deaths'] = race_df['Deaths'].astype(str).str.replace('&lt;', '<')

    cnt_cases = race_df.loc['Statewide', 'Cases']
    cnt_deaths = race_df.loc['Statewide', 'Deaths']
    cnt_cases_aa = race_df.loc['Black/African American', 'Cases']
    cnt_deaths_aa = race_df.loc['Black/African American', 'Deaths']
    pct_cases_aa = float(str(
        race_df.loc['Black/African American', '% of Cases']).replace('%', ''))
    try:
        pct_deaths_aa = to_percentage(int(cnt_deaths_aa), int(cnt_deaths))
    except ValueError:
        # Suppressed counts (e.g. '<5') cannot be converted to int.
        pct_deaths_aa = float('nan')

    return [self._make_series(
        date=date,
        cases=cnt_cases,
        deaths=cnt_deaths,
        aa_cases=cnt_cases_aa,
        aa_deaths=cnt_deaths_aa,
        pct_aa_cases=pct_cases_aa,
        pct_aa_deaths=pct_deaths_aa,
        pct_includes_unknown_race=True,
        pct_includes_hispanic_black=False,
    )]
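
# --- Standalone example (illustrative only): the .T transpose above implies
# the widget's 'data' arrives as a list of *columns* rather than rows. A
# made-up payload in that shape:
import pandas as pd

payload = {'x': {'data': [['Statewide', 'Black/African American'],
                          ['1000', '200'],
                          ['50', '10']]}}
df = pd.DataFrame(payload['x']['data']).T
df.columns = ['Race/Ethnicity', 'Cases', 'Deaths']
print(df.set_index('Race/Ethnicity').loc['Black/African American', 'Cases'])
# -> 200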
def _scrape(self, **kwargs):
    soup = url_to_soup(self.URL)
    date = self.get_date(soup)
    cases = self.get_cases(soup)
    deaths = self.get_deaths(soup)
    return [self._make_series(
        date=date,
        cases=cases,
        deaths=deaths,
    )]
def _scrape(self, **kwargs):
    soup = url_to_soup(self.REPORTING_URL)

    # Find the update date
    strong = soup.find('strong', string=re.compile('Updated '))
    date_text = re.search(r'[A-Z][a-z][a-z]+ \d(\d)?, 20\d\d',
                          strong.text).group()

    # Find the total number of confirmed cases
    strong = soup.find('strong',
                       string=re.compile(r'Total positive( cases)?:'))
    num_cases = raw_string_to_int(str(strong.next_sibling))

    # Find the total number of deaths
    strong = soup.find('strong', string=re.compile('(Total )?[Dd]eaths:'))
    num_deaths = raw_string_to_int(str(strong.next_sibling))

    date_obj = datetime.datetime.strptime(date_text, '%B %d, %Y').date()
    _logger.info(f'Processing data for {date_obj}')
    _logger.debug(f'Number Cases: {num_cases}')
    _logger.debug(f'Number Deaths: {num_deaths}')

    # Find the number of Black/AA cases and deaths
    table = soup.find('table', attrs={'id': 'raceethtable'})
    if not table:
        raise ValueError('Unable to locate race/ethnicity table')

    th = table.find(
        lambda elt: elt.name == 'th' and elt.text.find('Black') >= 0)
    if not th:
        raise ValueError('Unable to locate Black/AA data row')
    tds = th.find_next_siblings('td')
    cnt_aa_cases = raw_string_to_int(tds[0].text)
    cnt_aa_deaths = raw_string_to_int(tds[1].text)
    pct_aa_cases = to_percentage(cnt_aa_cases, num_cases)
    pct_aa_deaths = to_percentage(cnt_aa_deaths, num_deaths)
    _logger.debug(f'Number Black/AA Cases: {cnt_aa_cases}')
    _logger.debug(f'Number Black/AA Deaths: {cnt_aa_deaths}')

    return [
        self._make_series(
            date=date_obj,
            cases=num_cases,
            deaths=num_deaths,
            aa_cases=cnt_aa_cases,
            aa_deaths=cnt_aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=True,
        )
    ]
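
# --- Standalone example (illustrative only): pulling the <td> siblings of
# a row-header <th> with BeautifulSoup, on a made-up fragment shaped like
# the race/ethnicity table above.
from bs4 import BeautifulSoup

html = ('<table id="raceethtable"><tr>'
        '<th>Black or African American</th>'
        '<td>1,234</td><td>56</td></tr></table>')
soup = BeautifulSoup(html, 'html.parser')
th = soup.find(lambda elt: elt.name == 'th' and elt.text.find('Black') >= 0)
print([td.text for td in th.find_next_siblings('td')])  # ['1,234', '56']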
def get_daily_url(reporting_url):
    """Fetch the main reporting URL and search for the latest PDF."""
    disaster_covid_soup = url_to_soup(reporting_url)
    find_txt = 'COVID-19 Data - Daily Report'
    link = disaster_covid_soup.find(
        lambda tag: tag.has_attr('href') and re.search(find_txt, tag.text))
    if not link:
        raise ValueError('Unable to find Daily Report Archive link')
    # The daily report URL is often relative; urljoin resolves it
    # against the reporting page's URL.
    return urljoin(reporting_url, link.get('href'))
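
# --- Standalone example (illustrative only): why urljoin is needed. The
# URLs below are placeholders. Relative and root-relative hrefs resolve
# differently against the page URL.
from urllib.parse import urljoin

print(urljoin('https://example.gov/covid/reports.html', 'daily/report.pdf'))
# -> https://example.gov/covid/daily/report.pdf
print(urljoin('https://example.gov/covid/reports.html', '/daily/report.pdf'))
# -> https://example.gov/daily/report.pdf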
def _scrape(self, **kwargs):
    soup = url_to_soup(self.DATA_URL)

    # Find the update date.
    # The headers don't include a Last-Modified, and the page does
    # not indicate when it was updated. As a hack based on the
    # description in the page content, assume the page is modified
    # at 10 AM Mountain time.
    now = datetime.datetime.now(tz=pytz.timezone('US/Mountain'))
    if now.hour >= 10:
        date = now.date()
    else:
        date = now.date() - datetime.timedelta(days=1)
    _logger.info(f'Processing data for {date}')

    # Find the summary table and extract the death count
    total_deaths = raw_string_to_int(
        soup.find('td', string=re.compile(
            'Number of Deaths')).find_next_sibling('td').text.strip())

    # Find the demographics table and extract the data
    table = soup.find(
        'th', string=re.compile('Race and Ethnicity')).find_parent('table')
    aa_cases = raw_string_to_int(
        table.find('td', string=re.compile('Black or African American')).
        find_next_sibling('td').text.strip().split(' ')[0])
    total_cases = raw_string_to_int(
        table.find('td', string=re.compile('Total')).find_next_sibling(
            'td').text.strip().split(' ')[0])
    aa_cases_pct = to_percentage(aa_cases, total_cases)

    # Missing data
    nan = float('nan')
    aa_deaths = nan
    aa_deaths_pct = nan

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=True,
        )
    ]
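
# --- Standalone example (illustrative only): the 10 AM Mountain cutoff
# above, factored over a fixed timestamp so the rule can be tested.
import datetime
import pytz

def effective_date(now):
    if now.hour >= 10:
        return now.date()
    return now.date() - datetime.timedelta(days=1)

mountain = pytz.timezone('US/Mountain')
print(effective_date(mountain.localize(datetime.datetime(2020, 7, 4, 9, 59))))
# -> 2020-07-03
print(effective_date(mountain.localize(datetime.datetime(2020, 7, 4, 10, 0))))
# -> 2020-07-04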
def _scrape(self, **kwargs):
    # Find the latest report
    soup = url_to_soup(self.REPORTING_URL)
    by_dem_path = soup.find('a',
                            text='Cases by Demographics Statewide')['href']

    # Extract the report date
    year, month, day = map(
        int, re.search(r'(\d{4})-(\d{2})-(\d{2})', by_dem_path).groups())
    date_published = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date_published}')

    # Load the data
    by_dem_url = urljoin(self.REPORTING_URL, by_dem_path)
    by_dem = pd.read_excel(by_dem_url)

    # Drop probable cases
    by_dem = by_dem[by_dem['CASE_STATUS'] == 'Confirmed']
    by_dem['Cases'] = by_dem['Cases'].str.replace('Suppressed',
                                                  '0').astype(int)
    by_dem['Deaths'] = by_dem['Deaths'].str.replace('Suppressed',
                                                    '0').astype(int)
    by_race = by_dem[['RaceCat', 'Cases', 'Deaths']].groupby('RaceCat').sum()

    total = by_race.sum(axis=0)
    total_cases = total['Cases']
    total_deaths = total['Deaths']
    aa_cases = by_race.loc['Black/African American', 'Cases']
    aa_cases_pct = to_percentage(aa_cases, total_cases)
    aa_deaths = by_race.loc['Black/African American', 'Deaths']
    aa_deaths_pct = to_percentage(aa_deaths, total_deaths)

    return [
        self._make_series(
            date=date_published,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )
    ]
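
# --- Standalone example (illustrative only): the 'Suppressed' handling
# above on a made-up column. Suppressed cells become zero, so the
# resulting totals are a lower bound.
import pandas as pd

s = pd.Series(['12', 'Suppressed', '3'])
print(s.str.replace('Suppressed', '0').astype(int).sum())  # 15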
def _scrape(self, **kwargs):
    # Extract publication date
    soup = url_to_soup(self.METADATA_URL)
    heading = soup.find(
        'a', href='/coronavirus/TexasCOVID19Demographics.xlsx.asp').parent
    month, day, year = map(
        int,
        re.search(r'(\d\d?)/(\d\d?)/(\d\d\d\d)', heading.text).groups())
    date = datetime.date(year, month, day)
    _logger.info(f'Processing data for {date}')

    data = get_content(self.DATA_URL)
    cases_df = pd.read_excel(BytesIO(data),
                             sheet_name='Cases by RaceEthnicity',
                             header=0, index_col=0)
    cnt_cases = cases_df.loc['Total', 'Number']
    cnt_cases_aa = cases_df.loc['Black', 'Number']
    pct_cases_aa = round(cases_df.loc['Black', '%'], 2)

    deaths_df = pd.read_excel(BytesIO(data),
                              sheet_name='Fatalities by Race-Ethnicity',
                              header=0, index_col=0)
    # Some index labels carry stray whitespace
    deaths_df.index = deaths_df.index.str.strip()
    cnt_deaths = deaths_df.loc['Total', 'Number']
    cnt_deaths_aa = deaths_df.loc['Black', 'Number']
    pct_deaths_aa = round(deaths_df.loc['Black', '%'], 2)

    return [
        self._make_series(
            date=date,
            cases=cnt_cases,
            deaths=cnt_deaths,
            aa_cases=cnt_cases_aa,
            aa_deaths=cnt_deaths_aa,
            pct_aa_cases=pct_cases_aa,
            pct_aa_deaths=pct_deaths_aa,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )
    ]
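
# --- Standalone example (illustrative only): reading one sheet with a
# labeled index and stripping stray whitespace from index labels. The
# workbook is built in memory with made-up numbers (assumes openpyxl is
# installed).
import pandas as pd
from io import BytesIO

buf = BytesIO()
with pd.ExcelWriter(buf, engine='openpyxl') as writer:
    pd.DataFrame({'Number': [100, 20], '%': [100.0, 20.0]},
                 index=pd.Index(['Total', 'Black '], name='Race/Ethnicity')
                 ).to_excel(writer, sheet_name='Cases by RaceEthnicity')
df = pd.read_excel(BytesIO(buf.getvalue()),
                   sheet_name='Cases by RaceEthnicity',
                   header=0, index_col=0)
df.index = df.index.str.strip()   # 'Black ' -> 'Black'
print(df.loc['Black', 'Number'])  # 20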
def _scrape(self, **kwargs):
    # Download the data
    soup = url_to_soup(self.REPORT_URL)

    # Find the Google Sheet and rewrite its URL into an export URL
    url = soup.find('a', string=re.compile('Google Sheet', re.I))['href']
    url = re.sub(r'(.*)/edit\b.*', r'\1/export?format=xlsx', url)
    _logger.debug(f'Sheets URL is {url}')

    counties = pd.read_excel(url, sheet_name='cases_by_county')
    total_deaths = counties['DEATHS'].sum()

    table = pd.read_excel(url, sheet_name='cases_by_race', index_col=0)
    total_cases = table['CASES'].sum()
    known_cases = table['CASES'].drop('Not disclosed').sum()

    date = table['DATA_REFRESH_DT'].max().date()
    _logger.info(f'Processing data for {date}')

    aa_cases_cnt = table.loc['Black or African American', 'CASES']
    aa_cases_pct = to_percentage(aa_cases_cnt, known_cases)

    # No race breakdowns for deaths
    aa_deaths_cnt = float('nan')
    aa_deaths_pct = float('nan')
    known_deaths = float('nan')

    return [self._make_series(
        date=date,
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases_cnt,
        aa_deaths=aa_deaths_cnt,
        pct_aa_cases=aa_cases_pct,
        pct_aa_deaths=aa_deaths_pct,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_cases,
        known_race_deaths=known_deaths,
    )]
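
# --- Standalone example (illustrative only): rewriting a Google Sheets
# 'edit' URL into an xlsx export URL, on a made-up sheet id.
import re

url = 'https://docs.google.com/spreadsheets/d/abc123/edit#gid=0'
print(re.sub(r'(.*)/edit\b.*', r'\1/export?format=xlsx', url))
# -> https://docs.google.com/spreadsheets/d/abc123/export?format=xlsx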