def _scrape(self, **kwargs):
        """Scrape totals and Black/AA case & death counts via the session API.

        Percentages are against all reported cases/deaths (unknown race
        excluded from the denominator is False here) and include
        Hispanic Black.
        """
        session = self.setup_session()
        assert session.x_session_id, 'No X-Session-Id found'

        cases_by_race = self.get_demographic_cases_df(session.x_session_id)
        deaths_by_race = self.get_demographic_deaths_df(session.x_session_id)

        date = self.get_date()
        total_cases = self.get_total_cases(session.x_session_id)
        # Total deaths are not exposed directly; sum the demographic table.
        total_deaths = int(deaths_by_race['Deaths'].sum())
        black_cases = int(
            cases_by_race.loc['Black or African American', 'Count'])
        black_deaths = int(
            deaths_by_race.loc['Black or African American', 'Deaths'])

        return [self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=black_cases,
            aa_deaths=black_deaths,
            pct_aa_cases=to_percentage(black_cases, total_cases),
            pct_aa_deaths=to_percentage(black_deaths, total_deaths),
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
        )]
Exemple #2
0
def test_california_sf():
    """San Francisco scraper against mocked Tableau/selenium-wire responses."""
    def mock_request(blob_name):
        # One captured response body per intercepted request.
        return util.MockSeleniumWireRequest(
            response_body=loader.get_blob(blob_name))

    mocked_requests = {
        'date': mock_request('california_san_francisco_date.txt'),
        'cases_by_race': mock_request('california_san_francisco_cases.txt'),
        'deaths_by_race': mock_request('california_san_francisco_deaths.txt'),
    }
    mocked_webdriver = util.mocked_webdriver_runner(requests=mocked_requests)

    expected = {
        'Date Published': datetime(2020, 7, 17).date(),
        'Total Cases': 5100,
        'Total Deaths': 130,
        'Count Cases Black/AA': 300,
        'Count Deaths Black/AA': 5,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': False,
        'Pct Cases Black/AA': to_percentage(300, 4400),
        'Pct Deaths Black/AA': to_percentage(5, 30),
        'Count Cases Known Race': 4400,
        'Count Deaths Known Race': 30,
    }

    with mock.patch('covid19_scrapers.states.california_san_francisco.WebdriverRunner', mocked_webdriver):
        util.run_scraper_and_assert(
            scraper_cls=CaliforniaSanFrancisco,
            assertions=expected)
    def _scrape(self, **kwargs):
        """Scrape totals and Black/AA counts from the two Excel feeds.

        Percentages are against all cases/deaths (unknown race included)
        and include Hispanic Black.
        """
        # Daily totals feed: the last row is the most recent report.
        totals_df = pd.read_excel(
            get_content_as_file(self.BASE_URL.format(self.CASES_SUFFIX)))
        latest = totals_df.iloc[-1]
        date = latest['DATE'].to_pydatetime().date()
        cases = int(latest['TOTAL_CASES'])
        deaths = int(latest['TOTAL_DEATHS'])

        # Demographics feed: pick the Black/AA race row matching `date`.
        demog_df = pd.read_excel(
            get_content_as_file(self.BASE_URL.format(self.DEMOGRAPHIC_SUFFIX)))
        is_aa_row = ((demog_df['Category'] == 'RACE')
                     & (demog_df['CAT_DETAIL'] == 'Black or African American')
                     & (demog_df['Date'] == str(date)))
        aa_row = demog_df[is_aa_row].iloc[0]
        aa_cases = int(aa_row['Cat_CaseCount'])
        aa_deaths = int(aa_row['CAT_DEATHCOUNT'])

        return [
            self._make_series(date=date,
                              cases=cases,
                              deaths=deaths,
                              aa_cases=aa_cases,
                              aa_deaths=aa_deaths,
                              pct_aa_cases=to_percentage(aa_cases, cases),
                              pct_aa_deaths=to_percentage(aa_deaths, deaths),
                              pct_includes_unknown_race=True,
                              pct_includes_hispanic_black=True)
        ]
def test_delaware():
    """Delaware scraper end-to-end against the mocked 2020-07-16 snapshot."""
    expected = {
        'Date Published': datetime(2020, 7, 16).date(),
        'Total Cases': 13000,
        'Total Deaths': 500,
        'Count Cases Black/AA': 3000,
        'Count Deaths Black/AA': 100,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': False,
        'Pct Cases Black/AA': to_percentage(3000, 12000),
        'Pct Deaths Black/AA': to_percentage(100, 480),
        'Count Cases Known Race': 12000,
        'Count Deaths Known Race': 480,
    }
    util.run_scraper_and_assert(scraper_cls=Delaware, assertions=expected)
Exemple #5
0
    def _scrape(self, **kwargs):
        """Scrape two Tableau dashboards for totals and Black counts.

        One browser run visits the cases dashboard, captures its Tableau
        data request, clears history, then does the same for the deaths
        dashboard. Percentages use known-race denominators only.
        """
        runner = WebdriverRunner()
        # Waiting for 58 (cases) / 29 (deaths) canvases ensures the
        # dashboards have fully rendered before capturing their requests.
        results = runner.run(WebdriverSteps().go_to_url(
            self.CASES_URL).wait_for_number_of_elements(
                (By.XPATH, '//canvas'), 58).find_request(
                    'cases', find_by=tableau.find_tableau_request
                ).clear_request_history().go_to_url(
                    self.DEATHS_URL).wait_for_number_of_elements(
                        (By.XPATH, '//canvas'),
                        29).find_request('deaths',
                                         find_by=tableau.find_tableau_request))

        parser = tableau.TableauParser(request=results.requests['cases'])

        # Date string is formatted like 'Friday, July 17, 2020'
        # (per the strptime format below).
        raw_date_str = pydash.head(
            parser.extract_data_from_key('cases')['ATTR(Date Updated)'])
        date = datetime.strptime(raw_date_str, '%A, %B %d, %Y').date()

        # Total cases = lab-confirmed + probable.
        confirmed_cases = pydash.head(
            parser.extract_data_from_key('cases')
            ['SUM(# Lab Confirmed Cases)'])
        probable_cases = pydash.head(
            parser.extract_data_from_key('probable cases')['SUM(# probable)'])
        cases = confirmed_cases + probable_cases
        cases_df = pd.DataFrame.from_dict(
            parser.extract_data_from_key('raceth')).set_index('sub-category')
        aa_cases = cases_df.loc['Black']['SUM(count)']
        # Known-race cases exclude the 'unknown' bucket.
        known_race_cases = cases - cases_df.loc['unknown']['SUM(count)']

        parser = tableau.TableauParser(request=results.requests['deaths'])
        deaths = pydash.head(
            parser.extract_data_from_key('death (2)')
            ['SUM(# lab confirmed deaths)'])
        # The deaths dashboard only exposes fractions by race; convert
        # them to approximate counts by scaling against total deaths.
        deaths_df = pd.DataFrame.from_dict(
            parser.extract_data_from_key('raceth (death)')).set_index(
                'sub-category')
        deaths_df = deaths_df.assign(Count=[
            round(v * deaths) for v in deaths_df['SUM(% of deaths)'].values
        ])
        aa_deaths = deaths_df.loc['Black']['Count']
        known_race_deaths = deaths - deaths_df.loc['unknown']['Count']

        pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
        pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

        return [
            self._make_series(
                date=date,
                cases=cases,
                deaths=deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=pct_aa_cases,
                pct_aa_deaths=pct_aa_deaths,
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=False,
                known_race_cases=known_race_cases,
                known_race_deaths=known_race_deaths,
            )
        ]
Exemple #6
0
    def _scrape(self, **kwargs):
        """Scrape race tables from the dynamically POSTed county page.

        Percentages use known-race denominators and exclude Hispanic
        Black (rows are 'Non-Hispanic Black').
        """
        # HACK ALERT:
        # The main page dynamically adds JavaScript to insert and
        # submit (POST) a form with the field "_pd" set. The POST body
        # must be mime type multipart/form-data rather than the
        # requests default application/x-www-form-urlencoded.  We can
        # make requests generate this by using the files argument
        # instead of data for the form data. Using a file name key of
        # None prevents the extraneous name from being included in the
        # call.
        soup = url_to_soup(self.DATA_URL, method='POST', files={None: b'_pd'})

        # Find the update date.  Allow one- or two-digit month and day:
        # the previous pattern r'(\d)/(\d\d)/(\d\d\d\d)' mis-matched
        # two-digit months (for '10/15/2020' it matched '0/15/2020',
        # yielding month 0 and a ValueError below).
        last_updated_text = soup.find('strong',
                                      string=re.compile('Last Updated'))
        month, day, year = map(
            int,
            re.search(r'(\d{1,2})/(\d{1,2})/(\d{4})',
                      last_updated_text.parent.text).groups())
        date = datetime.date(year, month, day)
        _logger.info(f'Processing data for {date}')

        # Load the cases by race/ethnicity table
        cases_div = soup.find(id='pnlConfirmedCasesByRaceTbl')
        cases = table_to_dataframe(
            cases_div.find('table')).set_index('Race/Ethnicity')
        # Strip footnote text after the non-breaking space in column
        # names.  regex=True is explicit: the pandas 2.0 default for
        # str.replace is a literal (non-regex) replacement.
        cases.columns = cases.columns.str.replace('\xa0.*', '', regex=True)
        # Extract the data
        total_cases = cases.loc['Total Number of Cases', 'Confirmed Cases']
        known_cases = cases.loc['Total with Race/Ethnicity Available',
                                'Confirmed Cases']
        aa_cases = cases.loc['Non-Hispanic Black', 'Confirmed Cases']
        aa_cases_pct = to_percentage(aa_cases, known_cases)

        # Same treatment for the deaths by race/ethnicity table.
        deaths_div = soup.find(id='pnlDeathsByRaceTbl')
        deaths = table_to_dataframe(
            deaths_div.find('table')).set_index('Race/Ethnicity')
        deaths.columns = deaths.columns.str.replace('\xa0.*', '', regex=True)
        total_deaths = deaths.loc['Total Number of Deaths', 'Deaths']
        known_deaths = deaths.loc['Total with Race/Ethnicity Available',
                                  'Deaths']
        aa_deaths = deaths.loc['Non-Hispanic Black', 'Deaths']
        aa_deaths_pct = to_percentage(aa_deaths, known_deaths)

        return [
            self._make_series(
                date=date,
                cases=_maybe_int(total_cases),
                deaths=_maybe_int(total_deaths),
                aa_cases=_maybe_int(aa_cases),
                aa_deaths=_maybe_int(aa_deaths),
                pct_aa_cases=aa_cases_pct,
                pct_aa_deaths=aa_deaths_pct,
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=False,
                known_race_cases=known_cases,
                known_race_deaths=known_deaths,
            )
        ]
    def _scrape(self, **kwargs):
        """Scrape the state JSON feed for totals and Black counts.

        Raises:
            AssertionError: if any expected field is missing (or zero —
            the truthiness asserts below intentionally treat 0 as
            missing, preserving the scraper's existing failure mode).
        """
        # Renamed from `json` to avoid shadowing the stdlib module name.
        payload = get_json(self.DATA_URL)

        # Most recent entry ('-1') of the state testing time series.
        state_info = pydash.get(payload, 'state_testing_results.values.-1')
        demographics_data = pydash.get(payload, 'demographics.race')
        # (Was a duplicated assignment: `aa_data = aa_data = ...`.)
        aa_data = pydash.find(
            demographics_data, lambda data: data['description'] == 'Black')

        date = datetime.strptime(state_info['testDate'], '%m/%d/%Y').date()
        cases = state_info.get('confirmed_cases')
        deaths = state_info.get('deaths')
        aa_cases = aa_data.get('count')
        aa_deaths = aa_data.get('deaths')

        assert cases, 'Could not find number of confirmed cases'
        assert deaths, 'Could not find number of deaths'
        assert aa_cases, 'Could not find number of AA cases'
        assert aa_deaths, 'Could not find number of AA deaths'

        # Percentages are against all cases/deaths (unknown race included).
        pct_aa_cases = to_percentage(aa_cases, cases)
        pct_aa_deaths = to_percentage(aa_deaths, deaths)

        return [
            self._make_series(date=date,
                              cases=cases,
                              deaths=deaths,
                              aa_cases=aa_cases,
                              aa_deaths=aa_deaths,
                              pct_aa_cases=pct_aa_cases,
                              pct_aa_deaths=pct_aa_deaths,
                              pct_includes_unknown_race=True,
                              pct_includes_hispanic_black=False)
        ]
def test_california_los_angeles():
    """Los Angeles scraper end-to-end against the mocked 2020-07-16 data."""
    expected = {
        'Date Published': datetime(2020, 7, 16).date(),
        'Total Cases': 150000,
        'Total Deaths': 4000,
        'Count Cases Black/AA': 4000,
        'Count Deaths Black/AA': 400,
        'Pct Includes Unknown Race': False,
        'Pct Includes Hispanic Black': False,
        'Pct Cases Black/AA': to_percentage(4000, 78500),
        'Pct Deaths Black/AA': to_percentage(400, 3650),
        'Count Cases Known Race': 78500,
        'Count Deaths Known Race': 3650,
    }
    util.run_scraper_and_assert(scraper_cls=CaliforniaLosAngeles,
                                assertions=expected)
Exemple #9
0
    def _scrape(self, **kwargs):
        """Scrape the demographics CSV packaged in the published zip.

        Percentages are against all cases/deaths (unknown race included)
        and include Hispanic Black.
        """
        _logger.debug('Download covid data zip file')
        z = get_zip(self.ZIP_URL)

        _logger.debug(
            'Get the last update of the demographics.csv file in archive')
        # The archive member's timestamp stands in for a publication date.
        date = get_zip_member_update_date(z, 'demographics.csv')
        _logger.info(f'Processing data for {date}')

        _logger.debug('Load demographics CSV')
        data = pd.read_csv(get_zip_member_as_file(z, 'demographics.csv'))
        race_totals = data.groupby('race')[['Confirmed_Cases', 'Deaths']].sum()
        overall = race_totals.sum(axis=0)
        total_cases = overall['Confirmed_Cases']
        total_deaths = overall['Deaths']
        _logger.debug('African American cases and deaths')
        # The exact row label varies; match on its 'African-American' prefix.
        aa_key = next(label for label in race_totals.index
                      if label.startswith('African-American'))
        aa_cases = race_totals.loc[aa_key, 'Confirmed_Cases']
        aa_deaths = race_totals.loc[aa_key, 'Deaths']

        return [self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=to_percentage(aa_cases, total_cases),
            pct_aa_deaths=to_percentage(aa_deaths, total_deaths),
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=True,
        )]
    def _scrape(self, **kwargs):
        """Scrape the dashboard page plus the demographics table.

        Percentages use known-race denominators (records with missing
        race subtracted) and include Hispanic Black.
        """
        soup = url_to_soup(self.URL, local_file_name='nc_soup')
        demographic_df = get_demographic_dataframe()

        date = self.get_date(soup)
        _logger.info(f'Processing data for {date}')
        cases = self.get_total_cases(soup)
        deaths = self.get_total_deaths(soup)
        # Known-race denominators: totals minus missing-race records.
        known_cases = cases - self.get_missing_cases(demographic_df)
        known_deaths = deaths - self.get_missing_deaths(demographic_df)
        aa_cases = self.get_aa_cases(demographic_df)
        aa_deaths = self.get_aa_deaths(demographic_df)

        return [
            self._make_series(
                date=date,
                cases=cases,
                deaths=deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=to_percentage(aa_cases, known_cases),
                pct_aa_deaths=to_percentage(aa_deaths, known_deaths),
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=True,
                known_race_cases=known_cases,
                known_race_deaths=known_deaths,
            )
        ]
    def _scrape(self, **kwargs):
        """Scrape four geoservice queries: totals and per-race breakdowns.

        Percentages use known-race denominators and include Hispanic
        Black.
        """
        date, cases_data = query_geoservice(**self.CASES)
        total_cases = cases_data.iloc[0, 0]

        _, race_cases = query_geoservice(**self.CASES_BY_RACE)
        race_cases = race_cases.set_index('Race')
        # Known-race cases: total minus the 'Not Reported' bucket.
        known_cases = total_cases - race_cases.loc['Not Reported', 'value']
        aa_cases = race_cases.loc['African American/Black', 'value']

        _, deaths_data = query_geoservice(**self.DEATHS)
        total_deaths = deaths_data.iloc[0, 0]

        _, race_deaths = query_geoservice(**self.DEATHS_BY_RACE)
        race_deaths = race_deaths.set_index('Race')
        # Known-race deaths: sum every row except 'Not Reported', which
        # may be absent from the breakdown (hence errors='ignore').
        known_deaths = race_deaths.drop('Not Reported',
                                        errors='ignore').sum()['value']
        aa_deaths = race_deaths.loc['African American', 'value']

        return [self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=to_percentage(aa_cases, known_cases),
            pct_aa_deaths=to_percentage(aa_deaths, known_deaths),
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )]
    def _scrape(self, **kwargs):
        """Scrape one geoservice row that carries every needed count.

        Percentages use known-race denominators and include Hispanic
        Black.
        """
        date, data = query_geoservice(**self.DATA)
        _logger.info(f'Processing data for {date}')

        row = data.loc[0]
        total_cases = row['POSITIVE']
        total_deaths = row['DEATHS']
        # Known-race denominators: totals minus unknown-race counts.
        known_cases = total_cases - row['POS_UNK']
        known_deaths = total_deaths - row['DTH_UNK']
        aa_cases = row['POS_BLK']
        aa_deaths = row['DTH_BLK']

        return [
            self._make_series(
                date=date,
                cases=total_cases,
                deaths=total_deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=to_percentage(aa_cases, known_cases),
                pct_aa_deaths=to_percentage(aa_deaths, known_deaths),
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=True,
                known_race_cases=known_cases,
                known_race_deaths=known_deaths,
            )
        ]
Exemple #13
0
    def _scrape(self, **kwargs):
        """Scrape the single demographics record from the geoservice.

        Percentages use known-race denominators and include Hispanic
        Black.
        """
        date, demog = query_geoservice(**self.DEMOG)
        _logger.info(f'Processing data for {date}')

        record = demog.loc[0]
        total_cases = record['positives']
        known_cases = total_cases - record['unk_race']
        aa_cases = record['black']

        total_deaths = record['deaths']
        known_deaths = total_deaths - record['d_unk_race']
        aa_deaths = record['d_black']

        return [
            self._make_series(
                date=date,
                cases=total_cases,
                deaths=total_deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=to_percentage(aa_cases, known_cases),
                pct_aa_deaths=to_percentage(aa_deaths, known_deaths),
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=True,
                known_race_cases=known_cases,
                known_race_deaths=known_deaths,
            )
        ]
    def _scrape(self, **kwargs):
        """Scrape Tableau dashboards for cases and deaths by race.

        Runs three browser sessions: the summary page (for the
        'Last updated' date), then one per Tableau dashboard to capture
        its underlying data request. Percentages use known-race
        denominators ('Not Reported/Missing' excluded).
        """
        runner = WebdriverRunner()

        # Get date
        cases_results = runner.run(
            WebdriverSteps()
            .go_to_url(self.SUMMARY_URL)
            .wait_for_presence_of_elements((By.XPATH, "//span[contains(text(),'Last updated')]"))
            .get_page_source())

        date = self.get_date(cases_results.page_source)

        # Cases for Race
        cases_by_race_results = runner.run(
            WebdriverSteps()
            .go_to_url(self.RACE_CASES_URL)
            .find_request('race_cases', find_by=find_tableau_request))

        assert cases_by_race_results.requests['race_cases'], 'No results for race_cases found'
        resp_body = cases_by_race_results.requests['race_cases'].response.body.decode('utf8')
        cases_for_race_json = TableauParser(resp_body).extract_data_from_key(key='Rates by Race for All Cases')
        cases_df = self.to_df(cases_for_race_json)
        cases = cases_df['Measure Values'].sum()
        # Known-race count: drop the 'Not Reported/Missing' index row.
        known_race_cases = cases_df.drop('Not Reported/Missing')['Measure Values'].sum()
        aa_cases = cases_df.loc['Black or African American', 'Measure Values'].sum()

        # Deaths for Race
        deaths_by_race_results = runner.run(
            WebdriverSteps()
            .go_to_url(self.RACE_DEATHS_URL)
            .find_request('race_deaths', find_by=find_tableau_request))

        assert deaths_by_race_results.requests['race_deaths'], 'No results for race_deaths found'
        resp_body = deaths_by_race_results.requests['race_deaths'].response.body.decode('utf8')
        deaths_for_race_json = TableauParser(resp_body).extract_data_from_key(key='Mortality by Race')
        deaths_df = self.to_df(deaths_for_race_json)
        deaths = deaths_df['Measure Values'].sum()
        known_race_deaths = deaths_df.drop('Not Reported/Missing')['Measure Values'].sum()
        aa_deaths = deaths_df.loc['Black or African American', 'Measure Values'].sum()

        pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
        pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

        return [self._make_series(
            date=date,
            cases=cases,
            deaths=deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=pct_aa_cases,
            pct_aa_deaths=pct_aa_deaths,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_race_cases,
            known_race_deaths=known_race_deaths
        )]
Exemple #15
0
    def _scrape(self, **kwargs):
        """Scrape totals from an embedded-JS JSON blob plus HTML race tables.

        Percentages use known-race denominators ('Under Investigation'
        excluded) and exclude Hispanic Black.
        """
        r = get_cached_url(self.JS_URL)
        # Pull everything assigned to `data = ...` up to the semicolon.
        json_str = re.search(r'data = (([^;]|\n)*)', r.text,
                             re.MULTILINE).group(1).strip()
        # Commas on the last item in a list or object are valid in
        # JavaScript, but not in JSON.  NB: re.sub's fourth positional
        # argument is `count`, not `flags` — re.MULTILINE must be passed
        # by keyword (positionally it silently capped the substitution
        # at 8 replacements).
        json_str = re.sub(r',(\s|\n)*([]}]|$)', r'\2', json_str,
                          flags=re.MULTILINE)
        _logger.debug(f'Extracted JSON: {json_str}')
        data = json.loads(json_str)['content']

        # Find the update date
        month, day, year = map(
            int,
            re.search(r'(\d{2})/(\d{2})/(\d{4})', data['info']).groups())

        date = datetime.date(year, month, day)
        _logger.info(f'Processing data for {date}')

        # Extract the total counts
        total_cases = raw_string_to_int(data['count'])
        total_deaths = raw_string_to_int(data['death'])

        # Fetch the HTML page
        soup = url_to_soup(self.DATA_URL)

        # Extract the Black/AA counts
        cases = self._extract_by_race_table(soup.find(id='race'))
        deaths = self._extract_by_race_table(soup.find(id='race-d'))

        _logger.debug(f'cases: {cases}')
        _logger.debug(f'deaths: {deaths}')

        # Known-race denominators exclude 'Under Investigation' rows.
        known_cases = cases.drop('Under Investigation')['count'].sum()
        known_deaths = deaths.drop('Under Investigation')['count'].sum()

        aa_cases = cases.loc['Black', 'count'].sum()
        aa_deaths = deaths.loc['Black', 'count'].sum()

        aa_cases_pct = to_percentage(aa_cases, known_cases)
        aa_deaths_pct = to_percentage(aa_deaths, known_deaths)

        return [
            self._make_series(
                date=date,
                cases=total_cases,
                deaths=total_deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=aa_cases_pct,
                pct_aa_deaths=aa_deaths_pct,
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=False,
                known_race_cases=known_cases,
                known_race_deaths=known_deaths,
            )
        ]
    def _scrape(self, **kwargs):
        """Scrape two Tableau dashboards for totals and Black counts.

        One browser run captures the cases dashboard's Tableau request,
        clears history, then captures the deaths dashboard's.  Race
        breakdowns are published as fractions and converted to counts.
        Percentages use known-race denominators.
        """
        runner = WebdriverRunner()
        # Waiting for 38 (cases) / 20 (deaths) canvases ensures the
        # dashboards have rendered before their requests are captured.
        results = runner.run(WebdriverSteps().go_to_url(
            self.CASES_URL).wait_for_number_of_elements(
                (By.XPATH, '//canvas'), 38).find_request(
                    'cases', find_by=tableau.find_tableau_request
                ).clear_request_history().go_to_url(
                    self.DEATHS_URL).wait_for_number_of_elements(
                        (By.XPATH, '//canvas'),
                        20).find_request('deaths',
                                         find_by=tableau.find_tableau_request))

        parser = tableau.TableauParser(request=results.requests['cases'])

        # Date string is formatted like '07-17-20' (per strptime below).
        date_str = pydash.head(
            parser.extract_data_from_key('Footer')['AGG(Today)'])
        date = datetime.strptime(date_str, '%m-%d-%y').date()

        # NB: the odd key spellings ('Total  Deaths' with two spaces,
        # 'Race Breakdown ' with a trailing space) match the dashboard's
        # worksheet names exactly — do not "fix" them.
        cases = pydash.head(
            parser.extract_data_from_key('Total Cases')['AGG(Total Cases)'])
        deaths = pydash.head(
            parser.extract_data_from_key('Total  Deaths')
            ['SUM(Count Of Deaths)'])
        # Case breakdown is fractional; scale by total cases to get counts.
        cases_pct_df = pd.DataFrame.from_dict(
            parser.extract_data_from_key('Race Breakdown ')).set_index('Race')
        cases_df = cases_pct_df.assign(Count=[
            round(v * cases) for v in cases_pct_df['CNTD(Caseid 1)'].values
        ])
        aa_cases = cases_df.loc['Black']['Count']
        known_race_cases = cases - cases_df.loc['Unknown']['Count']

        parser = tableau.TableauParser(request=results.requests['deaths'])
        # Death breakdown is also fractional; scale by total deaths.
        deaths_pct_df = pd.DataFrame.from_dict(
            parser.extract_data_from_key('Bar | Race')).set_index('Race')
        deaths_df = deaths_pct_df.assign(Count=[
            round(v * deaths) for v in deaths_pct_df['SUM(Death Count)'].values
        ])
        aa_deaths = deaths_df.loc['Black']['Count']
        known_race_deaths = deaths - deaths_df.loc['Unknown']['Count']

        pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
        pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

        return [
            self._make_series(date=date,
                              cases=cases,
                              deaths=deaths,
                              aa_cases=aa_cases,
                              aa_deaths=aa_deaths,
                              pct_aa_cases=pct_aa_cases,
                              pct_aa_deaths=pct_aa_deaths,
                              pct_includes_unknown_race=False,
                              pct_includes_hispanic_black=True,
                              known_race_cases=known_race_cases,
                              known_race_deaths=known_race_deaths)
        ]
    def _scrape(self, **kwargs):
        """Click through the acceptable-use gate and scrape race tables.

        Percentages use known-race denominators (an 'Unknown' row may be
        absent, in which case it counts as zero) and exclude Hispanic
        Black ('Non-Hispanic Black' rows).
        """
        runner = WebdriverRunner()
        # Accept the terms checkbox, submit, open the count charts, and
        # wait for the race/ethnicity table before grabbing the page.
        results = runner.run(WebdriverSteps(
        ).go_to_url(self.ACCEPTABLE_USE_URL).find_element_by_xpath(
            "//input[@class='form-check-input']"
        ).click_on_last_element_found().find_element_by_xpath(
            '//form/button'
        ).click_on_last_element_found().wait_for_presence_of_elements(
            (By.XPATH, "//a[@data-chart-id='count-charts']")
        ).find_element_by_xpath(
            "//a[@data-chart-id='count-charts']"
        ).click_on_last_element_found(
        ).wait_for_presence_of_elements(
            (By.XPATH,
             "//*[contains(text(), 'Total Cases by Race/Ethnicity & County')]"
             )).get_page_source())
        soup = results.page_source

        date = self.get_last_updated_date(soup)
        _logger.info(f'Processing data for {date}')

        cases = self.get_total_cases(soup)
        deaths = self.get_total_deaths(soup)

        cases_df = self.get_race_cases_df(soup)
        aa_cases = cases_df.loc['Non-Hispanic Black']['State of Delaware']
        try:
            unknown_race_cases = cases_df.loc['Unknown']['State of Delaware']
        except KeyError:
            unknown_race_cases = 0
        known_race_cases = cases - unknown_race_cases

        deaths_df = self.get_race_deaths_df(soup)
        aa_deaths = deaths_df.loc['Non-Hispanic Black']['State of Delaware']
        try:
            unknown_race_deaths = deaths_df.loc['Unknown']['State of Delaware']
        except KeyError:
            unknown_race_deaths = 0
        known_race_deaths = deaths - unknown_race_deaths

        pct_aa_cases = to_percentage(aa_cases, known_race_cases)
        pct_aa_deaths = to_percentage(aa_deaths, known_race_deaths)

        return [
            # Reuse the `date` computed above rather than re-parsing the
            # page (the original called get_last_updated_date twice and
            # left `date` unused).
            self._make_series(date=date,
                              cases=cases,
                              deaths=deaths,
                              aa_cases=aa_cases,
                              aa_deaths=aa_deaths,
                              pct_aa_cases=pct_aa_cases,
                              pct_aa_deaths=pct_aa_deaths,
                              pct_includes_unknown_race=False,
                              pct_includes_hispanic_black=False,
                              known_race_cases=known_race_cases,
                              known_race_deaths=known_race_deaths)
        ]
    def _scrape(self, *, validation=False, **kwargs):
        """Scrape race/ethnicity counts from the published raw-data zip.

        Args:
            validation: when True, pin the data date to 2020-04-09 so
                results can be compared against D4BL's last refresh
                instead of the latest published date.

        Percentages are against all cases/deaths (unknown race included)
        and exclude Hispanic Black.
        """
        urls = find_all_links(url=self.REPORTING_URL,
                              search_string='covid-19-raw-data')
        _logger.debug(f'Fetching links from {urls}')

        # First matching link; its third path segment identifies the
        # current download (assumed stable format — TODO confirm).
        url_fragment = urls[0].split('/')[2]
        url = self.DOWNLOAD_URL_TEMPLATE.format(url_fragment)
        _logger.debug(f'Current COVID-19 data: {url}')

        # Cumulative number of cases / deaths
        ma_zip = get_zip(url)

        _logger.debug('Get the race/ethnicity breakdown')
        df_raw = pd.read_csv(
            get_zip_member_as_file(ma_zip, 'RaceEthnicity.csv'),
            parse_dates=['Date']
        )

        _logger.debug('Get date of most recent data published')
        # If desired (validation = True), verify that calculations as
        # of D4BL's last refresh match these calculations.
        if validation is True:
            max_date = datetime.date(2020, 4, 9)
        else:
            max_date = max(df_raw.Date)

        _logger.info(f'Processing data for {max_date}')
        # Restrict to the single reporting date selected above.
        df_mass = df_raw[df_raw.Date == max_date]

        # Intermediate calculations

        total_cases = df_mass['All Cases'].sum()
        total_deaths = df_mass['Deaths'].sum()
        aa_cases = df_mass[
            df_mass['Race/Ethnicity']
            == 'Non-Hispanic Black/African American'
        ]['All Cases'].tolist()[0]
        aa_cases_pct = to_percentage(aa_cases, total_cases)
        aa_deaths = df_mass[
            df_mass['Race/Ethnicity']
            == 'Non-Hispanic Black/African American'
        ]['Deaths'].tolist()[0]
        aa_deaths_pct = to_percentage(aa_deaths, total_deaths)
        return [self._make_series(
            date=max_date.date(),
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases,
            aa_deaths=aa_deaths,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )]
Exemple #19
0
def test_alabama(patched_geoservice):
    """Exercise the Alabama scraper against mocked geoservice responses."""
    # Positions of the Black/AA and Unknown rows in the mock breakdowns.
    AA_IDX = 1
    UNKNOWN_IDX = 3

    cases = {
        'Racecat': ['Asian', 'Black', 'Other', 'Unknown', 'White'],
        'value': [200, 19000, 3000, 18000, 20000]
    }
    deaths = {
        'Racecat': ['Asian', 'Black', 'Other', 'Unknown', 'White'],
        'value': [10, 500, 25, 50, 600]
    }

    total_cases = sum(cases['value'])
    total_deaths = sum(deaths['value'])
    known_cases_by_race = total_cases - cases['value'][UNKNOWN_IDX]
    known_deaths_by_race = total_deaths - deaths['value'][UNKNOWN_IDX]
    aa_cases = cases['value'][AA_IDX]
    aa_deaths = deaths['value'][AA_IDX]

    # Each query_geoservice call consumes one mocked response:
    # cases first, then deaths.
    patched_geoservice.side_effect = [
        util.make_query_geoservice_data(data=cases),
        util.make_query_geoservice_data(data=deaths)
    ]

    # run and test
    util.run_scraper_and_assert(
        scraper_cls=Alabama,
        assertions={
            'Date Published': date.today(),
            'Total Cases': total_cases,
            'Total Deaths': total_deaths,
            'Count Cases Black/AA': aa_cases,
            'Count Deaths Black/AA': aa_deaths,
            'Pct Includes Unknown Race': False,
            'Pct Includes Hispanic Black': False,
            'Pct Cases Black/AA': to_percentage(aa_cases,
                                                known_cases_by_race),
            'Pct Deaths Black/AA': to_percentage(aa_deaths,
                                                 known_deaths_by_race),
            'Count Cases Known Race': known_cases_by_race,
            'Count Deaths Known Race': known_deaths_by_race
        })
Exemple #20
0
    def _scrape(self, **kwargs):
        """Extract the report date, totals, and Black/AA counts from the
        state's HTML reporting page.

        Raises ValueError when the race/ethnicity table or the Black/AA
        row cannot be located.
        """
        soup = url_to_soup(self.REPORTING_URL)

        # The publication date appears in a <strong> tag such as
        # "Updated July 1, 2020".
        updated_el = soup.find('strong', string=re.compile('Updated '))
        date_match = re.search(r'[A-Z][a-z][a-z]+ \d(\d)?, 20\d\d',
                               updated_el.text)
        date_obj = datetime.datetime.strptime(date_match.group(),
                                              '%B %d, %Y').date()

        # Totals each follow their own <strong> label.
        cases_el = soup.find('strong',
                             string=re.compile(r'Total positive( cases)?:'))
        num_cases = raw_string_to_int(str(cases_el.next_sibling))
        deaths_el = soup.find('strong',
                              string=re.compile('(Total )?[Dd]eaths:'))
        num_deaths = raw_string_to_int(deaths_el.next_sibling)

        _logger.info(f'Processing data for {date_obj}')
        _logger.debug(f'Number Cases: {num_cases}')
        _logger.debug(f'Number Deaths: {num_deaths}')

        # Black/AA counts live in the race/ethnicity table: the row whose
        # header mentions "Black", first two <td> cells (cases, deaths).
        table = soup.find('table', attrs={'id': 'raceethtable'})
        if not table:
            raise ValueError('Unable to locate race/ethnicity table')
        header = table.find(
            lambda elt: elt.name == 'th' and elt.text.find('Black') >= 0)
        if not header:
            raise ValueError('Unable to locate Black/AA data row')
        cells = header.find_next_siblings('td')
        cnt_aa_cases = raw_string_to_int(cells[0].text)
        cnt_aa_deaths = raw_string_to_int(cells[1].text)

        _logger.debug(f'Number Black/AA Cases: {cnt_aa_cases}')
        _logger.debug(f'Number Black/AA Deaths: {cnt_aa_deaths}')

        return [
            self._make_series(
                date=date_obj,
                cases=num_cases,
                deaths=num_deaths,
                aa_cases=cnt_aa_cases,
                aa_deaths=cnt_aa_deaths,
                pct_aa_cases=to_percentage(cnt_aa_cases, num_cases),
                pct_aa_deaths=to_percentage(cnt_aa_deaths, num_deaths),
                pct_includes_unknown_race=True,
                pct_includes_hispanic_black=True,
            )
        ]
Exemple #21
0
    def _scrape(self, **kwargs):
        """Scrape Tableau-backed demographic dashboards via a webdriver.

        Loads the home page (for the publication date), then the cases and
        deaths dashboards, capturing the Tableau data request behind each.
        """
        runner = WebdriverRunner()
        # Request history is cleared between the two dashboards so the
        # 'deaths' lookup cannot match the earlier cases request.
        results = runner.run(WebdriverSteps().go_to_url(
            self.HOME_PAGE_URL).get_page_source().go_to_url(
                self.DEMOGRAPHIC_CASES_URL).find_request(
                    key='cases', find_by=tableau.find_tableau_request).
                             clear_request_history().go_to_url(
                                 self.DEMOGRAPHIC_DEATHS_URL).find_request(
                                     'deaths',
                                     find_by=tableau.find_tableau_request))

        # The publication date appears on the home page in a <strong> tag
        # whose text contains 'current', formatted m/d/yyyy.
        date_str_element = results.page_source.find(
            'strong', string=re.compile('current'))
        assert date_str_element, 'No date element found'
        date_str = date_str_element.get_text()
        pattern = re.compile(r'(\d{1,2}\/\d{1,2}\/\d{4})')
        matches = pattern.search(date_str)
        assert matches, 'Date not found.'
        date = datetime.strptime(matches.group(), '%m/%d/%Y').date()

        # Cases: the Black/AA count is the row's 'SUM(Count)' fraction
        # times 'cases'.
        # NOTE(review): 'cases' is read from the Black/AA row's
        # 'AGG(Calculation1)' column -- presumably that column repeats the
        # statewide total on every row; verify against the live workbook.
        parser = tableau.TableauParser(request=results.requests['cases'])
        cases_df = parser.get_dataframe_from_key('CaseRace').set_index(
            'Measure Status')
        cases = cases_df.loc['Black or African American']['AGG(Calculation1)']
        aa_cases = round(
            cases_df.loc['Black or African American']['SUM(Count)'] * cases)

        # Deaths: statewide total from the 'Total Deaths (2)' sheet; the
        # 'Race' sheet gives per-race fractions of that total.
        parser = tableau.TableauParser(request=results.requests['deaths'])
        deaths = parser.get_dataframe_from_key(
            'Total Deaths (2)')['SUM(Deaths)'].sum()
        deaths_df = parser.get_dataframe_from_key('Race').set_index(
            'Measure Status11')
        aa_deaths = round(
            deaths_df.loc['Black or African American']['SUM(Deaths)'] * deaths)

        # Percentages use the full totals as denominators.
        pct_aa_cases = to_percentage(aa_cases, cases)
        pct_aa_deaths = to_percentage(aa_deaths, deaths)

        return [
            self._make_series(
                date=date,
                cases=cases,
                deaths=deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=pct_aa_cases,
                pct_aa_deaths=pct_aa_deaths,
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=True,
            )
        ]
    def _scrape(self, refresh=False, **kwargs):
        """Scrape totals and NH Black counts from the state's JSON feeds.

        The asserts fail loudly when either endpoint returns nothing or a
        key is missing (dict.get then yields None).
        """
        _logger.debug('Get case totals data')
        totals_json = get_json(self.CASE_DATA_URL)
        assert totals_json, 'Error finding total cases and deaths'

        # The feed is ordered most-recent-first.
        latest = totals_json[0]
        report_date = datetime.strptime(latest.get('date'),
                                        '%Y-%m-%dT%H:%M:%S.%f').date()
        total_cases = latest.get('confirmedcases')
        total_deaths = latest.get('confirmeddeaths')

        assert total_cases, 'Error finding total cases'
        assert total_deaths, 'Error finding total deaths'

        # The endpoint reports counts as strings.
        total_cases = int(total_cases)
        total_deaths = int(total_deaths)

        _logger.debug('Get race data')
        race_json = get_json(self.RACE_DATA_URL)
        assert race_json, 'Error getting race cases and deaths json'

        nh_black = pydash.find(
            race_json, lambda entry: entry['hisp_race'] == 'NH Black')
        assert nh_black, 'Error finding total NH Black entry'
        aa_cases = nh_black.get('case_tot')
        aa_deaths = nh_black.get('deaths')

        assert aa_cases, 'Error finding total NH Black cases'
        assert aa_deaths, 'Error finding total NH Black deaths'

        # Again, counts arrive as strings.
        aa_cases = int(aa_cases)
        aa_deaths = int(aa_deaths)

        return [
            self._make_series(date=report_date,
                              cases=total_cases,
                              deaths=total_deaths,
                              aa_cases=aa_cases,
                              aa_deaths=aa_deaths,
                              pct_aa_cases=to_percentage(aa_cases,
                                                         total_cases),
                              pct_aa_deaths=to_percentage(aa_deaths,
                                                          total_deaths),
                              pct_includes_unknown_race=True,
                              pct_includes_hispanic_black=False)
        ]
    def _scrape(self, **kwargs):
        """Scrape cases and deaths from two Tableau dashboards.

        Percentages are computed over known-race denominators (the
        'Unknown' rows are subtracted out), hence
        pct_includes_unknown_race=False below.
        """
        runner = WebdriverRunner()
        # Capture the Tableau data request behind each dashboard; the
        # request history is cleared in between so 'deaths' matches only
        # the second dashboard's request.
        results = runner.run(WebdriverSteps().go_to_url(
            self.CASES_URL).find_request('cases',
                                         find_by=tableau.find_tableau_request).
                             clear_request_history().go_to_url(
                                 self.DEATHS_URL).find_request(
                                     'deaths',
                                     find_by=tableau.find_tableau_request))

        # Cases dashboard: publication date, statewide total, and the
        # per-race breakdown keyed by 'subcategory'.
        parser = tableau.TableauParser(request=results.requests['cases'])
        raw_date_str = pydash.head(
            parser.extract_data_from_key('cases')['ATTR(dateupdated)'])
        date = datetime.strptime(raw_date_str, '%m/%d/%Y').date()

        cases = pydash.head(
            parser.extract_data_from_key('cases')
            ['SUM(Laboratory Confirmed Cases)'])
        cases_df = pd.DataFrame.from_dict(
            parser.extract_data_from_key('raceth')).set_index('subcategory')
        aa_cases = cases_df.loc['Black']['SUM(count)']
        known_race_cases = cases - cases_df.loc['Unknown']['SUM(count)']

        # Deaths dashboard: same structure under different sheet keys.
        parser = tableau.TableauParser(request=results.requests['deaths'])
        deaths = pydash.head(
            parser.extract_data_from_key('death (2)')['SUM(Deaths)'])
        deaths_df = pd.DataFrame.from_dict(
            parser.extract_data_from_key('raceth (death)')).set_index(
                'subcategory')
        aa_deaths = deaths_df.loc['Black']['SUM(count)']
        known_race_deaths = deaths - deaths_df.loc['Unknown']['SUM(count)']

        pct_aa_cases = misc.to_percentage(aa_cases, known_race_cases)
        pct_aa_deaths = misc.to_percentage(aa_deaths, known_race_deaths)

        return [
            self._make_series(
                date=date,
                cases=cases,
                deaths=deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=pct_aa_cases,
                pct_aa_deaths=pct_aa_deaths,
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=False,
                known_race_cases=known_race_cases,
                known_race_deaths=known_race_deaths,
            )
        ]
Exemple #24
0
    def _scrape(self, **kwargs):
        """Scrape the Demographics sheet of the state's Excel workbook."""
        data = pd.read_excel(self.DATA_URL, sheet_name='Demographics')

        # The first column header embeds the publication date as m/d/yyyy.
        month, day, year = (
            int(g) for g in
            re.search(r'(\d+)/(\d+)/(\d\d\d\d)', data.columns[0]).groups())
        date = datetime.date(year, month, day)
        _logger.info(f'Processing data for {date}')

        # Row 0 holds "N=<total>" strings for cases and deaths.
        total_cases = int(re.match(r'N=(\d+)', data.loc[0, 'Cases']).group(1))
        total_deaths = int(
            re.match(r'N=(\d+)', data.loc[0, 'Deaths']).group(1))

        # Re-index by race label and give the unnamed percentage columns
        # readable names.
        data = data.set_index(data.columns[0]).rename(columns={
            'Unnamed: 2': '% Cases',
            'Unnamed: 4': '% Hosp',
            'Unnamed: 6': '% Deaths',
        })
        for idx in data.index:
            label = str(idx)
            if label.startswith('Black'):
                aa_cases = int(data.loc[idx, 'Cases'])
                aa_deaths = int(data.loc[idx, 'Deaths'])
            elif label.startswith('Unknown'):
                known_cases = total_cases - int(data.loc[idx, 'Cases'])
                known_deaths = total_deaths - int(data.loc[idx, 'Deaths'])

        # Recompute the percentages; the published ones are excessively
        # rounded.
        return [
            self._make_series(
                date=date,
                cases=total_cases,
                deaths=total_deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=to_percentage(aa_cases, known_cases),
                pct_aa_deaths=to_percentage(aa_deaths, known_deaths),
                pct_includes_unknown_race=False,
                pct_includes_hispanic_black=False,
                known_race_cases=known_cases,
                known_race_deaths=known_deaths,
            )
        ]
    def _scrape(self, **kwargs):
        """Scrape the statewide demographics workbook linked from the
        reporting page."""
        # Find latest report
        soup = url_to_soup(self.REPORTING_URL)
        by_dem_path = soup.find(
            'a', text='Cases by Demographics Statewide')['href']

        # The link path embeds the report date as YYYY-MM-DD.
        year, month, day = (
            int(part) for part in
            re.search(r'(\d{4})-(\d{2})-(\d{2})', by_dem_path).groups())
        date_published = datetime.date(year, month, day)
        _logger.info(f'Processing data for {date_published}')

        # Load the data
        by_dem = pd.read_excel(urljoin(self.REPORTING_URL, by_dem_path))

        # Keep confirmed cases only; "Suppressed" counts are treated as 0.
        by_dem = by_dem[by_dem['CASE_STATUS'] == 'Confirmed']
        for col in ('Cases', 'Deaths'):
            by_dem[col] = by_dem[col].str.replace('Suppressed',
                                                  '0').astype(int)
        by_race = by_dem[['RaceCat', 'Cases',
                          'Deaths']].groupby('RaceCat').sum()

        totals = by_race.sum(axis=0)
        total_cases = totals['Cases']
        total_deaths = totals['Deaths']
        aa_cases = by_race.loc['Black/African American', 'Cases']
        aa_deaths = by_race.loc['Black/African American', 'Deaths']

        return [
            self._make_series(
                date=date_published,
                cases=total_cases,
                deaths=total_deaths,
                aa_cases=aa_cases,
                aa_deaths=aa_deaths,
                pct_aa_cases=to_percentage(aa_cases, total_cases),
                pct_aa_deaths=to_percentage(aa_deaths, total_deaths),
                pct_includes_unknown_race=True,
                pct_includes_hispanic_black=False,
            )
        ]
        def _scrape(self, **kwargs):
            """Pull the demographic breakdown from the geoservice query."""
            date, data = query_geoservice(**self.DEMOG)
            _logger.info(f'Processing data for {date}')

            # All values come from the single row the query returns.
            row = data.loc[0]
            total_cases = row['CasesAll']
            known_cases = total_cases - row['C_RaceUnknown']
            aa_cases = row['C_RaceBlack']

            total_deaths = row['Deaths']
            # Does not include demographic breakdown of deaths
            known_deaths = nan
            aa_deaths = nan

            return [
                self._make_series(
                    date=date,
                    cases=total_cases,
                    deaths=total_deaths,
                    aa_cases=aa_cases,
                    aa_deaths=aa_deaths,
                    pct_aa_cases=to_percentage(aa_cases, known_cases),
                    pct_aa_deaths=nan,
                    pct_includes_unknown_race=False,
                    pct_includes_hispanic_black=True,
                    known_race_cases=known_cases,
                    known_race_deaths=known_deaths,
                )
            ]
Exemple #27
0
def test_wisconsin():
    """Check the Wisconsin scraper output against the mocked totals."""
    total_cases, unknown_cases = 40000, 3800
    total_deaths, unknown_deaths = 800, 10
    aa_cases, aa_deaths = 6600, 200
    known_cases = total_cases - unknown_cases
    known_deaths = total_deaths - unknown_deaths

    util.run_scraper_and_assert(
        scraper_cls=Wisconsin,
        assertions={
            'Date Published': date.today(),
            'Total Cases': total_cases,
            'Total Deaths': total_deaths,
            'Count Cases Black/AA': aa_cases,
            'Count Deaths Black/AA': aa_deaths,
            'Pct Includes Unknown Race': False,
            'Pct Includes Hispanic Black': True,
            'Pct Cases Black/AA': to_percentage(aa_cases, known_cases),
            'Pct Deaths Black/AA': to_percentage(aa_deaths, known_deaths),
            'Count Cases Known Race': known_cases,
            'Count Deaths Known Race': known_deaths
        })
Exemple #28
0
def test_arkansas():
    """Check the Arkansas scraper output against the mocked totals."""
    total_cases, known_cases = 31000, 27000
    total_deaths, known_deaths = 300, 297
    aa_cases, aa_deaths = 6000, 90

    util.run_scraper_and_assert(
        scraper_cls=Arkansas,
        assertions={
            'Date Published': date.today(),
            'Total Cases': total_cases,
            'Total Deaths': total_deaths,
            'Count Cases Black/AA': aa_cases,
            'Count Deaths Black/AA': aa_deaths,
            'Pct Includes Unknown Race': False,
            'Pct Includes Hispanic Black': True,
            'Pct Cases Black/AA': to_percentage(aa_cases, known_cases),
            'Pct Deaths Black/AA': to_percentage(aa_deaths, known_deaths),
            'Count Cases Known Race': known_cases,
            'Count Deaths Known Race': known_deaths
        })
Exemple #29
0
def test_alaska():
    """Check the Alaska scraper output against the mocked totals."""
    total_cases, known_cases = 1733, 1171
    total_deaths, known_deaths = 17, 17
    aa_cases, aa_deaths = 41, 0

    util.run_scraper_and_assert(
        scraper_cls=Alaska,
        assertions={
            'Date Published': date.today(),
            'Total Cases': total_cases,
            'Total Deaths': total_deaths,
            'Count Cases Black/AA': aa_cases,
            'Count Deaths Black/AA': aa_deaths,
            'Pct Includes Unknown Race': False,
            'Pct Includes Hispanic Black': True,
            'Pct Cases Black/AA': to_percentage(aa_cases, known_cases),
            'Pct Deaths Black/AA': to_percentage(aa_deaths, known_deaths),
            'Count Cases Known Race': known_cases,
            'Count Deaths Known Race': known_deaths
        })
    def _scrape(self, **kwargs):
        """Combine the county-level and race-level CSV feeds into a series."""
        # Deaths are only published as per-county counts; sum them.
        counties = pd.read_csv(self.CASES_BY_COUNTY_URL)
        total_deaths = counties['DEATHS'].sum()

        race_table = pd.read_csv(
            get_content_as_file(self.CASES_BY_RACE_URL),
            index_col=0,
            parse_dates=['DATA_REFRESH_DT'])

        date = race_table['DATA_REFRESH_DT'][0].date()
        _logger.info(f'Processing data for {date}')

        total_cases = race_table['CASES'].sum()
        # Exclude the 'Not disclosed' row from the known-race denominator.
        known_cases = race_table['CASES'].drop('Not disclosed').sum()
        aa_cases_cnt = race_table.loc['Black or African American', 'CASES']

        # No race breakdowns for deaths
        aa_deaths_cnt = float('nan')
        aa_deaths_pct = float('nan')
        known_deaths = float('nan')

        return [self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases_cnt,
            aa_deaths=aa_deaths_cnt,
            pct_aa_cases=to_percentage(aa_cases_cnt, known_cases),
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=True,
            known_race_cases=known_cases,
            known_race_deaths=known_deaths,
        )]