def _get_values(self, select_value, data_row):
    """Pull the values named in select_value out of data_row.

    If any requested key is missing (or falsy), fall back to the row's
    'C' and 'R' entries, concatenated in that order.
    """
    row = []
    for sv in select_value:
        if sv and sv in data_row:
            row.append(data_row[sv])
        else:
            return (as_list(data_row.get('C', []))
                    + as_list(data_row.get('R', [])))
    return row
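
# Illustrative only: a minimal sketch of the row shapes this helper is
# assumed to handle, assuming as_list passes lists through unchanged (the
# select names and values below are hypothetical test data):
#
#   self._get_values(['cases', 'deaths'], {'cases': 10, 'deaths': 2})
#   # -> [10, 2]
#   self._get_values(['cases', 'deaths'], {'C': [10, 2]})
#   # -> [10, 2]  (falls back to the 'C'/'R' entries)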
def wait_for_conditions_on_webdriver(driver, conditions, timeout):
    """Wait until every condition holds, giving each up to timeout seconds."""
    try:
        for c in as_list(conditions):
            WebDriverWait(driver, timeout).until(c)
    except TimeoutException:
        _logger.error('Waiting timed out in %s seconds', timeout)
        raise
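
# Illustrative usage (the locators are hypothetical): block until a results
# table is present and a loading spinner has gone away, giving each
# condition up to 30 seconds.
#
#   from selenium.webdriver.common.by import By
#   from selenium.webdriver.support import expected_conditions as EC
#
#   wait_for_conditions_on_webdriver(driver, [
#       EC.presence_of_element_located((By.CSS_SELECTOR, 'div.results')),
#       EC.invisibility_of_element_located((By.CSS_SELECTOR, '.spinner')),
#   ], timeout=30)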
def _serarch_by_selects(selects, body_json):
    """Return True if every name in selects appears in the query's Select list."""
    if not selects:
        return True
    queries = body_json.get('queries', [])
    resp_selects = pydash.flat_map(
        queries,
        lambda q: pydash.get(
            q,
            'Query.Commands.0.SemanticQueryDataShapeCommand.Query.Select'))
    return not (set(as_list(selects)) - {s.get('Name') for s in resp_selects})
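
# Illustrative only: the query-body shape this check assumes, matching the
# 'SemanticQueryDataShapeCommand' path used above (the select names are
# hypothetical):
#
#   body_json = {'queries': [{'Query': {'Commands': [
#       {'SemanticQueryDataShapeCommand': {'Query': {'Select': [
#           {'Name': 'cases'}, {'Name': 'deaths'}]}}}]}}]}
#   _serarch_by_selects(['cases'], body_json)             # -> True
#   _serarch_by_selects(['hospitalizations'], body_json)  # -> False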
def __init__(self, element_locators, condition=Condition.PRESENCE,
             number_of_elements=None, timeout=60):
    if condition not in Condition:
        raise ExecutionStepException(
            'Invalid condition, check the `Condition` enum for valid '
            'conditions')
    self.locators = as_list(element_locators)
    self.condition = condition
    self.timeout = timeout
    self.number_of_elements = number_of_elements
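
# Illustrative usage, assuming this is the constructor of a wait step in
# this module (the class name and locator below are hypothetical):
#
#   step = WaitForElementsStep(
#       [(By.CSS_SELECTOR, 'table.data')],
#       condition=Condition.PRESENCE,
#       timeout=30)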
def _scrape(self, refresh=False, **kwargs):
    """Set refresh to true to ignore the cache.  If false, we will
    still use conditional GET to invalidate cached data.
    """
    _logger.debug('Find daily Florida URL')
    daily_url = get_daily_url(self.REPORTING_URL)
    _logger.debug(f'URL is {daily_url}')
    report_date = get_report_date(daily_url)
    _logger.info(f'Processing data for {report_date}')

    _logger.debug('Download the daily Florida URL')
    pdf_data = get_content(daily_url, force_remote=refresh)

    _logger.debug('Find the table area coordinates')
    table_bbox = get_table_area(pdf_data)
    table_area = (table_bbox.y0, table_bbox.x0, table_bbox.y1, table_bbox.x1)

    _logger.debug('Parse the PDF')
    table = as_list(read_pdf(
        BytesIO(pdf_data),
        pages='3',
        stream=True,
        multiple_tables=False,
        area=table_area,
        pandas_options=dict(
            header=None,
            names=COLUMN_NAMES,
            converters=CONVERTERS)))[0]

    _logger.debug('Set the race/ethnicity indices')
    races = ('White', 'Black', 'Other', 'Unknown race', 'Total')
    for idx, row in table.iterrows():
        if row['Race/ethnicity'] in races:
            race = row['Race/ethnicity']
            ethnicity = 'All ethnicities'
        else:
            ethnicity = row['Race/ethnicity']
        table.loc[idx, 'Race'] = race
        table.loc[idx, 'Ethnicity'] = ethnicity
    table = table.drop('Race/ethnicity', axis=1)
    table = table.set_index(['Race', 'Ethnicity'])

    _logger.debug('Fill NAs with 1')
    table.loc[('Total', 'All ethnicities')] = table.loc[
        ('Total', 'All ethnicities')].fillna(1)

    att_names = ['Cases', 'Deaths']
    all_cases_and_deaths = {
        nm: int(table.query(
            "Race == 'Total' and Ethnicity == 'All ethnicities'"
        )[nm].to_list()[0])
        for nm in att_names
    }
    aa_cases_and_deaths = {
        nm: int(table.query(
            "Race == 'Black' and Ethnicity == 'Non-Hispanic'"
        )[nm].to_list()[0])
        for nm in att_names
    }
    aa_cases_and_deaths_pct = {
        nm: round(100 * aa_cases_and_deaths[nm] / all_cases_and_deaths[nm], 2)
        for nm in att_names
    }

    return [
        self._make_series(
            date=report_date,
            cases=all_cases_and_deaths['Cases'],
            deaths=all_cases_and_deaths['Deaths'],
            aa_cases=aa_cases_and_deaths['Cases'],
            aa_deaths=aa_cases_and_deaths['Deaths'],
            pct_aa_cases=aa_cases_and_deaths_pct['Cases'],
            pct_aa_deaths=aa_cases_and_deaths_pct['Deaths'],
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )
    ]
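
# Note on the table_area ordering above: tabula-py's `area` option takes
# (top, left, bottom, right) in PDF points, which is why the bbox is
# reordered as (y0, x0, y1, x1). A hypothetical bbox illustrates the
# mapping:
#
#   bbox = SimpleNamespace(x0=36.0, y0=110.5, x1=560.2, y1=720.0)
#   table_area = (bbox.y0, bbox.x0, bbox.y1, bbox.x1)
#   # -> (110.5, 36.0, 720.0, 560.2)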
def _scrape(self, **kwargs):
    # Download the files
    download_file(self.CASES_URL, 'cases.pdf')
    download_file(self.DEATHS_URL, 'deaths.pdf')

    # Extract the date
    pdf = fitz.Document(filename='cases.pdf', filetype='pdf')
    date = None
    for (x0, y0, x1, y1, block,
         block_no, block_type) in pdf[0].getText('blocks'):
        match = re.search(r'updated +(\d\d?)/(\d\d?)/(\d{4})', block)
        if match:
            month, day, year = map(int, match.groups())
            date = datetime.date(year, month, day)
            break
    if not date:
        raise ValueError('Unable to find date in cases PDF')
    _logger.info(f'Processing data for {date}')

    _logger.debug('Loading cases')
    cases_raw = as_list(read_pdf('cases.pdf', pages=1))[0]
    # Scan the rows to find where the header ends.
    for idx in cases_raw.index:
        if cases_raw.iloc[idx, 0] == 'Race and Ethnicity':
            cases = cases_raw.iloc[idx + 1:].copy()
            cases.columns = cases_raw.iloc[idx]
            break

    # Format the cases and calculate/extract data.
    cases['Count'] = cases['Count'].str.replace(',', '').astype(int)
    cases = cases.set_index('Race and Ethnicity')
    total_cases = cases['Count'].sum()
    total_known_cases = cases['Count'].drop(
        'Race/Ethnicity Other/Unknown').sum()
    cases['Percent'] = to_percentage(cases['Count'], total_known_cases)
    aa_cases_cnt = cases.loc['Black or African American', 'Count']
    aa_cases_pct = cases.loc['Black or African American', 'Percent']
    _logger.debug(f'Total cases: {total_cases}')
    _logger.debug(f'Total cases with known race: {total_known_cases}')
    _logger.debug(f'Total AA cases: {aa_cases_cnt}')
    _logger.debug(f'Pct AA cases: {aa_cases_pct}')

    _logger.debug('Loading deaths')
    deaths_raw = as_list(read_pdf('deaths.pdf', pages=1))[0]
    # Scan the rows to find where the header ends.
    for idx in deaths_raw.index:
        if deaths_raw.iloc[idx, 0] == 'Total Deaths':
            # Pick out the total deaths en passant
            total_deaths = self.check_cvt(deaths_raw.iloc[idx, 1])
        elif deaths_raw.iloc[idx, 0] == 'Race/Ethnicity':
            deaths = deaths_raw.iloc[idx + 1:]
            # The table is read with two columns, and centering makes
            # some entries in the left column get included in the right
            # instead.  dropna removes these.
            deaths = deaths.dropna().copy()
            deaths.columns = ['Race/Ethnicity', 'Count']
            break
    deaths = deaths.set_index('Race/Ethnicity')
    deaths['Count'] = deaths['Count'].apply(self.check_cvt)

    # Some reports have a discrepancy between the sum of known
    # race/ethnicity counts and the total reported ex-unknown count.
    # SD appears to use the latter, so we do the same.
    total_known_deaths = (
        total_deaths
        - deaths.loc['Race/Ethnicity Other/Unknown', 'Count'])
    deaths['Percent'] = to_percentage(deaths['Count'], total_known_deaths)
    aa_deaths_cnt = deaths.loc['Black or African American', 'Count']
    aa_deaths_pct = deaths.loc['Black or African American', 'Percent']
    _logger.debug(f'Total deaths: {total_deaths}')
    _logger.debug(f'Total deaths with known race: {total_known_deaths}')
    _logger.debug(f'Total AA deaths: {aa_deaths_cnt}')
    _logger.debug(f'Pct AA deaths: {aa_deaths_pct}')

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases_cnt,
            aa_deaths=aa_deaths_cnt,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=total_known_cases,
            known_race_deaths=total_known_deaths,
        )
    ]
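
# `to_percentage` is imported from elsewhere in this repo; a minimal sketch
# of the behavior assumed above (the rounding precision is an assumption):
#
#   def to_percentage(count, total, ndigits=2):
#       return round(100 * count / total, ndigits)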
def _scrape(self, **kwargs):
    # Download the report
    download_file(self.REPORT_URL, 'report.pdf')

    # Extract the date
    doc = fitz.Document(filename='report.pdf', filetype='pdf')
    date = None
    for (x0, y0, x1, y1, word,
         block_no, line_no, word_no) in doc[0].getText('words'):
        match = re.match(r'(\d+)/(\d+)/(\d+)', word)
        if match:
            month, day, year = map(int, match.groups())
            date = datetime.date(year, month, day)
            break
    if not date:
        raise ValueError('Unable to find date in report PDF')
    _logger.info(f'Processing data for {date}')

    # Extract totals data
    totals_list = as_list(read_pdf(
        'report.pdf',
        multiple_tables=True,
        pages=1,
        lattice=True,
        pandas_options={'header': None}))
    _logger.debug(f'First table is {totals_list[0]}')
    totals = totals_list[0]
    totals[0] = (totals[0]
                 .str.replace('*', '', regex=False)
                 .str.replace('\r', ' ', regex=False))
    totals.set_index(0, inplace=True)
    total_cases = raw_string_to_int(totals.loc['Total Cases', 1])
    total_deaths = raw_string_to_int(totals.loc['Total Deaths', 1])

    # Clean demographic data tables and extract data
    raw_tables = as_list(read_pdf(
        'report.pdf',
        lattice=True,
        multiple_tables=True,
        pages=[2],
        pandas_options={'header': None}))
    seen = set()
    _logger.debug(f'got {len(raw_tables)} tables:')
    for idx, table in enumerate(raw_tables):
        _logger.debug(f'table #{idx + 1}: {table}')
        if len(table) == 0:
            continue
        table.iloc[:, 0] = (table.iloc[:, 0]
                            .str.replace('*', '', regex=False)
                            .str.replace('\r', ' ', regex=False))
        race_label = table.iloc[:, 0].str.contains(
            'Where Race Known').fillna(False)
        if race_label.any():
            splits = table[race_label].index.values.tolist() + [-1]
            for header, end in zip(splits[:-1], splits[1:]):
                # Stash the table name
                title = str(table.iloc[header, 0])
                # Set up the table
                tbl = table.iloc[header + 1:end].copy()
                tbl.columns = ['race', 'value']
                tbl.set_index('race', inplace=True)
                tbl.loc[:, 'value'] = tbl.loc[:, 'value'].str.extract(
                    PCT_RE).astype(float)
                # Find the Black/AA label (this has varied from report
                # to report)
                black_label = _get_black_label(tbl)
                # Extract the data
                if title.find('Cases') >= 0 and 'cases' not in seen:
                    (known_case_pct, known_cases,
                     aa_cases_pct, aa_cases) = _extract_demographic_data(
                         tbl, title, total_cases, black_label)
                    seen.add('cases')
                elif title.find('Deaths') >= 0 and 'deaths' not in seen:
                    (known_death_pct, known_deaths,
                     aa_deaths_pct, aa_deaths) = _extract_demographic_data(
                         tbl, title, total_deaths, black_label)
                    seen.add('deaths')

    assert 'cases' in seen, 'Did not find Cases by Race table'
    assert 'deaths' in seen, 'Did not find Deaths by Race table'

    return [self._make_series(
        date=date,
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=aa_cases_pct,
        pct_aa_deaths=aa_deaths_pct,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_cases,
        known_race_deaths=known_deaths,
    )]
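
# `raw_string_to_int` is defined elsewhere; a minimal sketch of the
# behavior assumed above, i.e. stripping separators and footnote marks
# before converting (the exact cleaning rules are an assumption):
#
#   def raw_string_to_int(s):
#       return int(re.sub(r'[^0-9]', '', s))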