def _get_values(self, select_value, data_row):
    """Pull the values named in select_value out of data_row.

    If any requested key is missing (or falsy), fall back to the row's
    'C' and 'R' entries, concatenated in that order.
    """
    row = []
    for sv in select_value:
        if sv and sv in data_row:
            row.append(data_row[sv])
        else:
            return (as_list(data_row.get('C', []))
                    + as_list(data_row.get('R', [])))
    return row
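
# Illustrative only: a minimal sketch of the row shapes this helper is
# assumed to handle, assuming as_list passes lists through unchanged (the
# select names and values below are hypothetical test data):
#
#   self._get_values(['cases', 'deaths'], {'cases': 10, 'deaths': 2})
#   # -> [10, 2]
#   self._get_values(['cases', 'deaths'], {'C': [10, 2]})
#   # -> [10, 2]  (falls back to the 'C'/'R' entries)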
def wait_for_conditions_on_webdriver(driver, conditions, timeout):
    """Wait until every condition holds, giving each up to timeout seconds."""
    try:
        for c in as_list(conditions):
            WebDriverWait(driver, timeout).until(c)
    except TimeoutException:
        _logger.error('Waiting timed out in %s seconds', timeout)
        raise
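
# Illustrative usage (the locators are hypothetical): block until a results
# table is present and a loading spinner has gone away, giving each
# condition up to 30 seconds.
#
#   from selenium.webdriver.common.by import By
#   from selenium.webdriver.support import expected_conditions as EC
#
#   wait_for_conditions_on_webdriver(driver, [
#       EC.presence_of_element_located((By.CSS_SELECTOR, 'div.results')),
#       EC.invisibility_of_element_located((By.CSS_SELECTOR, '.spinner')),
#   ], timeout=30)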
def _serarch_by_selects(selects, body_json):
    """Return True if every name in selects appears in the query's Select list."""
    if not selects:
        return True
    queries = body_json.get('queries', [])
    resp_selects = pydash.flat_map(
        queries,
        lambda q: pydash.get(
            q,
            'Query.Commands.0.SemanticQueryDataShapeCommand.Query.Select'))
    return not (set(as_list(selects)) - {s.get('Name') for s in resp_selects})
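
# Illustrative only: the query-body shape this check assumes, matching the
# 'SemanticQueryDataShapeCommand' path used above (the select names are
# hypothetical):
#
#   body_json = {'queries': [{'Query': {'Commands': [
#       {'SemanticQueryDataShapeCommand': {'Query': {'Select': [
#           {'Name': 'cases'}, {'Name': 'deaths'}]}}}]}}]}
#   _serarch_by_selects(['cases'], body_json)             # -> True
#   _serarch_by_selects(['hospitalizations'], body_json)  # -> False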
def __init__(self, element_locators, condition=Condition.PRESENCE,
             number_of_elements=None, timeout=60):
    if condition not in Condition:
        raise ExecutionStepException(
            'Invalid condition, check the `Condition` enum for valid '
            'conditions')
    self.locators = as_list(element_locators)
    self.condition = condition
    self.timeout = timeout
    self.number_of_elements = number_of_elements
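
# Illustrative usage, assuming this is the constructor of a wait step in
# this module (the class name and locator below are hypothetical):
#
#   step = WaitForElementsStep(
#       [(By.CSS_SELECTOR, 'table.data')],
#       condition=Condition.PRESENCE,
#       timeout=30)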
def _scrape(self, refresh=False, **kwargs):
    """Set refresh to true to ignore the cache.  If false, we will
    still use conditional GET to invalidate cached data.
    """
    _logger.debug('Find daily Florida URL')
    daily_url = get_daily_url(self.REPORTING_URL)
    _logger.debug(f'URL is {daily_url}')
    report_date = get_report_date(daily_url)
    _logger.info(f'Processing data for {report_date}')

    _logger.debug('Download the daily Florida URL')
    pdf_data = get_content(daily_url, force_remote=refresh)

    _logger.debug('Find the table area coordinates')
    table_bbox = get_table_area(pdf_data)
    table_area = (table_bbox.y0, table_bbox.x0, table_bbox.y1, table_bbox.x1)

    _logger.debug('Parse the PDF')
    table = as_list(read_pdf(
        BytesIO(pdf_data),
        pages='3',
        stream=True,
        multiple_tables=False,
        area=table_area,
        pandas_options=dict(
            header=None,
            names=COLUMN_NAMES,
            converters=CONVERTERS)))[0]

    _logger.debug('Set the race/ethnicity indices')
    races = ('White', 'Black', 'Other', 'Unknown race', 'Total')
    for idx, row in table.iterrows():
        if row['Race/ethnicity'] in races:
            race = row['Race/ethnicity']
            ethnicity = 'All ethnicities'
        else:
            ethnicity = row['Race/ethnicity']
        table.loc[idx, 'Race'] = race
        table.loc[idx, 'Ethnicity'] = ethnicity
    table = table.drop('Race/ethnicity', axis=1)
    table = table.set_index(['Race', 'Ethnicity'])

    _logger.debug('Fill NAs with 1')
    table.loc[('Total', 'All ethnicities')] = table.loc[
        ('Total', 'All ethnicities')].fillna(1)

    att_names = ['Cases', 'Deaths']
    all_cases_and_deaths = {
        nm: int(table.query(
            "Race == 'Total' and Ethnicity == 'All ethnicities'"
        )[nm].to_list()[0])
        for nm in att_names
    }
    aa_cases_and_deaths = {
        nm: int(table.query(
            "Race == 'Black' and Ethnicity == 'Non-Hispanic'"
        )[nm].to_list()[0])
        for nm in att_names
    }
    aa_cases_and_deaths_pct = {
        nm: round(100 * aa_cases_and_deaths[nm] / all_cases_and_deaths[nm], 2)
        for nm in att_names
    }

    return [
        self._make_series(
            date=report_date,
            cases=all_cases_and_deaths['Cases'],
            deaths=all_cases_and_deaths['Deaths'],
            aa_cases=aa_cases_and_deaths['Cases'],
            aa_deaths=aa_cases_and_deaths['Deaths'],
            pct_aa_cases=aa_cases_and_deaths_pct['Cases'],
            pct_aa_deaths=aa_cases_and_deaths_pct['Deaths'],
            pct_includes_unknown_race=True,
            pct_includes_hispanic_black=False,
        )
    ]
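
# Note on the table_area ordering above: tabula-py's `area` option takes
# (top, left, bottom, right) in PDF points, which is why the bbox is
# reordered as (y0, x0, y1, x1). A hypothetical bbox illustrates the
# mapping:
#
#   bbox = SimpleNamespace(x0=36.0, y0=110.5, x1=560.2, y1=720.0)
#   table_area = (bbox.y0, bbox.x0, bbox.y1, bbox.x1)
#   # -> (110.5, 36.0, 720.0, 560.2)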
def _scrape(self, **kwargs):
    # Download the files
    download_file(self.CASES_URL, 'cases.pdf')
    download_file(self.DEATHS_URL, 'deaths.pdf')

    # Extract the date
    pdf = fitz.Document(filename='cases.pdf', filetype='pdf')
    date = None
    for (x0, y0, x1, y1, block,
         block_no, block_type) in pdf[0].getText('blocks'):
        match = re.search(r'updated +(\d\d?)/(\d\d?)/(\d{4})', block)
        if match:
            month, day, year = map(int, match.groups())
            date = datetime.date(year, month, day)
            break
    if not date:
        raise ValueError('Unable to find date in cases PDF')
    _logger.info(f'Processing data for {date}')

    _logger.debug('Loading cases')
    cases_raw = as_list(read_pdf('cases.pdf', pages=1))[0]
    # Scan the rows to find where the header ends.
    for idx in cases_raw.index:
        if cases_raw.iloc[idx, 0] == 'Race and Ethnicity':
            cases = cases_raw.iloc[idx + 1:].copy()
            cases.columns = cases_raw.iloc[idx]
            break

    # Format the cases and calculate/extract data.
    cases['Count'] = cases['Count'].str.replace(',', '').astype(int)
    cases = cases.set_index('Race and Ethnicity')
    total_cases = cases['Count'].sum()
    total_known_cases = cases['Count'].drop(
        'Race/Ethnicity Other/Unknown').sum()
    cases['Percent'] = to_percentage(cases['Count'], total_known_cases)
    aa_cases_cnt = cases.loc['Black or African American', 'Count']
    aa_cases_pct = cases.loc['Black or African American', 'Percent']
    _logger.debug(f'Total cases: {total_cases}')
    _logger.debug(f'Total cases with known race: {total_known_cases}')
    _logger.debug(f'Total AA cases: {aa_cases_cnt}')
    _logger.debug(f'Pct AA cases: {aa_cases_pct}')

    _logger.debug('Loading deaths')
    deaths_raw = as_list(read_pdf('deaths.pdf', pages=1))[0]
    # Scan the rows to find where the header ends.
    for idx in deaths_raw.index:
        if deaths_raw.iloc[idx, 0] == 'Total Deaths':
            # Pick out the total deaths en passant
            total_deaths = self.check_cvt(deaths_raw.iloc[idx, 1])
        elif deaths_raw.iloc[idx, 0] == 'Race/Ethnicity':
            deaths = deaths_raw.iloc[idx + 1:]
            # The table is read with two columns, and centering makes
            # some entries in the left column get included in the right
            # instead.  dropna removes these.
            deaths = deaths.dropna().copy()
            deaths.columns = ['Race/Ethnicity', 'Count']
            break
    deaths = deaths.set_index('Race/Ethnicity')
    deaths['Count'] = deaths['Count'].apply(self.check_cvt)

    # Some reports have a discrepancy between the sum of known
    # race/ethnicity counts and the total reported ex-unknown count.
    # SD appears to use the latter, so we do the same.
    total_known_deaths = (
        total_deaths
        - deaths.loc['Race/Ethnicity Other/Unknown', 'Count'])
    deaths['Percent'] = to_percentage(deaths['Count'], total_known_deaths)
    aa_deaths_cnt = deaths.loc['Black or African American', 'Count']
    aa_deaths_pct = deaths.loc['Black or African American', 'Percent']
    _logger.debug(f'Total deaths: {total_deaths}')
    _logger.debug(f'Total deaths with known race: {total_known_deaths}')
    _logger.debug(f'Total AA deaths: {aa_deaths_cnt}')
    _logger.debug(f'Pct AA deaths: {aa_deaths_pct}')

    return [
        self._make_series(
            date=date,
            cases=total_cases,
            deaths=total_deaths,
            aa_cases=aa_cases_cnt,
            aa_deaths=aa_deaths_cnt,
            pct_aa_cases=aa_cases_pct,
            pct_aa_deaths=aa_deaths_pct,
            pct_includes_unknown_race=False,
            pct_includes_hispanic_black=False,
            known_race_cases=total_known_cases,
            known_race_deaths=total_known_deaths,
        )
    ]
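
# `to_percentage` is imported from elsewhere in this repo; a minimal sketch
# of the behavior assumed above (the rounding precision is an assumption):
#
#   def to_percentage(count, total, ndigits=2):
#       return round(100 * count / total, ndigits)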
def _scrape(self, **kwargs):
    # Download the report
    download_file(self.REPORT_URL, 'report.pdf')

    # Extract the date
    doc = fitz.Document(filename='report.pdf', filetype='pdf')
    date = None
    for (x0, y0, x1, y1, word,
         block_no, line_no, word_no) in doc[0].getText('words'):
        match = re.match(r'(\d+)/(\d+)/(\d+)', word)
        if match:
            month, day, year = map(int, match.groups())
            date = datetime.date(year, month, day)
            break
    if not date:
        raise ValueError('Unable to find date in report PDF')
    _logger.info(f'Processing data for {date}')

    # Extract totals data
    totals_list = as_list(read_pdf(
        'report.pdf',
        multiple_tables=True,
        pages=1,
        lattice=True,
        pandas_options={'header': None}))
    _logger.debug(f'First table is {totals_list[0]}')
    totals = totals_list[0]
    totals[0] = (totals[0]
                 .str.replace('*', '', regex=False)
                 .str.replace('\r', ' ', regex=False))
    totals.set_index(0, inplace=True)
    total_cases = raw_string_to_int(totals.loc['Total Cases', 1])
    total_deaths = raw_string_to_int(totals.loc['Total Deaths', 1])

    # Clean demographic data tables and extract data
    raw_tables = as_list(read_pdf(
        'report.pdf',
        lattice=True,
        multiple_tables=True,
        pages=[2],
        pandas_options={'header': None}))
    seen = set()
    _logger.debug(f'got {len(raw_tables)} tables:')
    for idx, table in enumerate(raw_tables):
        _logger.debug(f'table #{idx + 1}: {table}')
        if len(table) == 0:
            continue
        table.iloc[:, 0] = (table.iloc[:, 0]
                            .str.replace('*', '', regex=False)
                            .str.replace('\r', ' ', regex=False))
        race_label = table.iloc[:, 0].str.contains(
            'Where Race Known').fillna(False)
        if race_label.any():
            splits = table[race_label].index.values.tolist() + [-1]
            for header, end in zip(splits[:-1], splits[1:]):
                # Stash the table name
                title = str(table.iloc[header, 0])
                # Set up the table
                tbl = table.iloc[header + 1:end].copy()
                tbl.columns = ['race', 'value']
                tbl.set_index('race', inplace=True)
                tbl.loc[:, 'value'] = tbl.loc[:, 'value'].str.extract(
                    PCT_RE).astype(float)
                # Find the Black/AA label (this has varied from report
                # to report)
                black_label = _get_black_label(tbl)
                # Extract the data
                if title.find('Cases') >= 0 and 'cases' not in seen:
                    (known_case_pct, known_cases,
                     aa_cases_pct, aa_cases) = _extract_demographic_data(
                         tbl, title, total_cases, black_label)
                    seen.add('cases')
                elif title.find('Deaths') >= 0 and 'deaths' not in seen:
                    (known_death_pct, known_deaths,
                     aa_deaths_pct, aa_deaths) = _extract_demographic_data(
                         tbl, title, total_deaths, black_label)
                    seen.add('deaths')

    assert 'cases' in seen, 'Did not find Cases by Race table'
    assert 'deaths' in seen, 'Did not find Deaths by Race table'

    return [self._make_series(
        date=date,
        cases=total_cases,
        deaths=total_deaths,
        aa_cases=aa_cases,
        aa_deaths=aa_deaths,
        pct_aa_cases=aa_cases_pct,
        pct_aa_deaths=aa_deaths_pct,
        pct_includes_unknown_race=False,
        pct_includes_hispanic_black=True,
        known_race_cases=known_cases,
        known_race_deaths=known_deaths,
    )]
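
# `raw_string_to_int` is defined elsewhere; a minimal sketch of the
# behavior assumed above, i.e. stripping separators and footnote marks
# before converting (the exact cleaning rules are an assumption):
#
#   def raw_string_to_int(s):
#       return int(re.sub(r'[^0-9]', '', s))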