def get_dates_from_title(cls, title):
    # type: (str) -> Tuple[str,List[Tuple[datetime,datetime]]]
    """Get dataset dates (start and end dates in a list) from title and clean title of dates

    Args:
        title (str): Title to get date from and clean

    Returns:
        Tuple[str,List[Tuple[datetime,datetime]]]: Cleaned title, list of start and end dates
    """
    ranges = list()
    ignore_wrong_years = list()
    for match in cls.YEAR_RANGE_PATTERN.finditer(title):
        first_year, first_month, second_year = cls.get_month_year_in_slash_range(match, ignore_wrong_years)
        if first_year is None:
            continue
        if first_month is None:
            first_month = 1
        startdate = parse_date('%d-%d-01' % (first_year, first_month), '%Y-%m-%d', zero_time=True)
        enddate = parse_date('%s-12-31' % match.group(5), '%Y-%m-%d', zero_time=True)
        ranges.append((startdate, enddate))
        newtitle = remove_string(title, match.group(0))
        logger.info('Removing date range from title: %s -> %s' % (title, newtitle))
        title = newtitle
    for match in cls.YEAR_RANGE_PATTERN2.finditer(title):
        first_year, first_month, second_year = cls.get_month_year_in_slash_range(match, ignore_wrong_years)
        if first_year is None or second_year is None:
            continue
        startdate = parse_date('%d-01-01' % first_year, '%Y-%m-%d', zero_time=True)
        enddate = parse_date('%d-12-31' % second_year, '%Y-%m-%d', zero_time=True)
        ranges.append((startdate, enddate))
        newtitle = remove_string(title, match.group(0))
        logger.info('Removing date range from title: %s -> %s' % (title, newtitle))
        title = newtitle
    title = cls.fuzzy_match_dates_in_title(title, ranges, ignore_wrong_years)
    for match in cls.WORD_RIGHT_BRACKET_PATTERN.finditer(title):
        word = match.group(2)
        if word in cls.DATE_INTRO_WORDS:
            title = title.replace(match.group(0), ')')
    for match in cls.EMPTY_BRACKET_PATTERN.finditer(title):
        title = title.replace(match.group(0), ' ')
    title = remove_end_characters(title, '%s%s' % (PUNCTUATION_MINUS_BRACKETS, whitespace))
    title = remove_from_end(title, ['as of'] + cls.DATE_INTRO_WORDS, 'Removing - from title: %s -> %s')
    return title, sorted(ranges)
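# Illustrative sketch (not from the source): a stripped-down version of the
# range-extraction idea above, using a simplified year-range regex. The real
# class patterns (YEAR_RANGE_PATTERN etc.) also handle months, slash ranges
# like 2016/2017 and suspect years, which this toy version ignores.
import re
from datetime import datetime

SIMPLE_YEAR_RANGE = re.compile(r'(\d{4})\s*-\s*(\d{4})')

def simple_dates_from_title(title):
    ranges = []
    for match in SIMPLE_YEAR_RANGE.finditer(title):
        # Expand the matched years to full calendar years
        startdate = datetime(int(match.group(1)), 1, 1)
        enddate = datetime(int(match.group(2)), 12, 31)
        ranges.append((startdate, enddate))
        # Remove the matched range and tidy trailing separators
        title = title.replace(match.group(0), '').strip(' -,')
    return title, sorted(ranges)

# simple_dates_from_title('Conflict Data 2012-2016')
# -> ('Conflict Data', [(datetime(2012, 1, 1), datetime(2016, 12, 31))])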
def __init__(self, level, datasetinfo, headers, maxdateonly=True):
    if isinstance(level, str):
        if level == 'global':
            level = None
        elif level == 'national':
            level = 0
        else:
            level = 1
    self.level = level
    self.datecol = datasetinfo.get('date_col')
    self.datetype = datasetinfo.get('date_type')
    if self.datetype:
        if self.datetype == 'date':
            date = parse_date('1900-01-01')
        else:
            date = 0
    else:
        date = 0
    self.admininfo = AdminInfo.get()
    self.admcols = datasetinfo.get('adm_cols', list())
    if self.level is None:
        self.maxdate = date
    else:
        if self.level > len(self.admcols):
            raise ValueError('No admin columns specified for required level!')
        self.maxdates = {adm: date for adm in self.admininfo.adms[self.level]}
    self.maxdateonly = maxdateonly
    self.flatteninfo = datasetinfo.get('flatten')
    self.headers = headers
def get_transaction(configuration, dtransaction, activity_identifier):
    # We're not interested in transactions that have no value
    if not dtransaction.value:
        return None
    # We're only interested in some transaction types
    transaction_type_info = configuration["transaction_type_info"].get(dtransaction.type)
    if not transaction_type_info:
        return None
    # We're not interested in transactions that can't be valued
    try:
        # Use value-date falling back on date
        date = dtransaction.value_date
        if not date:
            date = dtransaction.date
        # Convert the transaction value to USD
        currency = dtransaction.currency
        if currency is None:
            logger.error(
                f"Activity {activity_identifier} transaction with value {dtransaction.value} currency error!"
            )
            return None
        value = Currency.get_historic_value_in_usd(dtransaction.value, currency, parse_date(date))
    except (ValueError, CurrencyError):
        logger.exception(
            f"Activity {activity_identifier} transaction with value {dtransaction.value} USD conversion failed!"
        )
        return None
    return Transaction(transaction_type_info, dtransaction, value)
def check_date(date):
    nonlocal conflict_start_date, start_date_in_conflict
    if start_date_in_conflict:
        return
    if not date:
        return
    start_date = parse_date(date)
    if start_date >= conflict_start_date:
        start_date_in_conflict = True
def do_set_value(self, row, scrapername=None):
    adms = [None for _ in range(len(self.admcols))]

    def get_adm(admcol, i):
        match = template.search(admcol)
        if match:
            template_string = match.group()
            admcol = self.headers[int(template_string[2:-2])]
        adm = row[admcol]
        if not adm:
            return False
        adms[i] = row[admcol].strip()
        return self.admininfo.get_adm(adms, i, scrapername)

    for i, admcol in enumerate(self.admcols):
        if admcol is None:
            continue
        if isinstance(admcol, str):
            admcol = [admcol]
        for admcl in admcol:
            exact = get_adm(admcl, i)
            if adms[i] and exact:
                break
        if not adms[i]:
            return None, None
    if self.datecol:
        date = row[self.datecol]
        if self.datetype == 'int':
            date = int(date)
        else:
            if not isinstance(date, datetime):
                date = parse_date(date)
            date = date.replace(tzinfo=None)
        if self.level is None:
            if self.maxdateonly:
                if date < self.maxdate:
                    return None, None
                self.maxdate = date
        else:
            if self.maxdateonly:
                if date < self.maxdates[adms[self.level]]:
                    return None, None
                self.maxdates[adms[self.level]] = date
    else:
        date = None
    if self.level is None:
        return 'global', date
    return adms[self.level], date
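# Illustrative sketch (an assumption: the module-level `template` regex used in
# get_adm above appears to match placeholders of the form {{n}}, since
# template_string[2:-2] strips two characters from each end to get the header
# index). This standalone version shows how a column spec like '{{1}}' would
# resolve to a real header name before the row lookup.
import re

template = re.compile(r'\{\{\d+\}\}')

def resolve_column(colspec, headers):
    match = template.search(colspec)
    if match:
        template_string = match.group()  # e.g. '{{1}}'
        return headers[int(template_string[2:-2])]
    return colspec

# resolve_column('{{1}}', ['Country', 'Region']) -> 'Region'
# resolve_column('Country', ['Country', 'Region']) -> 'Country'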
def get_monthly_report_source(configuration):
    monthly_report_configuration = configuration['monthly_report']
    dataset = monthly_report_configuration['dataset']
    resource = monthly_report_configuration['resource']
    if isinstance(dataset, str):
        dataset = Dataset.read_from_hdx(dataset)
    resource_name = resource
    resource = None
    for res in dataset.get_resources():
        if res['name'] == resource_name:
            resource = res
            break
    if not resource:
        raise ValueError('No monthly report resource found!')
    last_modified = parse_date(resource['last_modified']).isoformat()[:10]
    return monthly_report_configuration['hxltag'], last_modified, dataset['dataset_source'], resource['url']
def get_latest_columns(date, base_url, countryiso3s, input_cols, downloader):
    countries_index = download_data(date, base_url, countryiso3s, input_cols, downloader)
    valuedicts = [dict() for _ in input_cols]
    crisis_types = dict()
    max_date = default_date
    for countryiso3, country_data in countries_index.items():
        crises_types = country_data['ind_agg_type'].get('Aggregated')
        if not crises_types:
            crises_types = country_data['ind_agg_type'].get('Individual')
        type_of_crisis = crises_types[0]
        crisis_types[countryiso3] = type_of_crisis
        crisis = country_data['crises'][type_of_crisis]
        for i, input_col in enumerate(input_cols):
            val, last_updated = crisis[input_col]
            valuedicts[i][countryiso3] = val
            date = parse_date(last_updated)
            if date > max_date:
                max_date = date
    return valuedicts, crisis_types, max_date
def get_unhcr(configuration, today, today_str, countryiso3s, downloader, scrapers=None):
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    iso3tocode = downloader.download_tabular_key_value(join('config', 'UNHCR_geocode.csv'))
    unhcr_configuration = configuration['unhcr']
    base_url = unhcr_configuration['url']
    population_collections = unhcr_configuration['population_collections']
    exclude = unhcr_configuration['exclude']
    valuedicts = [dict(), dict()]
    for countryiso3 in countryiso3s:
        if countryiso3 in exclude:
            continue
        code = iso3tocode.get(countryiso3)
        if not code:
            continue
        for population_collection in population_collections:
            r = downloader.download(base_url % (population_collection, code))
            data = r.json()['data'][0]
            individuals = data['individuals']
            if individuals is None:
                continue
            date = data['date']
            if parse_date(date) < today - relativedelta(years=2):
                continue
            existing_individuals = valuedicts[0].get(countryiso3)
            if existing_individuals is None:
                valuedicts[0][countryiso3] = int(individuals)
                valuedicts[1][countryiso3] = date
            else:
                valuedicts[0][countryiso3] += int(individuals)
    logger.info('Processed UNHCR')
    hxltags = ['#affected+refugees', '#affected+date+refugees']
    return [['TotalRefugees', 'TotalRefugeesDate'], hxltags], valuedicts, [
        (hxltag, today_str, 'UNHCR', unhcr_configuration['source_url'])
        for hxltag in hxltags
    ]
def __init__(self, level, datasetinfo, headers, indicatorcols, maxdateonly=True):
    if isinstance(level, str):
        if level == 'global':
            level = None
        elif level == 'national':
            level = 0
        else:
            level = 1
    self.level = level
    self.datecol = datasetinfo.get('date_col')
    self.datetype = datasetinfo.get('date_type')
    if self.datetype:
        if self.datetype == 'date':
            date = parse_date('1900-01-01')
        else:
            date = 0
    else:
        date = 0
    self.maxdate = date
    date_condition = datasetinfo.get('date_condition')
    if date_condition is not None:
        for col in datasetinfo['val_cols']:
            date_condition = date_condition.replace(col, f"row['{col}']")
    self.date_condition = date_condition
    self.admininfo = AdminInfo.get()
    self.admcols = datasetinfo.get('adm_cols', list())
    self.admexact = datasetinfo.get('adm_exact', False)
    self.indicatorcols = indicatorcols
    if self.level is None:
        self.maxdates = {i: date for i, _ in enumerate(indicatorcols)}
    else:
        if self.level > len(self.admcols):
            raise ValueError('No admin columns specified for required level!')
        self.maxdates = {i: {adm: date for adm in self.admininfo.adms[self.level]}
                         for i, _ in enumerate(indicatorcols)}
    self.maxdateonly = maxdateonly
    self.flatteninfo = datasetinfo.get('flatten')
    self.headers = headers
    self.filters = dict()
    self.get_external_filter(datasetinfo)
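# Illustrative sketch (not from the source): how the date_condition rewrite in
# the constructor above works. Column names in the configured condition string
# are replaced with row['...'] lookups so that the string can later be eval'd
# against each row during parsing.
def build_date_condition(date_condition, val_cols):
    for col in val_cols:
        date_condition = date_condition.replace(col, f"row['{col}']")
    return date_condition

condition = build_date_condition('Cases > 100', ['Cases'])
# condition is now "row['Cases'] > 100"
row = {'Cases': 250}
assert eval(condition) is True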
def test_get_indicators(self, configuration, folder):
    with temp_dir('TestCovidViz', delete_on_success=True, delete_on_failure=False) as tempdir:
        with Download(user_agent='test') as downloader:
            retriever = Retrieve(downloader, tempdir, folder, tempdir, save=False, use_saved=True)
            tabs = configuration['tabs']
            noout = NoOutput(tabs)
            jsonout = JsonOutput(configuration, tabs)
            outputs = {'gsheets': noout, 'excel': noout, 'json': jsonout}
            today = parse_date('2021-05-03')
            countries_to_save = get_indicators(
                configuration, today, retriever, outputs, tabs,
                scrapers=[
                    'ifi', 'who_global', 'who_national', 'who_subnational',
                    'who_covid', 'sadd', 'covidtests', 'cadre_harmonise',
                    'access', 'food_prices'
                ],
                use_live=False)
            filepaths = jsonout.save(tempdir, countries_to_save=countries_to_save)
            assert filecmp.cmp(filepaths[0], join(folder, 'test_scraper_all.json'))
            assert filecmp.cmp(filepaths[1], join(folder, 'test_scraper.json'))
            assert filecmp.cmp(filepaths[2], join(folder, 'test_scraper_daily.json'))
            assert filecmp.cmp(filepaths[3], join(folder, 'test_scraper_covidseries.json'))
def exclude_dactivity(cls, dactivity):
    if cls.has_desired_scope(dactivity.humanitarian_scopes):
        return False
    if not dactivity.humanitarian:
        return True
    if dactivity.activity_status != "2":
        return True
    conflict_start_date = parse_date("2022-02-24")
    relevant_countries = ("UA", "PL", "HU", "SK", "RO", "MD", "BY", "RU")
    start_date_in_conflict = False
    country_in_list = False
    text_in_narrative = False

    def check_date(date):
        nonlocal conflict_start_date, start_date_in_conflict
        if start_date_in_conflict:
            return
        if not date:
            return
        start_date = parse_date(date)
        if start_date >= conflict_start_date:
            start_date_in_conflict = True

    def check_countries(countries):
        nonlocal relevant_countries, country_in_list
        if country_in_list:
            return
        if not countries:
            return
        for country in countries:
            if country.code in relevant_countries:
                country_in_list = True
                return

    def check_narratives(title_or_desc):
        nonlocal text_in_narrative
        if text_in_narrative:
            return
        if not title_or_desc:
            return
        for lang, text in title_or_desc.narratives.items():
            text_lower = text.lower()
            if "ukraine" in text_lower or "ukrainian" in text_lower:
                text_in_narrative = True
                return

    check_date(dactivity.start_date_actual)
    check_countries(dactivity.recipient_countries)
    check_narratives(dactivity.title)
    check_narratives(dactivity.description)
    for dtransaction in dactivity.transactions:
        check_date(dtransaction.date)
        check_date(dtransaction.value_date)
        check_countries(dtransaction.recipient_countries)
        check_narratives(dtransaction.description)
    if not country_in_list:
        return True
    if not start_date_in_conflict:
        return True
    if not text_in_narrative:
        return True
    return False
def run_scrapers(datasets, countryiso3s, adminone, level, maindownloader, basic_auths=dict(), today=None, scrapers=None, population_lookup=None, **kwargs):
    # type: (Dict, List[str], AdminOne, str, Download, Dict[str,str], Optional[datetime], Optional[List[str]], Dict[str,int], Any) -> Dict
    """Runs all mini scrapers given in configuration and returns headers, values and sources.

    Args:
        datasets (Dict): Configuration for mini scrapers
        countryiso3s (List[str]): List of ISO3 country codes to process
        adminone (AdminOne): AdminOne object from HDX Python Country library that handles processing of admin level 1
        level (str): Can be global, national or subnational
        maindownloader (Download): Download object for downloading files
        basic_auths (Dict[str,str]): Dictionary of basic authentication information
        today (Optional[datetime]): Value to use for today. Defaults to None (datetime.now()).
        scrapers (Optional[List[str]]): List of mini scraper names to process
        population_lookup (Dict[str,int]): Dictionary from admin code to population
        **kwargs: Variables to use when evaluating template arguments in urls

    Returns:
        Dict: Dictionary of output containing output headers, values and sources
    """
    results = {
        'headers': [list(), list()],
        'values': list(),
        'sources': list()
    }
    now = datetime.now()
    for name in datasets:
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            downloader = Download(basic_auth=basic_auth, rate_limit={'calls': 1, 'period': 0.1})
        datasetinfo = datasets[name]
        datasetinfo['name'] = name
        headers, iterator = read(downloader, datasetinfo, today=today, **kwargs)
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get('force_date_today', False):
            today_str = kwargs.get('today_str')
            if today_str:
                today = parse_date(today_str)
            else:
                if not today:
                    today = now
                today_str = today.strftime('%Y-%m-%d')
            datasetinfo['date'] = today_str
        _run_scraper(countryiso3s, adminone, level, today, name, datasetinfo,
                     headers, iterator, population_lookup, results)
        if downloader != maindownloader:
            downloader.close()
    if population_lookup is not None:
        add_population(population_lookup, results['headers'], results['values'])
    return results
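# Illustrative sketch (not from the source): the scraper-selection rule used
# above. Matching is by substring, not equality, so scrapers=['who'] selects
# both 'who_national' and 'who_global'; with no scrapers argument, everything
# except the special 'population' entry runs.
def select_scrapers(names, scrapers=None):
    if not scrapers:
        return [name for name in names if name != 'population']
    return [name for name in names
            if any(scraper in name for scraper in scrapers)]

# select_scrapers(['population', 'who_national', 'who_global'], ['who'])
# -> ['who_national', 'who_global']
# select_scrapers(['population', 'who_national'])
# -> ['who_national']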
def test_save(self, configuration, fixtures, hxltags):
    with temp_dir('TestScraperSave', delete_on_success=True, delete_on_failure=False) as tempdir:
        with Download(user_agent='test') as downloader:
            tabs = configuration['tabs']
            sheetname = list(tabs.values())[0]
            noout = NoOutput(tabs)
            excelout = ExcelOutput(join(tempdir, 'test_output.xlsx'), tabs, tabs)
            gsheet_auth = getenv('GSHEET_AUTH')
            if not gsheet_auth:
                raise ValueError('No gsheet authorisation supplied!')
            googleout = GoogleSheets(configuration, gsheet_auth, None, tabs, tabs)
            jsonout = JsonOutput(configuration, tabs)
            output = [list(hxltags.keys()), list(hxltags.values()),
                      ['AFG', 'Afghanistan', 38041754]]
            # won't do anything as wrong tab name
            excelout.update_tab('lala', output, hxltags=hxltags)
            googleout.update_tab('lala', output, hxltags=hxltags)
            jsonout.update_tab('lala', output, hxltags=hxltags)
            noout.update_tab('national', output, hxltags=hxltags)
            excelout.update_tab('national', output, hxltags=hxltags)
            googleout.update_tab('national', output, hxltags=hxltags)
            jsonout.update_tab('national', output, hxltags=hxltags)
            noout.add_additional_json(downloader, today=parse_date('2020-10-01'))
            jsonout.add_additional_json(downloader, today=parse_date('2020-10-01'))
            noout.save()
            excelout.save()
            filepaths = jsonout.save(tempdir, countries_to_save=['AFG'])
            excelsheet = excelout.workbook.get_sheet_by_name(sheetname)

            def get_list_from_cells(cells):
                result = [list(), list(), list()]
                for i, row in enumerate(excelsheet[cells]):
                    for column in row:
                        result[i].append(column.value)
                return result

            assert get_list_from_cells('A1:C3') == output
            spreadsheet = googleout.gc.open_by_url(configuration['googlesheets']['test'])
            googletab = spreadsheet.worksheet_by_title(sheetname)
            result = googletab.get_values(start=(1, 1), end=(3, 3), returnas='matrix')
            result[2][2] = int(result[2][2])
            assert result == output
            assert filecmp.cmp(filepaths[0], join(fixtures, 'test_scraper_all.json'))
            assert filecmp.cmp(filepaths[1], join(fixtures, 'test_scraper_population.json'))
            assert filecmp.cmp(filepaths[2], join(fixtures, 'test_scraper_population.json'))
            assert filecmp.cmp(filepaths[3], join(fixtures, 'test_scraper_other.json'))
            jsonout.json = dict()
            df = pandas.DataFrame(output[2:], columns=output[0])
            noout.update_tab('national', df, hxltags=hxltags)
            excelout.update_tab('national', df, hxltags=hxltags)
            googleout.update_tab('national', df, hxltags=hxltags)
            jsonout.update_tab('national', df, hxltags=hxltags)
            jsonout.add_additional_json(downloader, today=parse_date('2020-10-01'))
            filepaths = jsonout.save(tempdir, countries_to_save=['AFG'])
            assert get_list_from_cells('A1:C3') == output
            result = googletab.get_values(start=(1, 1), end=(3, 3), returnas='matrix')
            result[2][2] = int(result[2][2])
            assert result == output
            assert filecmp.cmp(filepaths[0], join(fixtures, 'test_scraper_all.json'))
            assert filecmp.cmp(filepaths[1], join(fixtures, 'test_scraper_population.json'))
            assert filecmp.cmp(filepaths[2], join(fixtures, 'test_scraper_population.json'))
            assert filecmp.cmp(filepaths[3], join(fixtures, 'test_scraper_other.json'))
            df = pandas.DataFrame(output[1:], columns=output[0])
            googleout.update_tab('national', df, limit=2)
            result = googletab.get_values(start=(1, 1), end=(3, 3), returnas='matrix')
            result[2][2] = int(result[2][2])
def __init__(self, countryiso3s, adminone, level, today, datasetinfo, headers, subsets, maxdateonly=True):
    # type: (List[str], AdminOne, str, datetime, Dict, List[str], List[Dict], bool) -> None
    def get_level(lvl):
        if isinstance(lvl, str):
            if lvl == 'global':
                return None
            elif lvl == 'national':
                return 0
            else:
                return 1
        return lvl

    self.level = get_level(level)
    self.today = today
    self.sort = datasetinfo.get('sort')
    self.datecol = datasetinfo.get('date_col')
    self.datetype = datasetinfo.get('date_type')
    if self.datetype:
        if self.datetype == 'date':
            date = parse_date('1900-01-01')
        else:
            date = 0
    else:
        date = 0
    self.maxdate = date
    datelevel = datasetinfo.get('date_level')
    if datelevel is None:
        self.datelevel = self.level
    else:
        self.datelevel = get_level(datelevel)
    date_condition = datasetinfo.get('date_condition')
    if date_condition is not None:
        for col in datasetinfo['input_cols']:
            date_condition = date_condition.replace(col, f"row['{col}']")
    self.date_condition = date_condition
    self.single_maxdate = datasetinfo.get('single_maxdate', False)
    self.ignore_future_date = datasetinfo.get('ignore_future_date', True)
    self.adminone = adminone
    self.admcols = datasetinfo.get('adm_cols', list())
    self.admexact = datasetinfo.get('adm_exact', False)
    self.subsets = subsets
    adms = datasetinfo.get('adm_vals')
    if adms is None:
        self.adms = [countryiso3s, self.adminone.pcodes]
    else:
        if self.datelevel == 1:
            self.adms = adms
        else:
            self.adms = [adms, self.adminone.pcodes]
    if self.datelevel is None:
        self.maxdates = {i: date for i, _ in enumerate(subsets)}
    else:
        if self.datelevel > len(self.admcols):
            raise ValueError('No admin columns specified for required level!')
        self.maxdates = {
            i: {adm: date for adm in self.adms[self.datelevel]}
            for i, _ in enumerate(subsets)
        }
    self.maxdateonly = maxdateonly
    self.flatteninfo = datasetinfo.get('flatten')
    self.headers = headers
    self.filters = dict()
    self.read_external_filter(datasetinfo)
def parse(self, row, scrapername=None):
    # type: (Dict, str) -> Tuple[Optional[str], Optional[List[bool]]]
    """Parse row checking for valid admin information and whether the row should be
    filtered out in each subset given its definition.

    Args:
        row (Dict): Row to parse
        scrapername (str): Name of mini scraper. Defaults to None.

    Returns:
        Tuple[Optional[str], Optional[List[bool]]]: (admin name, should process subset list) or (None, None)
    """
    if self.filtered(row):
        return None, None
    adms = [None for _ in range(len(self.admcols))]

    def get_adm(admcol, i):
        template_string, match_string = match_template(admcol)
        if template_string:
            admcol = self.headers[int(match_string)]
        adm = row[admcol]
        if not adm:
            return False
        adm = adm.strip()
        adms[i] = adm
        if adm in self.adms[i]:
            return True
        exact = False
        if self.admexact:
            adms[i] = None
        else:
            if i == 0:
                adms[i], exact = Country.get_iso3_country_code_fuzzy(adm)
            elif i == 1:
                adms[i], exact = self.adminone.get_pcode(adms[0], adm, scrapername)
            if adms[i] not in self.adms[i]:
                adms[i] = None
        return exact

    for i, admcol in enumerate(self.admcols):
        if admcol is None:
            continue
        if isinstance(admcol, str):
            admcol = [admcol]
        for admcl in admcol:
            exact = get_adm(admcl, i)
            if adms[i] and exact:
                break
        if not adms[i]:
            return None, None
    should_process_subset = list()
    for subset in self.subsets:
        filter = subset['filter']
        process = True
        if filter:
            filters = filter.split('|')
            for filterstr in filters:
                filter = filterstr.split('=')
                if row[filter[0]] != filter[1]:
                    process = False
                    break
        should_process_subset.append(process)
    if self.datecol:
        if isinstance(self.datecol, list):
            dates = [str(row[x]) for x in self.datecol]
            date = ''.join(dates)
        else:
            date = row[self.datecol]
        if self.datetype == 'date':
            if not isinstance(date, datetime):
                date = parse_date(date)
            date = date.replace(tzinfo=None)
            if date > self.today and self.ignore_future_date:
                return None, None
        elif self.datetype == 'year':
            date = int(date)
            if date > self.today.year and self.ignore_future_date:
                return None, None
        else:
            date = int(date)
        if self.date_condition:
            if eval(self.date_condition) is False:
                return None, None
        for i, process in enumerate(should_process_subset):
            if not process:
                continue
            if date < self.maxdate:
                if self.single_maxdate:
                    should_process_subset[i] = False
            else:
                self.maxdate = date
            if self.datelevel is None:
                if self.maxdateonly:
                    if date < self.maxdates[i]:
                        should_process_subset[i] = False
                    else:
                        self.maxdates[i] = date
                else:
                    self.maxdates[i] = date
            else:
                if self.maxdateonly:
                    if date < self.maxdates[i][adms[self.datelevel]]:
                        should_process_subset[i] = False
                    else:
                        self.maxdates[i][adms[self.datelevel]] = date
                else:
                    self.maxdates[i][adms[self.datelevel]] = date
    if self.level is None:
        return 'global', should_process_subset
    return adms[self.level], should_process_subset
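# Illustrative sketch (not from the source): the 'col=value|col2=value2' filter
# syntax evaluated per subset above. Every pipe-separated clause must match the
# row for the subset to be processed; comparisons are against the raw row
# values. The original indexes row[...] directly; .get() here is just to keep
# the sketch self-contained.
def subset_passes_filter(row, filter):
    if not filter:
        return True
    for filterstr in filter.split('|'):
        key, _, value = filterstr.partition('=')
        if row.get(key) != value:
            return False
    return True

# subset_passes_filter({'Indicator': 'deaths', 'Sex': 'f'},
#                      'Indicator=deaths|Sex=f')  -> True
# subset_passes_filter({'Indicator': 'cases', 'Sex': 'f'},
#                      'Indicator=deaths')        -> False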
def test_get_tabular(self, configuration):
    with Download(user_agent='test') as downloader:
        today = parse_date('2020-10-01')
        adminone = AdminOne(configuration)
        population_lookup = dict()
        level = 'national'
        scraper_configuration = configuration[f'scraper_{level}']
        results = run_scrapers(scraper_configuration, ['AFG'], adminone, level,
                               downloader, today=today, scrapers=['population'],
                               population_lookup=population_lookup)
        assert results['headers'] == [['Population'], ['#population']]
        assert results['values'] == [{'AFG': 38041754}]
        assert results['sources'] == [
            ('#population', '2020-10-01', 'World Bank',
             'https://data.humdata.org/organization/world-bank-group')
        ]
        results = run_scrapers(scraper_configuration, ['AFG'], adminone, level,
                               downloader, today=today, scrapers=['who'],
                               population_lookup=population_lookup)
        assert results['headers'] == [
            ['CasesPer100000', 'DeathsPer100000', 'Cases2Per100000',
             'Deaths2Per100000'],
            ['#affected+infected+per100000', '#affected+killed+per100000',
             '#affected+infected+2+per100000', '#affected+killed+2+per100000']
        ]
        assert results['values'] == [{'AFG': '96.99'}, {'AFG': '3.41'},
                                     {'AFG': '96.99'}, {'AFG': '3.41'}]
        assert results['sources'] == [
            ('#affected+infected+per100000', '2020-08-06', 'WHO',
             'tests/fixtures/WHO-COVID-19-global-data.csv'),
            ('#affected+killed+per100000', '2020-08-06', 'WHO',
             'tests/fixtures/WHO-COVID-19-global-data.csv'),
            ('#affected+infected+2+per100000', '2020-08-06', 'WHO',
             'tests/fixtures/WHO-COVID-19-global-data.csv'),
            ('#affected+killed+2+per100000', '2020-08-06', 'WHO',
             'tests/fixtures/WHO-COVID-19-global-data.csv')
        ]
        results = run_scrapers(scraper_configuration, ['AFG'], adminone, level,
                               downloader, today=today, scrapers=['access'],
                               population_lookup=population_lookup)
        assert results['headers'] == [
            ['% of visas pending or denied',
             '% of travel authorizations or movements denied',
             'Number of incidents reported in previous year',
             'Number of incidents reported since start of year',
             'Number of incidents reported since start of previous year',
             '% of CERF projects affected by insecurity and inaccessibility',
             '% of CBPF projects affected by insecurity and inaccessibility',
             'Campaign Vaccine', 'Campaign Vaccine Status',
             'Number of learners enrolled from pre-primary to tertiary education'],
            ['#access+visas+pct', '#access+travel+pct',
             '#event+year+previous+num', '#event+year+todate+num',
             '#event+year+previous+todate+num',
             '#activity+cerf+project+insecurity+pct',
             '#activity+cbpf+project+insecurity+pct', '#service+name',
             '#status+name', '#population+education']
        ]
        assert results['values'] == [
            {'AFG': 0.2}, {'AFG': 'N/A'}, {'AFG': '20'}, {'AFG': '2'},
            {'AFG': '22'}, {'AFG': 0.5710000000000001}, {'AFG': 0.04},
            {'AFG': 'bivalent Oral Poliovirus'}, {'AFG': 'Postponed'},
            {'AFG': 9979405}
        ]
        assert results['sources'] == [
            ('#access+visas+pct', '2020-10-01', 'OCHA',
             'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'),
            ('#access+travel+pct', '2020-10-01', 'OCHA',
             'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'),
            ('#event+year+previous+num', '2020-10-01', 'Aid Workers Database',
             'https://data.humdata.org/dataset/security-incidents-on-aid-workers'),
            ('#event+year+todate+num', '2020-10-01', 'Aid Workers Database',
             'https://data.humdata.org/dataset/security-incidents-on-aid-workers'),
            ('#event+year+previous+todate+num', '2020-10-01', 'Aid Workers Database',
             'https://data.humdata.org/dataset/security-incidents-on-aid-workers'),
            ('#activity+cerf+project+insecurity+pct', '2020-10-01', 'UNCERF',
             'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'),
            ('#activity+cbpf+project+insecurity+pct', '2020-10-01', 'UNCERF',
             'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'),
            ('#service+name', '2020-10-01', 'Multiple sources',
             'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'),
            ('#status+name', '2020-10-01', 'Multiple sources',
             'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv'),
            ('#population+education', '2020-10-01', 'UNESCO',
             'https://docs.google.com/spreadsheets/d/e/2PACX-1vRSzJzuyVt9i_mkRQ2HbxrUl2Lx2VIhkTHQM-laE8NyhQTy70zQTCuFS3PXbhZGAt1l2bkoA4_dAoAP/pub?gid=1565063847&single=true&output=csv')
        ]
        results = run_scrapers(scraper_configuration, ['AFG'], adminone, level,
                               downloader, today=today, scrapers=['sadd'],
                               population_lookup=population_lookup)
        assert results['headers'] == [
            ['Cases (% male)', 'Cases (% female)', 'Deaths (% male)',
             'Deaths (% female)'],
            ['#affected+infected+m+pct', '#affected+f+infected+pct',
             '#affected+killed+m+pct', '#affected+f+killed+pct']
        ]
        assert results['values'] == [{'AFG': '0.7044'}, {'AFG': '0.2956'},
                                     {'AFG': '0.7498'}, {'AFG': '0.2502'}]
        assert results['sources'] == [
            ('#affected+infected+m+pct', '2020-08-07', 'SADD',
             'tests/fixtures/covid-19-sex-disaggregated-data.csv'),
            ('#affected+f+infected+pct', '2020-08-07', 'SADD',
             'tests/fixtures/covid-19-sex-disaggregated-data.csv'),
            ('#affected+killed+m+pct', '2020-08-07', 'SADD',
             'tests/fixtures/covid-19-sex-disaggregated-data.csv'),
            ('#affected+f+killed+pct', '2020-08-07', 'SADD',
             'tests/fixtures/covid-19-sex-disaggregated-data.csv')
        ]
        results = run_scrapers(scraper_configuration, ['AFG', 'PHL'], adminone,
                               level, downloader, today=today,
                               scrapers=['ourworldindata'],
                               population_lookup=population_lookup)
        assert results['headers'] == [['TotalDosesAdministered'],
                                      ['#capacity+doses+administered+total']]
        assert results['values'] == [dict()]
        assert results['sources'] == [
            ('#capacity+doses+administered+total', '2020-10-01',
             'Our World in Data',
             'tests/fixtures/ourworldindata_vaccinedoses.csv')
        ]
        today = parse_date('2021-05-03')
        results = run_scrapers(scraper_configuration, ['AFG', 'PHL'], adminone,
                               level, downloader, today=today,
                               scrapers=['ourworldindata'],
                               population_lookup=population_lookup)
        assert results['headers'] == [['TotalDosesAdministered'],
                                      ['#capacity+doses+administered+total']]
        assert results['values'] == [{'AFG': '240000'}]
        # NB: The source date will have been written into the in-memory config
        # by the immediately previous run of the ourworldindata scraper and is
        # hence 2020-10-01 instead of 2021-05-03
        assert results['sources'] == [
            ('#capacity+doses+administered+total', '2020-10-01',
             'Our World in Data',
             'tests/fixtures/ourworldindata_vaccinedoses.csv')
        ]
        today = parse_date('2020-10-01')
        level = 'subnational'
        scraper_configuration = configuration[f'scraper_{level}']
        results = run_scrapers(scraper_configuration, ['AFG'], adminone, level,
                               downloader, today=today, scrapers=['gam'],
                               population_lookup=population_lookup)
        assert results['headers'] == [['Malnutrition Estimate'],
                                      ['#severity+malnutrition+num+subnational']]
        assert results['values'] == [{
            'AF17': 3.371688, 'AF31': 3.519166, 'AF09': 1.524646,
            'AF21': 1.319626, 'AF10': 1.40426, 'AF24': 1.043487,
            'AF33': 2.745447, 'AF29': 2.478977, 'AF11': 1.022871,
            'AF23': 1.340286, 'AF30': 1.677612, 'AF32': 1.687488,
            'AF28': 0.6210205, 'AF01': 1.282291, 'AF27': 1.378641,
            'AF02': 3.552082, 'AF14': 0.7653555, 'AF15': 0.953823,
            'AF19': 1.684882, 'AF07': 2.090165, 'AF05': 0.9474334,
            'AF06': 2.162038, 'AF34': 1.6455, 'AF16': 1.927783,
            'AF12': 4.028857, 'AF13': 9.150105, 'AF08': 1.64338,
            'AF03': 2.742952, 'AF20': 1.382376, 'AF22': 1.523334,
            'AF18': 0.9578965, 'AF25': 0.580423, 'AF04': 0.501081,
            'AF26': 4.572629
        }]
        assert results['sources'] == [
            ('#severity+malnutrition+num+subnational', '2020-10-01', 'UNICEF',
             'tests/fixtures/unicef_who_wb_global_expanded_databases_severe_wasting.xlsx')
        ]
        scraper_configuration = configuration['other']
        results = run_scrapers(scraper_configuration, ['AFG'], adminone, level,
                               downloader, today=today, scrapers=['gam'],
                               population_lookup=population_lookup)
        assert results['headers'] == [['Malnutrition Estimate'],
                                      ['#severity+malnutrition+num+subnational']]
        assert results['values'] == [{'AF09': 1.524646, 'AF24': 1.043487}]
        assert results['sources'] == [
            ('#severity+malnutrition+num+subnational', '2020-10-01', 'UNICEF',
             'tests/fixtures/unicef_who_wb_global_expanded_databases_severe_wasting.xlsx')
        ]
        level = 'global'
        scraper_configuration = configuration[f'scraper_{level}']
        results = run_scrapers(scraper_configuration, configuration['HRPs'],
                               adminone, level, downloader, today=today,
                               scrapers=['covax'],
                               population_lookup=population_lookup)
        assert results['headers'] == [
            ['Covax Interim Forecast Doses', 'Covax Delivered Doses',
             'Other Delivered Doses', 'Total Delivered Doses',
             'Covax Pfizer-BioNTech Doses', 'Covax Astrazeneca-SII Doses',
             'Covax Astrazeneca-SKBio Doses'],
            ['#capacity+doses+forecast+covax', '#capacity+doses+delivered+covax',
             '#capacity+doses+delivered+others', '#capacity+doses+delivered+total',
             '#capacity+doses+covax+pfizerbiontech',
             '#capacity+doses+covax+astrazenecasii',
             '#capacity+doses+covax+astrazenecaskbio']
        ]
        assert results['values'] == [
            {'global': '73248240'}, {'global': '12608040'},
            {'global': '23728358'}, {'global': '36336398'},
            {'global': '271440'}, {'global': '67116000'}, {'global': '5860800'}
        ]
        assert results['sources'] == [
            ('#capacity+doses+forecast+covax', '2020-08-07', 'covax',
             'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'),
            ('#capacity+doses+delivered+covax', '2020-08-07', 'covax',
             'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'),
            ('#capacity+doses+delivered+others', '2020-08-07', 'covax',
             'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'),
            ('#capacity+doses+delivered+total', '2020-08-07', 'covax',
             'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'),
            ('#capacity+doses+covax+pfizerbiontech', '2020-08-07', 'covax',
             'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'),
            ('#capacity+doses+covax+astrazenecasii', '2020-08-07', 'covax',
             'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv'),
            ('#capacity+doses+covax+astrazenecaskbio', '2020-08-07', 'covax',
             'tests/fixtures/COVID-19 Vaccine Doses in HRP Countries - Data HXL.csv')
        ]
        results = run_scrapers(scraper_configuration, configuration['HRPs'],
                               adminone, level, downloader, today=today,
                               scrapers=['cerf_global'],
                               population_lookup=population_lookup)
        assert results['headers'] == [
            ['CBPFFunding', 'CBPFFundingGMEmpty', 'CBPFFundingGM0',
             'CBPFFundingGM1', 'CBPFFundingGM2', 'CBPFFundingGM3',
             'CBPFFundingGM4', 'CERFFunding', 'CERFFundingGMEmpty',
             'CERFFundingGM0', 'CERFFundingGM1', 'CERFFundingGM2',
             'CERFFundingGM3', 'CERFFundingGM4'],
            ['#value+cbpf+funding+total+usd',
             '#value+cbpf+funding+gmempty+total+usd',
             '#value+cbpf+funding+gm0+total+usd',
             '#value+cbpf+funding+gm1+total+usd',
             '#value+cbpf+funding+gm2+total+usd',
             '#value+cbpf+funding+gm3+total+usd',
             '#value+cbpf+funding+gm4+total+usd',
             '#value+cerf+funding+total+usd',
             '#value+cerf+funding+gmempty+total+usd',
             '#value+cerf+funding+gm0+total+usd',
             '#value+cerf+funding+gm1+total+usd',
             '#value+cerf+funding+gm2+total+usd',
             '#value+cerf+funding+gm3+total+usd',
             '#value+cerf+funding+gm4+total+usd']
        ]
        assert results['values'] == [
            {'global': 906790749.5500005}, {'global': 829856355.4100008},
            {'global': 37432868.04999999}, {'global': 39501526.08999999},
            {}, {}, {}, {'global': 848145238.0}, {}, {'global': 50042305.0},
            {'global': 75349572.0}, {'global': 224560378.0},
            {'global': 349338181.0}, {'global': 147855321.0}
        ]
        assert results['sources'] == [
            ('#value+cbpf+funding+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gmempty+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm0+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm1+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm2+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm3+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm4+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gmempty+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm0+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm1+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm2+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm3+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm4+total+usd', '2020-10-01', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations')
        ]
        results = run_scrapers(scraper_configuration, configuration['HRPs'],
                               adminone, level, downloader, today=today,
                               scrapers=['ourworldindata'],
                               population_lookup=population_lookup)
        assert results['headers'] == [['TotalDosesAdministered'],
                                      ['#capacity+doses+administered+total']]
        assert results['values'] == [dict()]
        assert results['sources'] == [
            ('#capacity+doses+administered+total', '2020-10-01',
             'Our World in Data',
             'tests/fixtures/ourworldindata_vaccinedoses.csv')
        ]
        today = parse_date('2021-05-03')
        results = run_scrapers(scraper_configuration, configuration['HRPs'],
                               adminone, level, downloader, today=today,
                               scrapers=['cerf_global'],
                               population_lookup=population_lookup)
        assert results['headers'] == [
            ['CBPFFunding', 'CBPFFundingGMEmpty', 'CBPFFundingGM0',
             'CBPFFundingGM1', 'CBPFFundingGM2', 'CBPFFundingGM3',
             'CBPFFundingGM4', 'CERFFunding', 'CERFFundingGMEmpty',
             'CERFFundingGM0', 'CERFFundingGM1', 'CERFFundingGM2',
             'CERFFundingGM3', 'CERFFundingGM4'],
            ['#value+cbpf+funding+total+usd',
             '#value+cbpf+funding+gmempty+total+usd',
             '#value+cbpf+funding+gm0+total+usd',
             '#value+cbpf+funding+gm1+total+usd',
             '#value+cbpf+funding+gm2+total+usd',
             '#value+cbpf+funding+gm3+total+usd',
             '#value+cbpf+funding+gm4+total+usd',
             '#value+cerf+funding+total+usd',
             '#value+cerf+funding+gmempty+total+usd',
             '#value+cerf+funding+gm0+total+usd',
             '#value+cerf+funding+gm1+total+usd',
             '#value+cerf+funding+gm2+total+usd',
             '#value+cerf+funding+gm3+total+usd',
             '#value+cerf+funding+gm4+total+usd']
        ]
        assert results['values'] == [
            {'global': 7811774.670000001}, {'global': 7811774.670000001},
            {}, {}, {}, {}, {}, {'global': 89298919.0}, {'global': 6747034.0},
            {}, {'global': 2549855.0}, {'global': 10552572.0},
            {'global': 26098816.0}, {'global': 43350642.0}
        ]
        assert results['sources'] == [
            ('#value+cbpf+funding+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gmempty+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm0+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm1+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm2+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm3+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cbpf+funding+gm4+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gmempty+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm0+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm1+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm2+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm3+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations'),
            ('#value+cerf+funding+gm4+total+usd', '2021-05-03', 'CERF and CBPF',
             'https://data.humdata.org/dataset/cerf-covid-19-allocations')
        ]
        results = run_scrapers(scraper_configuration, configuration['HRPs'],
                               adminone, level, downloader, today=today,
                               scrapers=['ourworldindata'],
                               population_lookup=population_lookup)
        assert results['headers'] == [['TotalDosesAdministered'],
                                      ['#capacity+doses+administered+total']]
        assert results['values'] == [{'global': '13413871'}]
        assert results['sources'] == [
            ('#capacity+doses+administered+total', '2020-10-01',
             'Our World in Data',
             'tests/fixtures/ourworldindata_vaccinedoses.csv')
        ]
        scraper_configuration = configuration['other']
        results = run_scrapers(scraper_configuration, configuration['HRPs'],
                               adminone, level, downloader, today=today,
                               scrapers=['ourworldindata'],
                               population_lookup=population_lookup)
        assert results['headers'] == [['TotalDosesAdministered'],
                                      ['#capacity+doses+administered+total']]
        assert results['values'] == [{'global': '1175451507'}]
        assert results['sources'] == [
            ('#capacity+doses+administered+total', '2021-05-03',
             'Our World in Data',
             'tests/fixtures/ourworldindata_vaccinedoses.csv')
        ]
def update(
    self,
    sheetname: str,
    rows: List[Dict],
    dutyofficer_name: Optional[str] = None,
) -> None:
    """Update output Google spreadsheet (which must have been set up with
    setup_gsheet). The duty officer, which is usually taken from the HDX Data
    Partnerships Team Duty Roster spreadsheet, can be overridden by supplying
    dutyofficer_name.

    Args:
        sheetname (str): Name of tab in Google spreadsheet to output to
        rows (List[Dict]): Rows to add to Google spreadsheet
        dutyofficer_name (Optional[str]): Name of duty officer. Defaults to None.

    Returns:
        None
    """
    if self.issues_spreadsheet is None or (self.dutyofficer is None and dutyofficer_name is None):
        logger.warning("Cannot update Google spreadsheet!")
        return
    logger.info("Updating Google spreadsheet.")
    sheet = self.issues_spreadsheet.worksheet(sheetname)
    gsheet_rows = sheet.get_values()
    keys = gsheet_rows[0]
    url_ind = keys.index("URL")
    if "Update Frequency" in keys:
        update_frequency_ind = keys.index("Update Frequency")
    else:
        update_frequency_ind = None
    dateadded_ind = keys.index("Date Added")
    dateoccurred_ind = keys.index("Date Last Occurred")
    no_times_ind = keys.index("No. Times")
    assigned_ind = keys.index("Assigned")
    status_ind = keys.index("Status")
    headers = gsheet_rows[0]
    gsheet_rows = [row for row in gsheet_rows[1:] if row[url_ind]]
    urls = [x[url_ind] for x in gsheet_rows]
    if update_frequency_ind is not None:
        for gsheet_row in gsheet_rows:
            updatefreq = gsheet_row[update_frequency_ind]
            gsheet_row[update_frequency_ind] = int(Dataset.transform_update_frequency(updatefreq))
    updated_notimes = set()
    now = self.now.isoformat()
    for row in rows:
        url = row["URL"]
        new_row = [row.get(key, "") for key in keys]
        new_row[dateoccurred_ind] = now
        try:
            rowno = urls.index(url)
            current_row = gsheet_rows[rowno]
            new_row[dateadded_ind] = current_row[dateadded_ind]
            no_times = current_row[no_times_ind]
            new_row[no_times_ind] = int(no_times)
            if url not in updated_notimes:
                updated_notimes.add(url)
                new_row[no_times_ind] += 1
            new_row[assigned_ind] = current_row[assigned_ind]
            new_row[status_ind] = current_row[status_ind]
            gsheet_rows[rowno] = new_row
        except ValueError:
            new_row[dateadded_ind] = now
            new_row[no_times_ind] = 1
            if dutyofficer_name is not None:
                new_row[assigned_ind] = dutyofficer_name
            else:
                new_row[assigned_ind] = self.dutyofficer["name"]
            gsheet_rows.append(new_row)
            urls.append(url)
            updated_notimes.add(url)
    if update_frequency_ind is None:
        gsheet_rows = sorted(gsheet_rows, key=lambda x: x[dateoccurred_ind], reverse=True)
    else:
        headers.append("sort")
        sort_ind = headers.index("sort")
        for gsheet_row in gsheet_rows:
            dateoccurred = gsheet_row[dateoccurred_ind]
            if dateoccurred == now:
                sort_val = 0
            else:
                nodays = self.now - parse_date(dateoccurred)
                update_freq = gsheet_row[update_frequency_ind]
                if update_freq == -1:
                    update_freq = 1000
                elif update_freq == -2:
                    update_freq = 500
                elif update_freq == 0:
                    update_freq = 0.5
                sort_val = nodays.days / update_freq
            gsheet_row.append(sort_val)
        gsheet_rows = sorted(
            gsheet_rows,
            key=lambda x: (-x[sort_ind], x[dateoccurred_ind]),
            reverse=True,
        )
    no_rows = len(gsheet_rows)
    no_rows_to_remove = no_rows - self.row_limit
    # Only trim when over the limit: slicing with a zero or negative count
    # would empty or wrongly truncate the list
    if no_rows_to_remove > 0:
        gsheet_rows = gsheet_rows[:-no_rows_to_remove]
    if update_frequency_ind is not None:
        for gsheet_row in gsheet_rows:
            update_freq = gsheet_row[update_frequency_ind]
            gsheet_row[update_frequency_ind] = HDXHelper.get_update_frequency(update_freq)
            del gsheet_row[sort_ind]
        del headers[sort_ind]
    sheet.clear()
    sheet.update("A1", [headers] + gsheet_rows)
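# Illustrative sketch (not from the source): the staleness score computed above
# when an "Update Frequency" column is present. The sentinel frequency codes
# -1, -2 and 0 are special HDX update frequencies (their exact meanings are not
# asserted here); the remapping gives them large or small effective periods so
# that rows overdue relative to their expected update cadence sort first.
def staleness_sort_val(days_since_occurred, update_freq):
    if update_freq == -1:
        update_freq = 1000
    elif update_freq == -2:
        update_freq = 500
    elif update_freq == 0:
        update_freq = 0.5
    return days_since_occurred / update_freq

# A weekly dataset 21 days stale outranks an annual one 30 days stale:
assert staleness_sort_val(21, 7) > staleness_sort_val(30, 365)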
def _get_tabular(level, name, datasetinfo, headers, iterator, population_lookup,
                 retheaders=[list(), list()], retval=list(), sources=list()):
    indicatorcols = datasetinfo.get('indicator_cols')
    if not indicatorcols:
        indicatorcols = [{
            'filter_col': datasetinfo.get('filter_col'),
            'val_cols': datasetinfo.get('val_cols', list()),
            'val_fns': datasetinfo.get('val_fns', dict()),
            'eval_cols': datasetinfo.get('eval_cols', list()),
            'keep_cols': datasetinfo.get('keep_cols', list()),
            'append_cols': datasetinfo.get('append_cols', list()),
            'total_cols': datasetinfo.get('total_cols'),
            'ignore_vals': datasetinfo.get('ignore_vals', list()),
            'columns': datasetinfo.get('columns', list()),
            'hxltags': datasetinfo.get('hxltags', list())
        }]
    use_hxl = datasetinfo.get('use_hxl', False)
    if use_hxl:
        hxlrow = next(iterator)
        while not hxlrow:
            hxlrow = next(iterator)
        exclude_tags = datasetinfo.get('exclude_tags', list())
        adm_cols = list()
        val_cols = list()
        columns = list()
        for header in headers:
            hxltag = hxlrow[header]
            if not hxltag or hxltag in exclude_tags:
                continue
            if '#country' in hxltag:
                if 'code' in hxltag:
                    if len(adm_cols) == 0:
                        adm_cols.append(hxltag)
                    else:
                        adm_cols[0] = hxltag
                continue
            if '#adm1' in hxltag:
                if 'code' in hxltag:
                    if len(adm_cols) == 0:
                        adm_cols.append(None)
                    if len(adm_cols) == 1:
                        adm_cols.append(hxltag)
                continue
            if hxltag == datasetinfo.get('date_col') and datasetinfo.get('include_date', False) is False:
                continue
            val_cols.append(hxltag)
            columns.append(header)
        datasetinfo['adm_cols'] = adm_cols
        for indicatorcol in indicatorcols:
            orig_val_cols = indicatorcol.get('val_cols', list())
            if not orig_val_cols:
                orig_val_cols.extend(val_cols)
            indicatorcol['val_cols'] = orig_val_cols
            orig_columns = indicatorcol.get('columns', list())
            if not orig_columns:
                orig_columns.extend(columns)
            indicatorcol['columns'] = orig_columns
            orig_hxltags = indicatorcol.get('hxltags', list())
            if not orig_hxltags:
                orig_hxltags.extend(val_cols)
            indicatorcol['hxltags'] = orig_hxltags
    else:
        hxlrow = None
    rowparser = RowParser(level, datasetinfo, headers, indicatorcols)
    valuedicts = dict()
    for indicatorcol in indicatorcols:
        for _ in indicatorcol['val_cols']:
            dict_of_lists_add(valuedicts, indicatorcol['filter_col'], dict())

    def add_row(row):
        adm, indicators_process = rowparser.do_set_value(row, name)
        if not adm:
            return
        for i, indicatorcol in enumerate(indicatorcols):
            if not indicators_process[i]:
                continue
            filtercol = indicatorcol['filter_col']
            total_cols = indicatorcol.get('total_cols')
            eval_cols = indicatorcol.get('eval_cols')
            append_cols = indicatorcol.get('append_cols', list())
            keep_cols = indicatorcol.get('keep_cols', list())
            for i, valcol in enumerate(indicatorcol['val_cols']):
                valuedict = valuedicts[filtercol][i]
                val = get_rowval(row, valcol)
                if total_cols or eval_cols:
                    dict_of_lists_add(valuedict, adm, val)
                else:
                    curval = valuedict.get(adm)
                    if valcol in append_cols:
                        if curval:
                            val = curval + val
                    elif valcol in keep_cols:
                        if curval:
                            val = curval
                    valuedict[adm] = val

    stop_row = datasetinfo.get('stop_row')
    for row in iterator:
        if not isinstance(row, dict):
            row = row.value
        if hxlrow:
            newrow = dict()
            for header in row:
                newrow[hxlrow[header]] = row[header]
            row = newrow
        if stop_row:
            if all(row[key] == value for key, value in stop_row.items()):
                break
        for newrow in rowparser.flatten(row):
            add_row(newrow)
    date = datasetinfo.get('date')
    use_date_from_date_col = datasetinfo.get('use_date_from_date_col', False)
    if date and not use_date_from_date_col:
        date = parse_date(date)
    else:
        date = rowparser.get_maxdate()
        if date == 0:
            raise ValueError('No date given in datasetinfo or as a column!')
        if rowparser.datetype == 'date':
            if not isinstance(date, datetime):
                date = parse_date(date)
        elif rowparser.datetype == 'int':
            date = get_datetime_from_timestamp(date)
        else:
            raise ValueError('No date type specified!')
    date = date.strftime('%Y-%m-%d')
    for indicatorcol in indicatorcols:
        retheaders[0].extend(indicatorcol['columns'])
        hxltags = indicatorcol['hxltags']
        retheaders[1].extend(hxltags)
        valdicts = valuedicts[indicatorcol['filter_col']]
        eval_cols = indicatorcol.get('eval_cols')
        keep_cols = indicatorcol.get('keep_cols', list())
        total_cols = indicatorcol.get('total_cols')
        ignore_vals = indicatorcol.get('ignore_vals', list())
        val_fns = indicatorcol.get('val_fns', dict())
        valcols = indicatorcol['val_cols']
        # Indices of list sorted by length
        sorted_len_indices = sorted(range(len(valcols)),
                                    key=lambda k: len(valcols[k]), reverse=True)
        if eval_cols:
            newvaldicts = [dict() for _ in eval_cols]

            def text_replacement(string, adm):
                string = string.replace('#population', '#pzbgvjh')
                hasvalues = False
                for j in sorted_len_indices:
                    valcol = valcols[j]
                    if valcol not in string:
                        continue
                    if valcol in keep_cols:
                        keep_col_index = 0
                    else:
                        keep_col_index = -1
                    val = valdicts[j][adm][keep_col_index]
                    if not val or val in ignore_vals:
                        val = 0
                    else:
                        val_fn = val_fns.get(valcol)
                        if val_fn:
                            val = eval(val_fn.replace(valcol, 'val'))
                        hasvalues = True
                    string = string.replace(valcol, str(val))
                string = string.replace('#pzbgvjh', '#population')
                return string, hasvalues

            for i, eval_col in enumerate(eval_cols):
                valdict0 = valdicts[0]
                for adm in valdict0:
                    hasvalues = True
                    matches = regex.search(brackets, eval_col, flags=regex.VERBOSE)
                    if matches:
                        for bracketed_str in matches.captures('rec'):
                            if any(bracketed_str in x for x in valcols):
                                continue
                            _, hasvalues_t = text_replacement(bracketed_str, adm)
                            if not hasvalues_t:
                                hasvalues = False
                                break
                    if hasvalues:
                        formula, hasvalues_t = text_replacement(eval_col, adm)
                        if hasvalues_t:
                            formula = formula.replace('#population',
                                                      'population_lookup[adm]')
                            newvaldicts[i][adm] = eval(formula)
                        else:
                            newvaldicts[i][adm] = ''
                    else:
                        newvaldicts[i][adm] = ''
            retval.extend(newvaldicts)
        elif total_cols:
            for total_col in total_cols:
                formula = total_col['formula']
                mustbepopulated = total_col.get('mustbepopulated', False)
                newvaldicts = [dict() for _ in valdicts]
                valdict0 = valdicts[0]
                for adm in valdict0:
                    for i, val in enumerate(valdict0[adm]):
                        if not val or val in ignore_vals:
                            exists = False
                        else:
                            exists = True
                            for valdict in valdicts[1:]:
                                val = valdict[adm][i]
                                if not val or val in ignore_vals:
                                    exists = False
                                    break
                        if mustbepopulated and not exists:
                            continue
                        for j, valdict in enumerate(valdicts):
                            valcol = valcols[j]
                            val_fn = val_fns.get(valcol)
                            if not val_fn:
                                val_fn = valcol
                            newvaldicts[j][adm] = newvaldicts[j].get(adm, 0.0) + eval(
                                val_fn.replace(valcol,
                                               'get_numeric_if_possible(valdict[adm][i])'))
                formula = formula.replace('#population', '#pzbgvjh')
                for i in sorted_len_indices:
                    formula = formula.replace(valcols[i], 'newvaldicts[%d][adm]' % i)
                formula = formula.replace('#pzbgvjh', 'population_lookup[adm]')
                newvaldict = dict()
                for adm in valdicts[0].keys():
                    try:
                        val = eval(formula)
                    except (ValueError, TypeError, KeyError):
                        val = ''
                    newvaldict[adm] = val
                retval.append(newvaldict)
        else:
            retval.extend(valdicts)
        sources.extend([(hxltag, date, datasetinfo['source'], datasetinfo['source_url'])
                        for hxltag in hxltags])
    logger.info('Processed %s' % name)
    return retheaders, retval, sources
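# Illustrative sketch (not from the source): the token-substitution-and-eval
# idea behind text_replacement above. Column tags in a formula string are
# replaced with per-admin values, longest tag first so that overlapping names
# like '#affected' and '#affected+f' cannot clobber each other, and the
# resulting arithmetic string is then eval'd.
def substitute_and_eval(formula, values):
    # values maps column tag -> numeric value for one admin unit
    for col in sorted(values, key=len, reverse=True):
        formula = formula.replace(col, str(values[col]))
    return eval(formula)

# substitute_and_eval('#affected+f / #affected',
#                     {'#affected': 200, '#affected+f': 50})
# -> 0.25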
def get_education(configuration, today, countryiso3s, regionlookup, downloader, scrapers=None):
    name = 'education'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list(), list()
    educationinfo = configuration[name]
    datasetinfo = educationinfo['closures']
    closures_headers, closures_iterator = read(downloader, datasetinfo)
    closures = dict()
    country_dates = dict()
    for row in closures_iterator:
        countryiso = row['ISO']
        if not countryiso or countryiso not in countryiso3s:
            continue
        date = row['Date']
        if isinstance(date, str):
            date = parse_date(date)
        if date > today:
            continue
        max_date = country_dates.get(countryiso, default_date)
        if date < max_date:
            continue
        country_dates[countryiso] = date
        closures[countryiso] = row['Status']
    fully_closed = list()
    for countryiso, closure in closures.items():
        if closure.lower() == 'closed due to covid-19':
            fully_closed.append(countryiso)
    datasetinfo = educationinfo['enrolment']
    learners_headers, learners_iterator = read(downloader, datasetinfo)
    learners_012 = dict()
    learners_3 = dict()
    affected_learners = dict()
    all_learners = dict()
    for row in learners_iterator:
        countryiso = row['ISO3']
        if not countryiso or countryiso not in countryiso3s:
            continue
        l_0 = row['Pre-primary (both)']
        l_1 = row['Primary (both)']
        l_2 = row['Secondary (both)']
        l_3 = row['Tertiary (both)']
        l_012 = None
        if l_0 != '-':
            l_012 = int(l_0)
        if l_1 != '-':
            l_1 = int(l_1)
            if l_012 is None:
                l_012 = l_1
            else:
                l_012 += l_1
        if l_2 != '-':
            l_2 = int(l_2)
            if l_012 is None:
                l_012 = l_2
            else:
                l_012 += l_2
        if l_012 is not None:
            learners_012[countryiso] = l_012
        if l_3 == '-':
            l_3 = None
        else:
            l_3 = int(l_3)
            learners_3[countryiso] = l_3
        no_learners = None
        if l_012 is not None:
            no_learners = l_012
            if l_3:
                no_learners += l_3
        elif l_3 is not None:
            no_learners = l_3
        if no_learners is not None:
            all_learners[countryiso] = no_learners
            if countryiso in fully_closed:
                affected_learners[countryiso] = no_learners
    affected_learners_total = dict()
    learners_total = dict()
    closed_countries = dict()
    for countryiso in closures:
        country_learners = all_learners.get(countryiso)
        country_affected_learners = affected_learners.get(countryiso)
        for region in regionlookup.iso3_to_region_and_hrp[countryiso]:
            if country_learners is not None:
                learners_total[region] = learners_total.get(region, 0) + country_learners
            if country_affected_learners is not None:
                affected_learners_total[region] = affected_learners_total.get(region, 0) + country_affected_learners
                closed_countries[region] = closed_countries.get(region, 0) + 1
    percentage_affected_learners = dict()
    for region, no_learners in affected_learners_total.items():
        percentage_affected_learners[region] = get_fraction_str(no_learners, learners_total[region])
    logger.info('Processed education')
    grheaders = ['No. affected learners', 'Percentage affected learners',
                 'No. closed countries']
    grhxltags = ['#affected+learners', '#affected+learners+pct',
                 '#status+country+closed']
    headers = ['School Closure', 'No. pre-primary to upper-secondary learners',
               'No. tertiary learners', 'No. affected learners']
    hxltags = ['#impact+type', '#population+learners+pre_primary_to_secondary',
               '#population+learners+tertiary', '#affected+learners']
    return [grheaders, grhxltags], \
           [affected_learners_total, percentage_affected_learners, closed_countries], \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
           [headers, hxltags], [closures, learners_012, learners_3, affected_learners], \
           [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
def _run_scraper(countryiso3s, adminone, level, today, name, datasetinfo, headers, iterator, population_lookup, results): # type: (List[str], AdminOne, str, datetime, str, Dict, List[str], Iterator[Union[List,Dict]], Dict[str,int], Dict) -> None """Run one mini scraper. Args: countryiso3s (List[str]): List of ISO3 country codes to process adminone (AdminOne): AdminOne object from HDX Python Country library that handles processing of admin level 1 level (str): Can be global, national or subnational today (datetime): Value to use for today. Defaults to None (datetime.now()). name (str): Name of mini scraper datasetinfo (Dict): Dictionary of information about dataset headers (List[str]): Row headers iterator (Iterator[Union[List,Dict]]): Rows population_lookup (Dict[str,int]): Dictionary from admin code to population results (Dict): Dictionary of output containing output headers, values and sources Returns: Tuple[Optional[str], Optional[List[bool]]]: (admin name, should process subset list) or (None, None) """ subsets = datasetinfo.get('subsets') if not subsets: subsets = [{ 'filter': datasetinfo.get('filter'), 'input_cols': datasetinfo.get('input_cols', list()), 'input_transforms': datasetinfo.get('input_transforms', dict()), 'process_cols': datasetinfo.get('process_cols', list()), 'input_keep': datasetinfo.get('input_keep', list()), 'input_append': datasetinfo.get('input_append', list()), 'sum_cols': datasetinfo.get('sum_cols'), 'input_ignore_vals': datasetinfo.get('input_ignore_vals', list()), 'output_cols': datasetinfo.get('output_cols', list()), 'output_hxltags': datasetinfo.get('output_hxltags', list()) }] use_hxl = datasetinfo.get('use_hxl', False) if use_hxl: hxlrow = next(iterator) while not hxlrow: hxlrow = next(iterator) exclude_tags = datasetinfo.get('exclude_tags', list()) adm_cols = list() input_cols = list() columns = list() for header in headers: hxltag = hxlrow[header] if not hxltag or hxltag in exclude_tags: continue if '#country' in hxltag: if 'code' in hxltag: if len(adm_cols) == 0: adm_cols.append(hxltag) else: adm_cols[0] = hxltag continue if '#adm1' in hxltag: if 'code' in hxltag: if len(adm_cols) == 0: adm_cols.append(None) if len(adm_cols) == 1: adm_cols.append(hxltag) continue if hxltag == datasetinfo.get('date_col') and datasetinfo.get( 'include_date', False) is False: continue input_cols.append(hxltag) columns.append(header) datasetinfo['adm_cols'] = adm_cols for subset in subsets: orig_input_cols = subset.get('input_cols', list()) if not orig_input_cols: orig_input_cols.extend(input_cols) subset['input_cols'] = orig_input_cols orig_columns = subset.get('output_cols', list()) if not orig_columns: orig_columns.extend(columns) subset['output_cols'] = orig_columns orig_hxltags = subset.get('output_hxltags', list()) if not orig_hxltags: orig_hxltags.extend(input_cols) subset['output_hxltags'] = orig_hxltags else: hxlrow = None rowparser = RowParser(countryiso3s, adminone, level, today, datasetinfo, headers, subsets) iterator = rowparser.sort_rows(iterator, hxlrow) valuedicts = dict() for subset in subsets: for _ in subset['input_cols']: dict_of_lists_add(valuedicts, subset['filter'], dict()) def add_row(row): adm, should_process_subset = rowparser.parse(row, name) if not adm: return for i, subset in enumerate(subsets): if not should_process_subset[i]: continue filter = subset['filter'] input_ignore_vals = subset.get('input_ignore_vals', list()) input_transforms = subset.get('input_transforms', dict()) sum_cols = subset.get('sum_cols') process_cols = 
            input_append = subset.get('input_append', list())
            input_keep = subset.get('input_keep', list())
            for i, valcol in enumerate(subset['input_cols']):
                valuedict = valuedicts[filter][i]
                val = get_rowval(row, valcol)
                input_transform = input_transforms.get(valcol)
                if input_transform and val not in input_ignore_vals:
                    val = eval(input_transform.replace(valcol, 'val'))
                if sum_cols or process_cols:
                    # Accumulate all values per admin unit for later summing
                    # or formula processing
                    dict_of_lists_add(valuedict, adm, val)
                else:
                    curval = valuedict.get(adm)
                    if valcol in input_append:
                        if curval:
                            val = curval + val
                    elif valcol in input_keep:
                        if curval:
                            val = curval
                    valuedict[adm] = val

    stop_row = datasetinfo.get('stop_row')
    for row in iterator:
        if not isinstance(row, dict):
            row = row.value
        if hxlrow:
            # Re-key the row by HXL tag rather than header name
            newrow = dict()
            for header in row:
                newrow[hxlrow[header]] = row[header]
            row = newrow
        if stop_row:
            if all(row[key] == value for key, value in stop_row.items()):
                break
        for newrow in rowparser.flatten(row):
            add_row(newrow)

    date = datasetinfo.get('date')
    use_date_from_date_col = datasetinfo.get('use_date_from_date_col', False)
    if date and not use_date_from_date_col:
        date = parse_date(date)
    else:
        date = rowparser.get_maxdate()
        if date == 0:
            raise ValueError('No date given in datasetinfo or as a column!')
        if rowparser.datetype == 'date':
            if not isinstance(date, datetime):
                date = parse_date(date)
        elif rowparser.datetype == 'int':
            date = get_datetime_from_timestamp(date)
        else:
            raise ValueError('No date type specified!')
    date = date.strftime('%Y-%m-%d')

    retheaders = results['headers']
    retvalues = results['values']
    sources = results['sources']
    for subset in subsets:
        output_cols = subset['output_cols']
        retheaders[0].extend(output_cols)
        output_hxltags = subset['output_hxltags']
        retheaders[1].extend(output_hxltags)
        valdicts = valuedicts[subset['filter']]
        process_cols = subset.get('process_cols')
        input_keep = subset.get('input_keep', list())
        sum_cols = subset.get('sum_cols')
        input_ignore_vals = subset.get('input_ignore_vals', list())
        valcols = subset['input_cols']
        # Indices of input columns sorted by descending name length so that
        # longer column names are substituted into formulas first
        sorted_len_indices = sorted(range(len(valcols)), key=lambda k: len(valcols[k]), reverse=True)
        if process_cols:
            newvaldicts = [dict() for _ in process_cols]

            def text_replacement(string, adm):
                # Protect '#population' from substitution while replacing
                # input column names with their values
                string = string.replace('#population', '#pzbgvjh')
                hasvalues = False
                for j in sorted_len_indices:
                    valcol = valcols[j]
                    if valcol not in string:
                        continue
                    if valcol in input_keep:
                        input_keep_index = 0
                    else:
                        input_keep_index = -1
                    val = valdicts[j][adm][input_keep_index]
                    if val is None or val == '' or val in input_ignore_vals:
                        val = 0
                    else:
                        hasvalues = True
                    string = string.replace(valcol, str(val))
                string = string.replace('#pzbgvjh', '#population')
                return string, hasvalues

            for i, process_col in enumerate(process_cols):
                valdict0 = valdicts[0]
                for adm in valdict0:
                    hasvalues = True
                    matches = regex.search(brackets, process_col, flags=regex.VERBOSE)
                    if matches:
                        for bracketed_str in matches.captures('rec'):
                            if any(bracketed_str in x for x in valcols):
                                continue
                            _, hasvalues_t = text_replacement(bracketed_str, adm)
                            if not hasvalues_t:
                                hasvalues = False
                                break
                    if hasvalues:
                        formula, hasvalues_t = text_replacement(process_col, adm)
                        if hasvalues_t:
                            formula = formula.replace('#population', 'population_lookup[adm]')
                            newvaldicts[i][adm] = eval(formula)
                        else:
                            newvaldicts[i][adm] = ''
                    else:
                        newvaldicts[i][adm] = ''
            retvalues.extend(newvaldicts)
        elif sum_cols:
            for sum_col in sum_cols:
                formula = sum_col['formula']
                mustbepopulated = sum_col.get('mustbepopulated', False)
                newvaldicts = [dict() for _ in valdicts]
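                # When mustbepopulated is set, an admin unit's row i only
                # contributes if every input column has a usable value at i.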
                valdict0 = valdicts[0]
                for adm in valdict0:
                    for i, val in enumerate(valdict0[adm]):
                        if not val or val in input_ignore_vals:
                            exists = False
                        else:
                            exists = True
                            for valdict in valdicts[1:]:
                                val = valdict[adm][i]
                                if val is None or val == '' or val in input_ignore_vals:
                                    exists = False
                                    break
                        if mustbepopulated and not exists:
                            continue
                        for j, valdict in enumerate(valdicts):
                            val = valdict[adm][i]
                            if val is None or val == '' or val in input_ignore_vals:
                                continue
                            newvaldicts[j][adm] = eval(f'newvaldicts[j].get(adm, 0.0) + {str(valdict[adm][i])}')
                formula = formula.replace('#population', '#pzbgvjh')
                for i in sorted_len_indices:
                    formula = formula.replace(valcols[i], 'newvaldicts[%d][adm]' % i)
                formula = formula.replace('#pzbgvjh', 'population_lookup[adm]')
                newvaldict = dict()
                for adm in valdicts[0].keys():
                    try:
                        val = eval(formula)
                    except (ValueError, TypeError, KeyError):
                        val = ''
                    newvaldict[adm] = val
                retvalues.append(newvaldict)
        else:
            retvalues.extend(valdicts)
        source = datasetinfo['source']
        if isinstance(source, str):
            source = {'default_source': source}
        source_url = datasetinfo['source_url']
        if isinstance(source_url, str):
            source_url = {'default_url': source_url}
        sources.extend([(hxltag, date, source.get(hxltag, source['default_source']),
                         source_url.get(hxltag, source_url['default_url']))
                        for hxltag in output_hxltags])
    logger.info('Processed %s' % name)
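
# A hypothetical datasetinfo dict, illustrating the keys _run_scraper reads
# above; the source name, URL, values and HXL tags are invented.
# 'input_transforms' expressions are written in terms of the input column and
# eval'd with the cell value bound to `val`; 'sum_cols' formulas reference
# input columns by HXL tag and may use '#population', which resolves through
# population_lookup.
example_datasetinfo = {
    'source': 'Example Org',
    'source_url': 'https://example.org/data',
    'date': '2020-09-01',
    'use_hxl': True,
    'subsets': [{
        'filter': 'cases',
        'input_cols': ['#affected+infected'],
        'input_transforms': {'#affected+infected': 'int(#affected+infected)'},
        'sum_cols': [{'formula': '#affected+infected', 'mustbepopulated': False},
                     {'formula': '#affected+infected / #population', 'mustbepopulated': True}],
        'output_cols': ['Cases', 'CasesPerPopulation'],
        'output_hxltags': ['#affected+infected', '#affected+infected+per_population'],
    }],
}
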
def main():
    configuration = Configuration.read()
    enddays = configuration['enddays']
    ignore_users = configuration['ignore_users']
    users_scrapers = configuration['users_scrapers']
    spreadsheet_url = configuration['spreadsheet_url']
    sheetname = configuration['sheetname']
    logger.info('> GSheet Credentials: %s' % gsheet_auth)
    users = dict()
    info = json.loads(gsheet_auth)
    scopes = ['https://www.googleapis.com/auth/spreadsheets']
    credentials = service_account.Credentials.from_service_account_info(info, scopes=scopes)
    gc = pygsheets.authorize(custom_credentials=credentials)
    spreadsheet = gc.open_by_url(spreadsheet_url)
    sheet = spreadsheet.worksheet_by_title(sheetname)
    keys = sheet.get_row(1)
    rows = [keys]
    crisisdata = configuration['crisisdata']
    for crisis in crisisdata:
        data = crisisdata[crisis]
        startdate = parse_date(data['startdate'])
        enddate = startdate + timedelta(days=enddays)
        searchlist = list()
        for country in data.get('countries', list()):
            iso3, _ = Country.get_iso3_country_code_fuzzy(country)
            searchlist.append('groups:%s' % iso3.lower())
        for tag in data.get('tags', list()):
            searchlist.append('vocab_Topics:"%s"' % tag.lower())
        search_string = 'metadata_created:[2000-01-01T00:00:00.000Z TO %sZ] AND (%s)' % (enddate.isoformat(), ' OR '.join(searchlist))
        datasets = Dataset.search_in_hdx(fq=search_string)
        row = {'ID': data['id'], 'Crisis name': crisis}
        count = 0
        largest_activities = 0
        for dataset in datasets:
            metadata_created_str = dataset['metadata_created']
            orgname = dataset['organization']['name']
            metadata_created = parse_date(metadata_created_str)
            new_or_updated = 'new'
            updated_when = ''
            updated_by = ''
            # if metadata_created < startdate:
            #     activities = Activity.get_all_activities(id=dataset['id'], limit=10000)
            #     activities_len = len(activities)
            #     if activities_len > largest_activities:
            #         largest_activities = activities_len
            #     found = False
            #     for activity in activities:
            #         timestamp = activity['timestamp']
            #         activity_date = parse_date(timestamp)
            #         if startdate < activity_date < enddate:
            #             new_or_updated = 'updated'
            #             updated_when = timestamp
            #             user_id = activity['user_id']
            #             check_ignore = True
            #             for user_scrapers in users_scrapers:
            #                 if user_id == user_scrapers['id']:
            #                     if orgname in user_scrapers['scrapers']:
            #                         check_ignore = False
            #                     break
            #             if check_ignore:
            #                 if user_id in ignore_users:
            #                     continue
            #             username = users.get(user_id)
            #             if username is None:
            #                 user = User.read_from_hdx(user_id)
            #                 username = get_user_name(user)
            #                 users[user_id] = username
            #             updated_by = username
            #             found = True
            #             break
            #     if not found:
            #         continue
            row['dataset title'] = dataset['title']
            row['dataset id'] = dataset['id']
            row['dataset url'] = dataset.get_hdx_url()
            row['org name'] = orgname
            row['org id'] = dataset['organization']['id']
            row['created'] = metadata_created_str
            row['new or updated'] = new_or_updated
            row['updated when'] = updated_when
            row['updated by'] = updated_by
            rows.append([row.get(key, '') for key in keys])
            count += 1
        logger.info('%s: %d\t%s' % (crisis, count, search_string))
    sheet.clear()
    sheet.update_values('A1', rows)
    logger.info('Longest activities: %d' % largest_activities)
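
from datetime import timedelta
from hdx.utilities.dateparse import parse_date

# Worked example (invented crisis data) of the search string main() builds:
# datasets created on or before startdate + enddays that match any of the
# listed country groups or topic tags.
startdate = parse_date('2010-01-12')
enddate = startdate + timedelta(days=30)
searchlist = ['groups:hti', 'vocab_Topics:"earthquake"']
search_string = 'metadata_created:[2000-01-01T00:00:00.000Z TO %sZ] AND (%s)' % (
    enddate.isoformat(), ' OR '.join(searchlist))
# -> metadata_created:[2000-01-01T00:00:00.000Z TO 2010-02-11T00:00:00Z]
#    AND (groups:hti OR vocab_Topics:"earthquake")
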
def do_set_value(self, row, scrapername=None):
    if self.filtered(row):
        return None, None
    adms = [None for _ in range(len(self.admcols))]

    def get_adm(admcol, i):
        match = template.search(admcol)
        if match:
            template_string = match.group()
            admcol = self.headers[int(template_string[2:-2])]
        adm = row[admcol]
        if not adm:
            return False
        adms[i] = row[admcol].strip()
        return self.admininfo.get_adm(adms, self.admexact, i, scrapername)

    for i, admcol in enumerate(self.admcols):
        if admcol is None:
            continue
        if isinstance(admcol, str):
            admcol = [admcol]
        for admcl in admcol:
            exact = get_adm(admcl, i)
            if adms[i] and exact:
                break
        if not adms[i]:
            return None, None

    indicators_process = list()
    for indicatorcol in self.indicatorcols:
        filtercol = indicatorcol['filter_col']
        process = True
        if filtercol:
            filtercols = filtercol.split('|')
            match = True
            for filterstr in filtercols:
                filter = filterstr.split('=')
                if row[filter[0]] != filter[1]:
                    match = False
                    break
            process = match
        indicators_process.append(process)

    if self.datecol:
        if isinstance(self.datecol, list):
            dates = [str(row[x]) for x in self.datecol]
            date = ''.join(dates)
        else:
            date = row[self.datecol]
        if self.datetype == 'date':
            if not isinstance(date, datetime):
                date = parse_date(date)
            date = date.replace(tzinfo=None)
        else:
            date = int(date)
        if self.date_condition:
            if eval(self.date_condition) is False:
                return None, None
        for i, process in enumerate(indicators_process):
            if not process:
                continue
            if date > self.maxdate:
                self.maxdate = date
            if self.level is None:
                if self.maxdateonly:
                    if date < self.maxdates[i]:
                        indicators_process[i] = False
                    else:
                        self.maxdates[i] = date
                else:
                    self.maxdates[i] = date
            else:
                if self.maxdateonly:
                    if date < self.maxdates[i][adms[self.level]]:
                        indicators_process[i] = False
                    else:
                        self.maxdates[i][adms[self.level]] = date
                else:
                    self.maxdates[i][adms[self.level]] = date
    if self.level is None:
        return 'global', indicators_process
    return adms[self.level], indicators_process
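
import re

# Standalone sketch of the {{n}} placeholder resolution in get_adm() above.
# The actual `template` pattern is defined elsewhere in the source; the one
# below is an assumption consistent with the [2:-2] slicing used there.
template = re.compile(r'\{\{.*?\}\}')
headers = ['Country ISO3', 'Region', 'Value']
row = {'Country ISO3': ' AFG ', 'Region': 'ME', 'Value': '10'}

admcol = '{{0}}'  # index 0 refers to headers[0], i.e. 'Country ISO3'
match = template.search(admcol)
if match:
    template_string = match.group()
    admcol = headers[int(template_string[2:-2])]  # '{{0}}'[2:-2] -> '0'
print(admcol, row[admcol].strip())  # prints: Country ISO3 AFG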