def add_additional_json(self, downloader, today=None):
    # type: (Download, Optional[datetime]) -> None
    """Download JSON files and add them under keys defined in the configuration

    Args:
        downloader (Download): Download object for downloading JSON
        today (Optional[datetime]): Value to use for today. Defaults to None (datetime.now()).

    Returns:
        None
    """
    for datasetinfo in self.json_configuration.get('additional_json', list()):
        headers, iterator = read(downloader, datasetinfo, today=today)
        hxlrow = next(iterator)
        # Rows may be plain dicts or wrapped objects exposing the dict via .value
        if not isinstance(hxlrow, dict):
            hxlrow = hxlrow.value
        name = datasetinfo['name']
        for row in iterator:
            newrow = dict()
            if not isinstance(row, dict):
                row = row.value
            # Re-key each value by its HXL tag, skipping untagged columns
            for key in row:
                hxltag = hxlrow[key]
                if hxltag != '':
                    newrow[hxltag] = row[key]
            self.add_data_row(name, newrow)
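# A minimal usage sketch for add_additional_json, assuming jsonout is an
# instance of the class above and that its JSON configuration contains a
# hypothetical 'additional_json' entry such as:
#     {'name': 'example_data', 'format': 'json',
#      'url': 'https://example.com/data.json'}
# The name and url are illustrative, not taken from a real configuration.
def add_additional_json_example(jsonout):
    with Download(user_agent='example') as downloader:
        jsonout.add_additional_json(downloader, today=datetime.now())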
def get_covax_deliveries(configuration, today, countryiso3s, downloader, scrapers=None):
    name = 'covax_deliveries'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    headers, iterator = read(downloader, datasetinfo, today=today)
    hxlrow = next(iterator)
    doses_lookup = dict()
    for row in iterator:
        newrow = dict()
        for key in row:
            newrow[hxlrow[key]] = row[key]
        countryiso = newrow['#country+code']
        if not countryiso or countryiso not in countryiso3s:
            continue
        # Accumulate doses per unique (country, pipeline, producer, funder) combination
        key = f'{countryiso}|{newrow["#meta+vaccine+pipeline"]}|{newrow["#meta+vaccine+producer"]}|{newrow["#meta+vaccine+funder"]}'
        nodoses = get_numeric_if_possible(newrow['#capacity+vaccine+doses'])
        if nodoses:
            doses_lookup[key] = doses_lookup.get(key, 0) + nodoses
    pipelines = dict()
    producers = dict()
    funders = dict()
    doses = dict()
    for key in sorted(doses_lookup):
        countryiso, pipeline, producer, funder = key.split('|')
        dict_of_lists_add(pipelines, countryiso, pipeline)
        dict_of_lists_add(producers, countryiso, producer)
        dict_of_lists_add(funders, countryiso, funder)
        dict_of_lists_add(doses, countryiso, str(doses_lookup[key]))
    # Collapse each country's lists into single |-separated strings for output
    for countryiso in pipelines:
        pipelines[countryiso] = '|'.join(pipelines[countryiso])
        producers[countryiso] = '|'.join(producers[countryiso])
        funders[countryiso] = '|'.join(funders[countryiso])
        doses[countryiso] = '|'.join(doses[countryiso])
    logger.info('Processed covax deliveries')
    hxltags = ['#meta+vaccine+pipeline', '#meta+vaccine+producer', '#meta+vaccine+funder', '#capacity+vaccine+doses']
    return [['Pipeline', 'Vaccine', 'Funder', 'Doses'], hxltags], \
        [pipelines, producers, funders, doses], \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
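# dict_of_lists_add above comes from HDX Python Utilities; a simplified sketch
# of its behaviour for reference (the real helper may differ in detail):
def dict_of_lists_add_sketch(dictionary, key, value):
    # Append value to the list stored under key, creating the list if needed
    list_objs = dictionary.get(key, list())
    list_objs.append(value)
    dictionary[key] = list_objs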
def test_read(self, configuration):
    url = get_url('http://{{var}}', var='hello')
    assert url == 'http://hello'
    with Download(user_agent='test') as downloader:
        datasetinfo = {
            'name': 'test',
            'dataset': 'sahel-humanitarian-needs-overview',
            'format': 'csv'
        }
        headers, iterator = read(downloader, datasetinfo, a='b')
        assert headers == [
            'Country', 'nutrition', 'Affected 2017', 'In Need 2017',
            'Targeted 2017', '% targeted'
        ]
        assert next(iterator) == {
            'Country': '#country',
            'nutrition': '#sector?',
            'Affected 2017': '#affected',
            'In Need 2017': '#inneed',
            'Targeted 2017': '#targeted',
            '% targeted': '#targeted+percentage'
        }
        assert next(iterator) == {
            'Country': 'Burkina Faso',
            'nutrition': 'MAM',
            'Affected 2017': '433,412',
            'In Need 2017': '433,412',
            'Targeted 2017': ' _',
            '% targeted': '0'
        }
        assert datasetinfo == {
            'name': 'test',
            'dataset': 'sahel-humanitarian-needs-overview',
            'format': 'csv',
            'headers': 1,
            'date': '2016-09-01',
            'source': 'Multiple organisations',
            'source_url': 'https://data.humdata.org/dataset/sahel-humanitarian-needs-overview',
            'url': 'https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3/download/hno-2017-sahel-nutrition.csv'
        }
        datasetinfo = {
            'name': 'test',
            'dataset': 'sahel-humanitarian-needs-overview',
            'resource': 'HNO-2017-Sahel- People in need.xlsx',
            'format': 'xlsx',
            'sheet': 1
        }
        headers, iterator = read(downloader, datasetinfo)
        assert headers == [
            'Country', 'Sector', 'People in need', 'Total population'
        ]
        assert next(iterator) == {
            'Country': '#country',
            'Sector': '#sector',
            'People in need': '#inneed',
            'Total population': '#total'
        }
        assert next(iterator) == {
            'Country': 'Mali',
            'Sector': 'Shelter/NFI',
            'People in need': 317000,
            'Total population': 100000
        }
        assert datasetinfo == {
            'name': 'test',
            'dataset': 'sahel-humanitarian-needs-overview',
            'resource': 'HNO-2017-Sahel- People in need.xlsx',
            'format': 'xlsx',
            'sheet': 1,
            'headers': 1,
            'date': '2016-09-01',
            'source': 'Multiple organisations',
            'source_url': 'https://data.humdata.org/dataset/sahel-humanitarian-needs-overview',
            'url': 'https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/d9248be4-7bfb-4a81-a7aa-c035dcb737a2/download/hno-2017-sahel-people-in-need.xlsx'
        }
        with pytest.raises(ValueError):
            datasetinfo = {'name': 'test', 'format': 'unknown'}
            read(downloader, datasetinfo)
        with pytest.raises(ValueError):
            datasetinfo = {
                'name': 'test',
                'dataset': 'sahel-humanitarian-needs-overview',
                'format': 'json'
            }
            read(downloader, datasetinfo)
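# The assertions above pin down read()'s contract: HDX dataset lookups fill
# url/date/source/source_url into datasetinfo, and unsupported formats (or
# json without a url) raise ValueError. A simplified sketch of that dispatch,
# using hypothetical read_hdx/read_tabular/read_json helpers rather than the
# library's actual implementation:
def read_dispatch_sketch(downloader, datasetinfo, **kwargs):
    format = datasetinfo['format']
    if format in ('csv', 'xls', 'xlsx'):
        if 'dataset' in datasetinfo:
            # Resolve the HDX dataset/resource into url, date and source metadata
            return read_hdx(downloader, datasetinfo, **kwargs)
        return read_tabular(downloader, datasetinfo, **kwargs)
    if format == 'json':
        if 'url' not in datasetinfo:
            raise ValueError('No url given for JSON!')
        return read_json(downloader, datasetinfo, **kwargs)
    raise ValueError(f"Unknown format {format} in {datasetinfo['name']}!")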
def get_education(configuration, today, countryiso3s, regionlookup, downloader, scrapers=None):
    name = 'education'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list(), list()
    educationinfo = configuration[name]
    datasetinfo = educationinfo['closures']
    closures_headers, closures_iterator = read(downloader, datasetinfo)
    closures = dict()
    country_dates = dict()
    # Keep only the most recent closure status per country up to today
    for row in closures_iterator:
        countryiso = row['ISO']
        if not countryiso or countryiso not in countryiso3s:
            continue
        date = row['Date']
        if isinstance(date, str):
            date = parse_date(date)
        if date > today:
            continue
        max_date = country_dates.get(countryiso, default_date)
        if date < max_date:
            continue
        country_dates[countryiso] = date
        closures[countryiso] = row['Status']
    fully_closed = list()
    for countryiso, closure in closures.items():
        if closure.lower() == 'closed due to covid-19':
            fully_closed.append(countryiso)
    datasetinfo = educationinfo['enrolment']
    learners_headers, learners_iterator = read(downloader, datasetinfo)
    learners_012 = dict()
    learners_3 = dict()
    affected_learners = dict()
    all_learners = dict()
    for row in learners_iterator:
        countryiso = row['ISO3']
        if not countryiso or countryiso not in countryiso3s:
            continue
        l_0 = row['Pre-primary (both)']
        l_1 = row['Primary (both)']
        l_2 = row['Secondary (both)']
        l_3 = row['Tertiary (both)']
        # '-' denotes missing data; sum pre-primary to secondary where available
        l_012 = None
        if l_0 != '-':
            l_012 = int(l_0)
        if l_1 != '-':
            l_1 = int(l_1)
            if l_012 is None:
                l_012 = l_1
            else:
                l_012 += l_1
        if l_2 != '-':
            l_2 = int(l_2)
            if l_012 is None:
                l_012 = l_2
            else:
                l_012 += l_2
        if l_012 is not None:
            learners_012[countryiso] = l_012
        if l_3 == '-':
            l_3 = None
        else:
            l_3 = int(l_3)
            learners_3[countryiso] = l_3
        no_learners = None
        if l_012 is not None:
            no_learners = l_012
            if l_3:
                no_learners += l_3
        elif l_3 is not None:
            no_learners = l_3
        if no_learners is not None:
            all_learners[countryiso] = no_learners
            if countryiso in fully_closed:
                affected_learners[countryiso] = no_learners
    affected_learners_total = dict()
    learners_total = dict()
    closed_countries = dict()
    # Roll up country figures to each region (and HRP grouping) it belongs to
    for countryiso in closures:
        country_learners = all_learners.get(countryiso)
        country_affected_learners = affected_learners.get(countryiso)
        for region in regionlookup.iso3_to_region_and_hrp[countryiso]:
            if country_learners is not None:
                learners_total[region] = learners_total.get(region, 0) + country_learners
            if country_affected_learners is not None:
                affected_learners_total[region] = affected_learners_total.get(region, 0) + country_affected_learners
                closed_countries[region] = closed_countries.get(region, 0) + 1
    percentage_affected_learners = dict()
    for region, no_learners in affected_learners_total.items():
        percentage_affected_learners[region] = get_fraction_str(no_learners, learners_total[region])
    logger.info('Processed education')
    grheaders = ['No. affected learners', 'Percentage affected learners', 'No. closed countries']
    grhxltags = ['#affected+learners', '#affected+learners+pct', '#status+country+closed']
    headers = ['School Closure', 'No. pre-primary to upper-secondary learners', 'No. tertiary learners', 'No. affected learners']
    hxltags = ['#impact+type', '#population+learners+pre_primary_to_secondary', '#population+learners+tertiary', '#affected+learners']
    return [grheaders, grhxltags], [affected_learners_total, percentage_affected_learners, closed_countries], \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
        [headers, hxltags], [closures, learners_012, learners_3, affected_learners], \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
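# get_fraction_str is a helper from elsewhere in this repo; a minimal sketch
# of its presumed behaviour (formatting a ratio as a string while guarding
# against missing or zero denominators; the real implementation may round or
# format differently):
def get_fraction_str_sketch(numerator, denominator):
    if numerator is None or not denominator:
        return ''
    return f'{numerator / denominator:.4f}'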
def run_scrapers(datasets, countryiso3s, adminone, level, maindownloader,
                 basic_auths=dict(), today=None, scrapers=None,
                 population_lookup=None, **kwargs):
    # type: (Dict, List[str], AdminOne, str, Download, Dict[str,str], Optional[datetime], Optional[List[str]], Dict[str,int], Any) -> Dict
    """Runs all mini scrapers given in configuration and returns headers, values and sources.

    Args:
        datasets (Dict): Configuration for mini scrapers
        countryiso3s (List[str]): List of ISO3 country codes to process
        adminone (AdminOne): AdminOne object from HDX Python Country library that handles processing of admin level 1
        level (str): Can be global, national or subnational
        maindownloader (Download): Download object for downloading files
        basic_auths (Dict[str,str]): Dictionary of basic authentication information
        today (Optional[datetime]): Value to use for today. Defaults to None (datetime.now()).
        scrapers (Optional[List[str]]): List of mini scraper names to process
        population_lookup (Dict[str,int]): Dictionary from admin code to population
        **kwargs: Variables to use when evaluating template arguments in urls

    Returns:
        Dict: Dictionary of output containing output headers, values and sources
    """
    results = {
        'headers': [list(), list()],
        'values': list(),
        'sources': list()
    }
    now = datetime.now()
    for name in datasets:
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            # Scrapers needing credentials get their own rate-limited downloader
            downloader = Download(basic_auth=basic_auth,
                                  rate_limit={'calls': 1, 'period': 0.1})
        datasetinfo = datasets[name]
        datasetinfo['name'] = name
        headers, iterator = read(downloader, datasetinfo, today=today, **kwargs)
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get('force_date_today', False):
            today_str = kwargs.get('today_str')
            if today_str:
                today = parse_date(today_str)
            else:
                if not today:
                    today = now
                today_str = today.strftime('%Y-%m-%d')
            datasetinfo['date'] = today_str
        _run_scraper(countryiso3s, adminone, level, today, name, datasetinfo,
                     headers, iterator, population_lookup, results)
        if downloader != maindownloader:
            downloader.close()
    if population_lookup is not None:
        add_population(population_lookup, results['headers'], results['values'])
    return results
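# A minimal usage sketch for run_scrapers, assuming configuration, countryiso3s
# and adminone are built elsewhere in the project; 'scraper_national' is an
# assumed configuration key, not a documented one.
def run_national_scrapers_example(configuration, countryiso3s, adminone):
    with Download(user_agent='example') as downloader:
        results = run_scrapers(configuration['scraper_national'], countryiso3s,
                               adminone, 'national', downloader,
                               today=datetime.now(), population_lookup=dict())
        return results['headers'], results['values'], results['sources']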