Example #1
    def add_additional_json(self, downloader, today=None):
        # type: (Download, Optional[datetime]) -> None
        """Download JSON files and add them under keys defined in the configuration

        Args:
            downloader (Download): Download object for downloading JSON
            today (Optional[datetime]): Value to use for today. Defaults to None (datetime.now()).

        Returns:
            None
        """
        for datasetinfo in self.json_configuration.get('additional_json',
                                                       list()):
            headers, iterator = read(downloader, datasetinfo, today=today)
            hxlrow = next(iterator)
            if not isinstance(hxlrow, dict):
                hxlrow = hxlrow.value
            name = datasetinfo['name']
            for row in iterator:
                newrow = dict()
                if not isinstance(row, dict):
                    row = row.value
                for key in row:
                    hxltag = hxlrow[key]
                    if hxltag != '':
                        newrow[hxltag] = row[key]
                self.add_data_row(name, newrow)
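The loop above implements a common HXL pattern: the first row of the data maps column headers to HXL hashtags, and every subsequent row is re-keyed by hashtag, with untagged columns dropped. A minimal standalone sketch of that re-keying, using invented data:

rows = iter([
    {'Country': '#country+code', 'Doses': '#capacity+vaccine+doses', 'Notes': ''},
    {'Country': 'AFG', 'Doses': '468000', 'Notes': 'untagged, so dropped'},
])
hxlrow = next(rows)  # header-to-hashtag mapping
for row in rows:
    newrow = {hxlrow[key]: row[key] for key in row if hxlrow[key] != ''}
    assert newrow == {'#country+code': 'AFG',
                      '#capacity+vaccine+doses': '468000'}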
Example #2
def get_covax_deliveries(configuration, today, countryiso3s, downloader, scrapers=None):
    name = 'covax_deliveries'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    headers, iterator = read(downloader, datasetinfo, today=today)
    hxlrow = next(iterator)  # first row maps column headers to HXL hashtags
    # Sum doses per country|pipeline|producer|funder combination
    doses_lookup = dict()
    for row in iterator:
        newrow = dict()
        for key in row:
            newrow[hxlrow[key]] = row[key]
        countryiso = newrow['#country+code']
        if not countryiso or countryiso not in countryiso3s:
            continue
        key = f'{countryiso}|{newrow["#meta+vaccine+pipeline"]}|{newrow["#meta+vaccine+producer"]}|{newrow["#meta+vaccine+funder"]}'
        nodoses = get_numeric_if_possible(newrow['#capacity+vaccine+doses'])
        if nodoses:
            doses_lookup[key] = doses_lookup.get(key, 0) + nodoses
    # Flatten each dimension into per-country lists, then pipe-delimited strings
    pipelines = dict()
    producers = dict()
    funders = dict()
    doses = dict()
    for key in sorted(doses_lookup):
        countryiso, pipeline, producer, funder = key.split('|')
        dict_of_lists_add(pipelines, countryiso, pipeline)
        dict_of_lists_add(producers, countryiso, producer)
        dict_of_lists_add(funders, countryiso, funder)
        dict_of_lists_add(doses, countryiso, str(doses_lookup[key]))
    for countryiso in pipelines:
        pipelines[countryiso] = '|'.join(pipelines[countryiso])
        producers[countryiso] = '|'.join(producers[countryiso])
        funders[countryiso] = '|'.join(funders[countryiso])
        doses[countryiso] = '|'.join(doses[countryiso])
    logger.info('Processed covax deliveries')
    hxltags = ['#meta+vaccine+pipeline', '#meta+vaccine+producer', '#meta+vaccine+funder', '#capacity+vaccine+doses']
    return [['Pipeline', 'Vaccine', 'Funder', 'Doses'], hxltags], \
           [pipelines, producers, funders, doses], \
           [(hxltag, datasetinfo['date'], datasetinfo['source'],
             datasetinfo['source_url']) for hxltag in hxltags]
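get_covax_deliveries sums doses per country/pipeline/producer/funder combination, then flattens each dimension into one pipe-delimited string per country. A standalone sketch of the flattening step, assuming dict_of_lists_add behaves like hdx.utilities.dictandlist.dict_of_lists_add (the figures are invented):

def dict_of_lists_add(dictionary, key, value):
    # Append value to the list stored under key, creating the list if needed
    dictionary.setdefault(key, list()).append(value)

doses_lookup = {'AFG|COVAX|Pfizer|donated': 100620,
                'AFG|COVAX|SII|donated': 468000}
producers = dict()
for key in sorted(doses_lookup):
    countryiso, _pipeline, producer, _funder = key.split('|')
    dict_of_lists_add(producers, countryiso, producer)
producers = {iso: '|'.join(names) for iso, names in producers.items()}
assert producers == {'AFG': 'Pfizer|SII'}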
Example #3
def test_read(self, configuration):
    url = get_url('http://{{var}}', var='hello')
    assert url == 'http://hello'
    with Download(user_agent='test') as downloader:
        datasetinfo = {
            'name': 'test',
            'dataset': 'sahel-humanitarian-needs-overview',
            'format': 'csv'
        }
        headers, iterator = read(downloader, datasetinfo, a='b')
        assert headers == [
            'Country', 'nutrition', 'Affected 2017', 'In Need 2017',
            'Targeted 2017', '% targeted'
        ]
        assert next(iterator) == {
            'Country': '#country',
            'nutrition': '#sector?',
            'Affected 2017': '#affected',
            'In Need 2017': '#inneed',
            'Targeted 2017': '#targeted',
            '% targeted': '#targeted+percentage'
        }
        assert next(iterator) == {
            'Country': 'Burkina Faso',
            'nutrition': 'MAM',
            'Affected 2017': '433,412',
            'In Need 2017': '433,412',
            'Targeted 2017': '             _',
            '% targeted': '0'
        }
        assert datasetinfo == {
            'name': 'test',
            'dataset': 'sahel-humanitarian-needs-overview',
            'format': 'csv',
            'headers': 1,
            'date': '2016-09-01',
            'source': 'Multiple organisations',
            'source_url': 'https://data.humdata.org/dataset/sahel-humanitarian-needs-overview',
            'url': 'https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/2527ac5b-66fe-46f0-8b9b-7086d2c4ddd3/download/hno-2017-sahel-nutrition.csv'
        }
        datasetinfo = {
            'name': 'test',
            'dataset': 'sahel-humanitarian-needs-overview',
            'resource': 'HNO-2017-Sahel- People in need.xlsx',
            'format': 'xlsx',
            'sheet': 1
        }
        headers, iterator = read(downloader, datasetinfo)
        assert headers == [
            'Country', 'Sector', 'People in need', 'Total population'
        ]
        assert next(iterator) == {
            'Country': '#country',
            'Sector': '#sector',
            'People in need': '#inneed',
            'Total population': '#total'
        }
        assert next(iterator) == {
            'Country': 'Mali',
            'Sector': 'Shelter/NFI',
            'People in need': 317000,
            'Total population': 100000
        }
        assert datasetinfo == {
            'name': 'test',
            'dataset': 'sahel-humanitarian-needs-overview',
            'resource': 'HNO-2017-Sahel- People in need.xlsx',
            'format': 'xlsx',
            'sheet': 1,
            'headers': 1,
            'date': '2016-09-01',
            'source': 'Multiple organisations',
            'source_url': 'https://data.humdata.org/dataset/sahel-humanitarian-needs-overview',
            'url': 'https://data.humdata.org/dataset/47f6ef46-500f-421a-9fa2-fefd93facf95/resource/d9248be4-7bfb-4a81-a7aa-c035dcb737a2/download/hno-2017-sahel-people-in-need.xlsx'
        }
        with pytest.raises(ValueError):
            datasetinfo = {'name': 'test', 'format': 'unknown'}
            read(downloader, datasetinfo)
        with pytest.raises(ValueError):
            datasetinfo = {
                'name': 'test',
                'dataset': 'sahel-humanitarian-needs-overview',
                'format': 'json'
            }
            read(downloader, datasetinfo)
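The assertions above also show that read() mutates datasetinfo in place, filling in HDX metadata (headers, date, source, source_url, url) that later code depends on. A minimal sketch of that enrichment pattern with the HDX lookup elided (enrich_datasetinfo is a hypothetical helper, not the library's API):

def enrich_datasetinfo(datasetinfo, hdx_metadata):
    # Copy metadata into the configuration dict without overwriting
    # anything the caller set explicitly
    for key in ('headers', 'date', 'source', 'source_url', 'url'):
        datasetinfo.setdefault(key, hdx_metadata[key])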
Example #4
def get_education(configuration, today, countryiso3s, regionlookup,
                  downloader, scrapers=None):
    name = 'education'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list(), list()
    educationinfo = configuration[name]
    datasetinfo = educationinfo['closures']
    closures_headers, closures_iterator = read(downloader, datasetinfo)
    closures = dict()  # latest school closure status per country
    country_dates = dict()  # date of that latest status
    for row in closures_iterator:
        countryiso = row['ISO']
        if not countryiso or countryiso not in countryiso3s:
            continue
        date = row['Date']
        if isinstance(date, str):
            date = parse_date(date)
        if date > today:
            continue
        # Keep only the most recent status on or before today
        max_date = country_dates.get(countryiso, default_date)
        if date < max_date:
            continue
        country_dates[countryiso] = date
        closures[countryiso] = row['Status']
    fully_closed = list()
    for countryiso, closure in closures.items():
        if closure.lower() == 'closed due to covid-19':
            fully_closed.append(countryiso)
    datasetinfo = educationinfo['enrolment']
    learners_headers, learners_iterator = read(downloader, datasetinfo)
    learners_012 = dict()  # pre-primary, primary and secondary learners
    learners_3 = dict()  # tertiary learners
    affected_learners = dict()  # learners in countries closed due to COVID-19
    all_learners = dict()  # all learners regardless of closure status

    for row in learners_iterator:
        countryiso = row['ISO3']
        if not countryiso or countryiso not in countryiso3s:
            continue
        # '-' denotes missing data in the enrolment source
        l_0 = row['Pre-primary (both)']
        l_1 = row['Primary (both)']
        l_2 = row['Secondary (both)']
        l_3 = row['Tertiary (both)']
        l_012 = None
        if l_0 != '-':
            l_012 = int(l_0)
        if l_1 != '-':
            l_1 = int(l_1)
            if l_012 is None:
                l_012 = l_1
            else:
                l_012 += l_1
        if l_2 != '-':
            l_2 = int(l_2)
            if l_012 is None:
                l_012 = l_2
            else:
                l_012 += l_2
        if l_012 is not None:
            learners_012[countryiso] = l_012
        if l_3 == '-':
            l_3 = None
        else:
            l_3 = int(l_3)
            learners_3[countryiso] = l_3
        no_learners = None
        if l_012 is not None:
            no_learners = l_012
            if l_3:
                no_learners += l_3
        elif l_3 is not None:
            no_learners = l_3
        if no_learners is not None:
            all_learners[countryiso] = no_learners
            if countryiso in fully_closed:
                affected_learners[countryiso] = no_learners
    affected_learners_total = dict()
    learners_total = dict()
    closed_countries = dict()
    for countryiso in closures:
        country_learners = all_learners.get(countryiso)
        country_affected_learners = affected_learners.get(countryiso)
        for region in regionlookup.iso3_to_region_and_hrp[countryiso]:
            if country_learners is not None:
                learners_total[region] = learners_total.get(
                    region, 0) + country_learners
            if country_affected_learners is not None:
                affected_learners_total[region] = affected_learners_total.get(
                    region, 0) + country_affected_learners
                closed_countries[region] = closed_countries.get(region, 0) + 1
    percentage_affected_learners = dict()
    for region, no_learners in affected_learners_total.items():
        percentage_affected_learners[region] = get_fraction_str(
            no_learners, learners_total[region])
    logger.info('Processed education')
    grheaders = [
        'No. affected learners', 'Percentage affected learners',
        'No. closed countries'
    ]
    grhxltags = [
        '#affected+learners', '#affected+learners+pct',
        '#status+country+closed'
    ]
    headers = [
        'School Closure', 'No. pre-primary to upper-secondary learners',
        'No. tertiary learners', 'No. affected learners'
    ]
    hxltags = [
        '#impact+type', '#population+learners+pre_primary_to_secondary',
        '#population+learners+tertiary', '#affected+learners'
    ]
    return [grheaders, grhxltags], \
           [affected_learners_total, percentage_affected_learners, closed_countries], \
           [(hxltag, datasetinfo['date'], datasetinfo['source'],
             datasetinfo['source_url']) for hxltag in grhxltags], \
           [headers, hxltags], \
           [closures, learners_012, learners_3, affected_learners], \
           [(hxltag, datasetinfo['date'], datasetinfo['source'],
             datasetinfo['source_url']) for hxltag in hxltags]
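A hypothetical invocation of get_education; the configuration object, the Download setup and the shape of the region lookup below are assumptions inferred from the code above, not a confirmed API:

from datetime import datetime

class FakeRegionLookup:
    # Maps each ISO3 to the aggregation regions it contributes to
    iso3_to_region_and_hrp = {'MLI': ['global'], 'NER': ['global']}

with Download(user_agent='example') as downloader:
    (gr_headers, gr_values, gr_sources,
     headers, values, sources) = get_education(
         configuration, datetime.now(), ['MLI', 'NER'],
         FakeRegionLookup(), downloader)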
Example #5
def run_scrapers(datasets,
                 countryiso3s,
                 adminone,
                 level,
                 maindownloader,
                 basic_auths=dict(),
                 today=None,
                 scrapers=None,
                 population_lookup=None,
                 **kwargs):
    # type: (Dict, List[str], AdminOne, str, Download, Dict[str,str], Optional[datetime], Optional[List[str]], Dict[str,int], Any) -> Dict
    """Runs all mini scrapers given in configuration and returns headers, values and sources.

    Args:
        datasets (Dict): Configuration for mini scrapers
        countryiso3s (List[str]): List of ISO3 country codes to process
        adminone (AdminOne): AdminOne object from HDX Python Country library that handles processing of admin level 1
        level (str): Can be global, national or subnational
        maindownloader (Download): Download object for downloading files
        basic_auths (Dict[str,str]): Dictionary of basic authentication information
        today (Optional[datetime]): Value to use for today. Defaults to None (datetime.now()).
        scrapers (Optional[List[str]]): List of mini scraper names to process
        population_lookup (Dict[str,int]): Dictionary from admin code to population
        **kwargs: Variables to use when evaluating template arguments in urls

    Returns:
        Dict: Dictionary of output containing output headers, values and sources
    """
    results = {
        'headers': [list(), list()],  # [output column headers, HXL hashtags]
        'values': list(),
        'sources': list()
    }
    now = datetime.now()
    for name in datasets:
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s', name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            downloader = Download(basic_auth=basic_auth,
                                  rate_limit={'calls': 1, 'period': 0.1})
        datasetinfo = datasets[name]
        datasetinfo['name'] = name
        headers, iterator = read(downloader,
                                 datasetinfo,
                                 today=today,
                                 **kwargs)
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get(
                'force_date_today', False):
            today_str = kwargs.get('today_str')
            if today_str:
                today = parse_date(today_str)
            else:
                if not today:
                    today = now
                today_str = today.strftime('%Y-%m-%d')
            datasetinfo['date'] = today_str
        _run_scraper(countryiso3s, adminone, level, today, name, datasetinfo,
                     headers, iterator, population_lookup, results)
        if downloader != maindownloader:
            downloader.close()
        if population_lookup is not None:
            add_population(population_lookup, results['headers'],
                           results['values'])
    return results
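A hypothetical driver for run_scrapers; the configuration key, country list and scraper filter below are illustrative assumptions:

from datetime import datetime

with Download(user_agent='example') as downloader:
    results = run_scrapers(configuration['scrapers'], ['AFG', 'MLI'],
                           adminone, 'national', downloader,
                           today=datetime.now(), scrapers=['education'])
    headers, hxltags = results['headers']
    for hxltag, values in zip(hxltags, results['values']):
        logger.info('%s: %s', hxltag, values)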