import logging
from datetime import datetime
from operator import itemgetter

from hdx.utilities.dates import parse_date
from hdx.utilities.downloader import Download

# read, read_json, read_ole, read_hdx, read_tabular, _get_tabular,
# _run_scraper and add_population are helpers assumed to be defined
# elsewhere in the module these snippets were extracted from.
logger = logging.getLogger(__name__)


def get_tabular(basic_auths,
                configuration,
                level,
                maindownloader,
                scrapers=None,
                population_lookup=None,
                **kwargs):
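    # type: (Dict[str,str], Dict, str, Download, Optional[List[str]], Dict[str,int], Any) -> Tuple[List,List,List]
    """Runs all tabular mini scrapers given in configuration and returns headers, values and sources.

    Args:
        basic_auths (Dict[str,str]): Dictionary of basic authentication information
        configuration (Dict): Configuration for mini scrapers
        level (str): Can be global, national or subnational
        maindownloader (Download): Download object for downloading files
        scrapers (Optional[List[str]]): List of mini scraper names to process
        population_lookup (Dict[str,int]): Dictionary from admin code to population
        **kwargs: Variables to use when evaluating template arguments in urls

    Returns:
        Tuple[List,List,List]: Tuple of (headers, values, sources)
    """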
    datasets = configuration['tabular_%s' % level]
    retheaders = [list(), list()]
    retval = list()
    sources = list()
    for name in datasets:
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            downloader = Download(basic_auth=basic_auth,
                                  rate_limit={
                                      'calls': 1,
                                      'period': 0.1
                                  })
        datasetinfo = datasets[name]
        file_format = datasetinfo['format']  # renamed to avoid shadowing the format builtin
        if file_format == 'json':
            iterator = read_json(downloader, datasetinfo, **kwargs)
            headers = None
        elif file_format == 'ole':
            headers, iterator = read_ole(downloader, datasetinfo, **kwargs)
        elif file_format in ['csv', 'xls', 'xlsx']:
            if 'dataset' in datasetinfo:
                headers, iterator = read_hdx(downloader, datasetinfo, **kwargs)
            else:
                headers, iterator = read_tabular(downloader, datasetinfo,
                                                 **kwargs)
        else:
            raise ValueError('Invalid format %s for %s!' % (file_format, name))
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get(
                'force_date_today', False):
            # today_str was previously undefined here; take it from kwargs
            # as run_scrapers below does, falling back to the current date
            today_str = kwargs.get('today_str',
                                   datetime.now().strftime('%Y-%m-%d'))
            datasetinfo['date'] = today_str
        sort = datasetinfo.get('sort')
        if sort:
            keys = sort['keys']
            reverse = sort.get('reverse', False)
            # sorted() accepts any iterable directly
            iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
        _get_tabular(level, name, datasetinfo, headers, iterator,
                     population_lookup, retheaders, retval, sources)
        if downloader != maindownloader:
            downloader.close()
        if population_lookup is not None:
            add_population(population_lookup, retheaders, retval)
    return retheaders, retval, sources
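

# A minimal sketch of the configuration structure get_tabular() appears to
# expect, inferred only from the keys read above ('tabular_<level>',
# 'format', 'url', 'dataset', 'sort', 'force_date_today'). Every name and
# URL below is a hypothetical placeholder, not a real configuration entry.
example_configuration = {
    'tabular_national': {
        'casualties': {  # hypothetical mini scraper name
            'format': 'csv',
            'url': 'https://example.com/casualties.csv',
            'sort': {'keys': ['iso3'], 'reverse': False},
        },
    },
}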
def run_scrapers(datasets,
                 countryiso3s,
                 adminone,
                 level,
                 maindownloader,
                 basic_auths=dict(),
                 today=None,
                 scrapers=None,
                 population_lookup=None,
                 **kwargs):
    # type: (Dict, List[str], AdminOne, str, Download, Dict[str,str], Optional[datetime], Optional[List[str]], Dict[str,int], Any) -> Dict
    """Runs all mini scrapers given in configuration and returns headers, values and sources.

    Args:
        datasets (Dict): Configuration for mini scrapers
        countryiso3s (List[str]): List of ISO3 country codes to process
        adminone (AdminOne): AdminOne object from HDX Python Country library that handles processing of admin level 1
        level (str): Can be global, national or subnational
        maindownloader (Download): Download object for downloading files
        basic_auths (Dict[str,str]): Dictionary of basic authentication information
        today (Optional[datetime]): Value to use for today. Defaults to None (datetime.now()).
        scrapers (Optional[List[str]]): List of mini scraper names to process
        population_lookup (Dict[str,int]): Dictionary from admin code to population
        **kwargs: Variables to use when evaluating template arguments in urls

    Returns:
        Dict: Dictionary of output containing output headers, values and sources
    """
    results = {
        'headers': [list(), list()],
        'values': list(),
        'sources': list()
    }
    now = datetime.now()
    for name in datasets:
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            downloader = Download(basic_auth=basic_auth,
                                  rate_limit={
                                      'calls': 1,
                                      'period': 0.1
                                  })
        datasetinfo = datasets[name]
        datasetinfo['name'] = name
        headers, iterator = read(downloader,
                                 datasetinfo,
                                 today=today,
                                 **kwargs)
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get(
                'force_date_today', False):
            today_str = kwargs.get('today_str')
            if today_str:
                today = parse_date(today_str)
            else:
                if not today:
                    today = now
                today_str = today.strftime('%Y-%m-%d')
            datasetinfo['date'] = today_str
        _run_scraper(countryiso3s, adminone, level, today, name, datasetinfo,
                     headers, iterator, population_lookup, results)
        if downloader != maindownloader:
            downloader.close()
        if population_lookup is not None:
            add_population(population_lookup, results['headers'],
                           results['values'])
    return results
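

# Hedged usage sketch for run_scrapers(), not a confirmed recipe: it assumes
# the caller has already built a mini scraper configuration, a list of ISO3
# codes, an AdminOne object and a population lookup, and that Download from
# hdx.utilities.downloader works as a context manager (as used above).
def example_run(datasets, countryiso3s, adminone, population_lookup):
    with Download(rate_limit={'calls': 1, 'period': 0.1}) as downloader:
        results = run_scrapers(datasets, countryiso3s, adminone, 'national',
                               downloader, population_lookup=population_lookup)
        return results['headers'], results['values'], results['sources']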