def get_tabular(basic_auths, configuration, level, maindownloader, scrapers=None, population_lookup=None, **kwargs):
    """Runs all mini scrapers configured under 'tabular_<level>' and returns headers, values and sources.

    Args:
        basic_auths (Dict[str,str]): Dictionary of basic authentication information keyed by scraper name
        configuration (Dict): Configuration containing a 'tabular_<level>' key of mini scraper definitions
        level (str): Level to process, eg. global, national or subnational
        maindownloader (Download): Download object for downloading files
        scrapers (Optional[List[str]]): List of mini scraper names to process. Defaults to None (all except 'population').
        population_lookup (Optional[Dict[str,int]]): Dictionary from admin code to population. Defaults to None.
        **kwargs: Variables to use when evaluating template arguments in urls

    Returns:
        Tuple[List[List], List[Dict], List]: Headers (two parallel lists), values and sources
    """
    datasets = configuration['tabular_%s' % level]
    retheaders = [list(), list()]
    retval = list()
    sources = list()
    for name in datasets:
        # If an explicit scraper list is given, process only matching names;
        # otherwise process everything except the special 'population' scraper.
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            # Private downloader with credentials for this scraper only
            downloader = Download(basic_auth=basic_auth, rate_limit={'calls': 1, 'period': 0.1})
        datasetinfo = datasets[name]
        # 'file_format' avoids shadowing the builtin format()
        file_format = datasetinfo['format']
        if file_format == 'json':
            iterator = read_json(downloader, datasetinfo, **kwargs)
            headers = None  # JSON sources carry no tabular header row
        elif file_format == 'ole':
            headers, iterator = read_ole(downloader, datasetinfo, **kwargs)
        elif file_format in ('csv', 'xls', 'xlsx'):
            # Prefer reading via HDX dataset metadata when configured
            if 'dataset' in datasetinfo:
                headers, iterator = read_hdx(downloader, datasetinfo, **kwargs)
            else:
                headers, iterator = read_tabular(downloader, datasetinfo, **kwargs)
        else:
            raise ValueError('Invalid format %s for %s!' % (file_format, name))
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get('force_date_today', False):
            # NOTE(review): today_str is not defined in this function — presumably a
            # module-level value set elsewhere; confirm it is initialised before this runs
            datasetinfo['date'] = today_str
        sort = datasetinfo.get('sort')
        if sort:
            keys = sort['keys']
            reverse = sort.get('reverse', False)
            # sorted() accepts any iterable; no need to materialize a list first
            iterator = sorted(iterator, key=itemgetter(*keys), reverse=reverse)
        _get_tabular(level, name, datasetinfo, headers, iterator, population_lookup, retheaders, retval, sources)
        # Identity check: close only downloaders we created above
        if downloader is not maindownloader:
            downloader.close()
    if population_lookup is not None:
        add_population(population_lookup, retheaders, retval)
    return retheaders, retval, sources
def run_scrapers(datasets, countryiso3s, adminone, level, maindownloader, basic_auths=None, today=None, scrapers=None, population_lookup=None, **kwargs):
    # type: (Dict, List[str], AdminOne, str, Download, Optional[Dict[str,str]], Optional[datetime], Optional[List[str]], Dict[str,int], Any) -> Dict
    """Runs all mini scrapers given in configuration and returns headers, values and sources.

    Args:
        datasets (Dict): Configuration for mini scrapers
        countryiso3s (List[str]): List of ISO3 country codes to process
        adminone (AdminOne): AdminOne object from HDX Python Country library that handles processing of admin level 1
        level (str): Can be global, national or subnational
        maindownloader (Download): Download object for downloading files
        basic_auths (Optional[Dict[str,str]]): Dictionary of basic authentication information. Defaults to None.
        today (Optional[datetime]): Value to use for today. Defaults to None (datetime.now()).
        scrapers (Optional[List[str]])): List of mini scraper names to process
        population_lookup (Dict[str,int]): Dictionary from admin code to population
        **kwargs: Variables to use when evaluating template arguments in urls

    Returns:
        Dict: Dictionary of output containing output headers, values and sources
    """
    # Avoid the mutable-default-argument pitfall: create a fresh dict per call
    if basic_auths is None:
        basic_auths = dict()
    results = {
        'headers': [list(), list()],
        'values': list(),
        'sources': list()
    }
    now = datetime.now()
    for name in datasets:
        # If an explicit scraper list is given, process only matching names;
        # otherwise process everything except the special 'population' scraper.
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        else:
            if name == 'population':
                continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            # Private downloader with credentials for this scraper only
            downloader = Download(basic_auth=basic_auth, rate_limit={'calls': 1, 'period': 0.1})
        datasetinfo = datasets[name]
        datasetinfo['name'] = name
        headers, iterator = read(downloader, datasetinfo, today=today, **kwargs)
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        if 'date' not in datasetinfo or datasetinfo.get('force_date_today', False):
            # Resolve a date string: explicit today_str kwarg wins, then the today
            # argument, then the wall clock captured at function entry
            today_str = kwargs.get('today_str')
            if today_str:
                today = parse_date(today_str)
            else:
                if not today:
                    today = now
                today_str = today.strftime('%Y-%m-%d')
            datasetinfo['date'] = today_str
        _run_scraper(countryiso3s, adminone, level, today, name, datasetinfo, headers, iterator, population_lookup, results)
        # Identity check: close only downloaders we created above
        if downloader is not maindownloader:
            downloader.close()
    if population_lookup is not None:
        add_population(population_lookup, results['headers'], results['values'])
    return results