def add_vaccination_campaigns(configuration, countryiso3s, downloader, outputs, scrapers=None):
    """Compute per-country ratio of affected vaccination campaigns.

    Reads the 'vaccination_campaigns' dataset from HDX, counts campaigns per
    country and those whose status is neither 'on track' nor 'reinstated',
    and returns headers, one ratio value dict and source metadata in the
    3-tuple shape shared by the other add_* scrapers in this file.

    If scrapers is given and this scraper's name does not match any entry,
    returns three empty lists (scraper skipped).
    """
    name = 'vaccination_campaigns'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    headers, iterator = read_hdx(downloader, datasetinfo)
    # First data row maps column keys to HXL tags (e.g. '#country+code').
    hxlrow = next(iterator)
    campaigns_per_country = dict()
    affected_campaigns_per_country = dict()
    for row in iterator:
        newrow = dict()
        countryiso = None
        for key in row:
            hxltag = hxlrow[key]
            if hxltag != '':
                value = row[key]
                newrow[hxlrow[key]] = value
                if hxltag == '#country+code':
                    countryiso = value
                    if countryiso not in countryiso3s:
                        # Country out of scope: abandon this row entirely.
                        countryiso = None
                        break
                    campaigns_per_country[countryiso] = campaigns_per_country.get(countryiso, 0) + 1
                if hxltag == '#status+name':
                    # Any status other than 'on track' (and not a
                    # 'reinstated' variant) counts the campaign as affected.
                    value = value.lower()
                    if value != 'on track' and 'reinstated' not in value:
                        # NOTE(review): relies on the '#country+code' column
                        # being reached before '#status+name' in the row's
                        # key order; otherwise countryiso is still None here
                        # and the count is recorded under a None key —
                        # confirm column ordering in the source dataset.
                        affected_campaigns_per_country[countryiso] = affected_campaigns_per_country.get(countryiso, 0) + 1
        if countryiso:
            # Only rows for in-scope countries are emitted to the JSON output.
            outputs['json'].add_data_row(name, newrow)
    ratios = calculate_ratios(campaigns_per_country, affected_campaigns_per_country)
    hxltag = '#vaccination+num+ratio'
    logger.info('Processed vaccination campaigns')
    return [['Vaccination Ratio'], [hxltag]], [ratios], [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url'])]
def add_additional_json(self, downloader):
    """Load every dataset listed under 'additional_json' in the JSON
    configuration and append its HXL-tagged rows via add_data_row.

    Each dataset entry declares a 'format' selecting the reader; the first
    row yielded by the reader maps column keys to HXL tags, and only
    columns with a non-empty tag are kept.
    """
    for datasetinfo in self.json_configuration.get('additional_json', list()):
        name = datasetinfo['name']
        format = datasetinfo['format']
        if format == 'json':
            headers = None
            iterator = read_json(downloader, datasetinfo)
        elif format == 'ole':
            headers, iterator = read_ole(downloader, datasetinfo)
        elif format in ('csv', 'xls', 'xlsx'):
            # Tabular data comes either from an HDX dataset or a direct URL.
            reader = read_hdx if 'dataset' in datasetinfo else read_tabular
            headers, iterator = reader(downloader, datasetinfo)
        else:
            raise ValueError('Invalid format %s for %s!' % (format, name))
        # Header-to-HXL-tag mapping row precedes the data rows.
        hxlrow = next(iterator)
        for row in iterator:
            tagged = {hxlrow[key]: row[key] for key in row if hxlrow[key] != ''}
            self.add_data_row(name, tagged)
def read_regional(configuration, countryiso3s, hrp_iso3s, downloader):
    """Build regional groupings from the configured regional dataset.

    Returns a 3-tuple:
      - regions: sorted region names with synthetic 'H63' (all countries)
        and 'H25' (HRP countries) prepended, in that order;
      - iso3_to_region: country -> single region from the dataset;
      - iso3_to_region_and_hrp: country -> set of regions including the
        synthetic H25/H63 groupings.
    """
    regional_config = configuration['regional']
    _, iterator = read_hdx(downloader, regional_config)
    iso3_to_region = dict()
    iso3_to_region_and_hrp = dict()
    regions = set()
    for row in iterator:
        countryiso = row[regional_config['iso3']]
        if not countryiso or countryiso not in countryiso3s:
            continue
        region = row[regional_config['region']]
        # Countries explicitly marked as uncovered are skipped.
        if region == 'NO COVERAGE':
            continue
        regions.add(region)
        dict_of_sets_add(iso3_to_region_and_hrp, countryiso, region)
        iso3_to_region[countryiso] = region
    regions = sorted(regions)
    # Prepend the synthetic groupings; inserting H25 first then H63 puts
    # H63 at the very front of the final list.
    for synthetic_region, members in (('H25', hrp_iso3s), ('H63', countryiso3s)):
        regions.insert(0, synthetic_region)
        for countryiso in members:
            dict_of_sets_add(iso3_to_region_and_hrp, countryiso, synthetic_region)
    return regions, iso3_to_region, iso3_to_region_and_hrp
def get_tabular(configuration, level, downloader, scrapers=None, **kwargs):
    """Run every tabular scraper configured for the given level.

    For each dataset under 'tabular_<level>', selects a reader by format,
    fills in default 'source_url' and 'date' metadata, and delegates row
    processing to _get_tabular. The 'population' dataset is always
    processed regardless of the scrapers filter.

    Returns (retheaders, retval, sources) accumulated across datasets.
    Raises ValueError for an unrecognized format.
    """
    datasets = configuration['tabular_%s' % level]
    retheaders = [list(), list()]
    retval = list()
    sources = list()
    for name, datasetinfo in datasets.items():
        # Process when no filter is given, the name matches the filter,
        # or it is the always-on population dataset.
        selected = (not scrapers or name == 'population'
                    or any(scraper in name for scraper in scrapers))
        if not selected:
            continue
        format = datasetinfo['format']
        if format == 'json':
            headers = None
            iterator = read_json(downloader, datasetinfo, **kwargs)
        elif format == 'ole':
            headers, iterator = read_ole(downloader, datasetinfo, **kwargs)
        elif format in ('csv', 'xls', 'xlsx'):
            reader = read_hdx if 'dataset' in datasetinfo else read_tabular
            headers, iterator = reader(downloader, datasetinfo, **kwargs)
        else:
            raise ValueError('Invalid format %s for %s!' % (format, name))
        # Default the source URL to the download URL when not given.
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        # Default (or force) the dataset date to today.
        if 'date' not in datasetinfo or datasetinfo.get('force_date_today', False):
            datasetinfo['date'] = today_str
        _get_tabular(level, name, datasetinfo, headers, iterator, retheaders, retval, sources)
    return retheaders, retval, sources
def allowed_previous_months(year, month, n=6):
    """Return the set of 'Y/M' strings (no zero padding) for the n months
    immediately preceding the given year/month, wrapping into the previous
    year as needed."""
    allowed = set()
    for i in range(1, n + 1):
        m = month - i
        if m > 0:
            allowed.add('%d/%d' % (year, m))
        else:
            # m <= 0 here: wrap into the previous year. BUG FIX: the original
            # computed 12 - m, which for m < 0 yields 13, 14, ... — invalid
            # month numbers that never match a row, silently dropping all but
            # one of the wrapped months. 12 + m gives 12, 11, ... correctly.
            allowed.add('%d/%d' % (year - 1, 12 + m))
    return allowed


def add_food_prices(configuration, countryiso3s, downloader, scrapers=None):
    """Compute per-country ratio of commodities with abnormal food prices.

    Reads the WFP 'food_prices' dataset from HDX, keeps only rows from the
    six months before the current month, counts commodities per country and
    those whose ALPS indicator is not 'Normal', and returns headers, one
    ratio value dict and source metadata in the 3-tuple shape shared by the
    other add_* scrapers in this file.

    If scrapers is given and this scraper's name does not match any entry,
    returns three empty lists (scraper skipped).
    """
    name = 'food_prices'
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list()
    datasetinfo = configuration[name]
    headers, iterator = read_hdx(downloader, datasetinfo)
    # Six-month lookback window, exclusive of the current month.
    allowed_months = allowed_previous_months(today.year, today.month)
    commods_per_country = dict()
    affected_commods_per_country = dict()
    for row in iterator:
        year_month = '%s/%s' % (row['Year'], row['Month'])
        if year_month not in allowed_months:
            continue
        countryiso, _ = Country.get_iso3_country_code_fuzzy(row['Country'])
        if not countryiso or countryiso not in countryiso3s:
            continue
        commods_per_country[countryiso] = commods_per_country.get(countryiso, 0) + 1
        # ALPS != 'Normal' marks the commodity price as abnormal (affected).
        if row['ALPS'] != 'Normal':
            affected_commods_per_country[countryiso] = affected_commods_per_country.get(countryiso, 0) + 1
    ratios = calculate_ratios(commods_per_country, affected_commods_per_country)
    hxltag = '#value+food+num+ratio'
    logger.info('Processed WFP')
    return [['Food Prices Ratio'], [hxltag]], [ratios], \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url'])]
def get_access(configuration, admininfo, downloader, scrapers=None):
    """Build global, regional and per-country access-constraint outputs.

    Reads a CSV ranking file plus one xlsx sheet per constraint category,
    accumulates weighted per-country severity scores and top-3 constraint
    counts per region, and returns nine values: three (headers, valuedicts,
    sources) triples for global, regional and country levels respectively.

    If scrapers is given and 'get_access' does not match any entry, returns
    nine empty lists (scraper skipped).
    """
    name = inspect.currentframe().f_code.co_name
    if scrapers and not any(scraper in name for scraper in scrapers):
        return list(), list(), list(), list(), list(), list(), list(), list(
        ), list()
    access_configuration = configuration['access_constraints']
    ranking_url = access_configuration['ranking_url']
    headers, rows = read_tabular(downloader, {
        'url': ranking_url,
        'headers': 1,
        'format': 'csv'
    })
    sheets = access_configuration['sheets']
    # Per sheet: country -> list of that country's top-3 ranked constraints
    # (from columns '<sheet>_1' .. '<sheet>_3' in the ranking CSV).
    constraint_rankings = {x: dict() for x in sheets}
    nocountries_per_region = {'global': 0}
    top3counts = {'global': dict()}
    for region in admininfo.regions:
        nocountries_per_region[region] = 0
        top3counts[region] = dict()
    for row in rows:
        countryiso = row['iso3']
        nocountries_per_region['global'] += 1
        for region in admininfo.iso3_to_region_and_hrp.get(countryiso, list()):
            nocountries_per_region[region] += 1
        for sheet in sheets:
            # Only sheets that have ranking columns in this CSV contribute.
            if '%s_1' % sheet not in row:
                continue
            type_ranking = constraint_rankings.get(sheet, dict())
            for i in range(1, 4):
                constraint = row['%s_%d' % (sheet, i)]
                dict_of_lists_add(type_ranking, countryiso, constraint)
            constraint_rankings[sheet] = type_ranking
    data = dict()
    datasetinfo = {
        'dataset': access_configuration['dataset'],
        'headers': 1,
        'format': 'xlsx'
    }
    for sheet, sheetinfo in sheets.items():
        datasetinfo['sheet'] = sheetinfo['sheetname']
        headers, rows = read_hdx(downloader, datasetinfo)
        datasheet = data.get(sheet, dict())
        for row in rows:
            countryiso = Country.get_iso3_country_code(
                row[sheetinfo['isocol']])
            if countryiso not in admininfo.countryiso3s:
                continue
            countrydata = datasheet.get(countryiso, dict())
            score = countrydata.get('score', 0)
            newscore = row[sheetinfo['scorecol']]
            textcol = sheetinfo.get('textcol')
            if textcol:
                # Scored sheet: each row carries a numeric score and a text
                # describing the constraint.
                text = row[textcol]
                dict_of_lists_add(countrydata, 'text', (newscore, text))
                for region, top3countsregion in top3counts.items():
                    if region != 'global' and region not in admininfo.iso3_to_region_and_hrp.get(
                            countryiso, list()):
                        continue
                    top3countssheet = top3countsregion.get(sheet, dict())
                    # 'impact' counts any non-zero score; other sheets count
                    # only the maximum score of 3.
                    if sheet == 'impact':
                        if newscore != 0:
                            top3countssheet[text] = top3countssheet.get(
                                text, 0) + 1
                    else:
                        if newscore == 3:
                            top3countssheet[text] = top3countssheet.get(
                                text, 0) + 1
                    top3countsregion[sheet] = top3countssheet
                # Optional per-text weighting multiplies the score before it
                # is accumulated into the country's running total.
                weights = sheetinfo.get('weights')
                if weights:
                    weight = weights.get(text)
                    if weight:
                        newscore *= weight
                score += newscore
            else:
                # Yes/no sheet (e.g. mitigation): the score column itself is
                # the text; 'yes' answers are tallied for the top-3 counts.
                dict_of_lists_add(countrydata, 'text', (newscore, newscore))
                for region, top3countsregion in top3counts.items():
                    if region != 'global' and region not in admininfo.iso3_to_region_and_hrp.get(
                            countryiso, list()):
                        continue
                    top3countssheet = top3countsregion.get(sheet, dict())
                    if newscore == 'yes':
                        top3countssheet[sheet] = top3countssheet.get(sheet,
                                                                     0) + 1
                    top3countsregion[sheet] = top3countssheet
                # NOTE(review): this overwrites any previously accumulated
                # score rather than adding — assumes at most one row per
                # country in yes/no sheets; confirm against the dataset.
                score = newscore
            countrydata['score'] = score
            datasheet[countryiso] = countrydata
        data[sheet] = datasheet
    # Seven output columns per level (see grheaders below).
    gvaluedicts = [dict() for _ in range(7)]
    rvaluedicts = [dict() for _ in range(7)]
    for region, top3countsregion in top3counts.items():
        if region == 'global':
            valuedicts = gvaluedicts
        else:
            valuedicts = rvaluedicts
        for i, (sheet, top3countssheet) in enumerate(top3countsregion.items()):
            sortedcounts = sorted(top3countssheet,
                                  key=top3countssheet.get,
                                  reverse=True)
            texts = list()
            pcts = list()
            for text in sortedcounts[:3]:
                texts.append(text)
                pcts.append(
                    get_fraction_str(top3countssheet[text],
                                     nocountries_per_region[region]))
            if sheet == 'mitigation':
                # Mitigation occupies a single percentage column.
                # NOTE(review): pcts[0] raises IndexError if no country in
                # the region answered 'yes' — confirm this cannot occur.
                valuedicts[i * 2][region] = pcts[0]
            else:
                valuedicts[i * 2][region] = '|'.join(texts)
                valuedicts[i * 2 + 1][region] = '|'.join(pcts)
    # Country-level outputs: [severity score, severity category, one text
    # column per sheet] (six columns, see headers below).
    valuedicts = [dict() for _ in range(6)]
    severityscore = valuedicts[0]
    for i, sheet in enumerate(data):
        datasheet = data[sheet]
        for countryiso in datasheet:
            countrydata = datasheet[countryiso]
            # Highest raw (unweighted) score observed for this country.
            ranked = sorted(countrydata['text'], reverse=True)
            top_value = ranked[0][0]
            texts = list()
            for value, text in countrydata['text']:
                if value == top_value:
                    # Keep only texts that are also in the country's top-3
                    # ranking (mitigation has no ranking and is always kept).
                    if sheet == 'mitigation' or text in constraint_rankings[
                            sheet][countryiso]:
                        texts.append(text)
            valuedicts[i + 2][countryiso] = '|'.join(texts)
            # Severity is the sum of scores across the 'constraints' sheets.
            if 'constraints' in sheet:
                score = severityscore.get(countryiso, 0)
                score += countrydata['score']
                severityscore[countryiso] = score
    ranges = access_configuration['category']
    severitycategory = valuedicts[1]
    for countryiso in severityscore:
        score = severityscore.get(countryiso)
        # NOTE(review): score cannot be None when iterating severityscore's
        # own keys; this branch appears to be dead defensive code.
        if score is None:
            severitycategory[countryiso] = None
            continue
        severitycategory[countryiso] = process_range(ranges, score)
    logger.info('Processed access')
    grheaders = [
        'Access Constraints Into', 'Access Constraints Into Pct',
        'Access Constraints Within', 'Access Constraints Within Pct',
        'Access Impact', 'Access Impact Pct', 'Mitigation Pct'
    ]
    headers = [
        'Access Severity Score', 'Access Severity Category',
        'Access Constraints Into', 'Access Constraints Within',
        'Access Impact', 'Mitigation'
    ]
    grhxltags = [
        '#access+constraints+into+desc', '#access+constraints+into+pct',
        '#access+constraints+within+desc', '#access+constraints+within+pct',
        '#access+impact+desc', '#access+impact+pct',
        '#access+mitigation+pct'
    ]
    hxltags = [
        '#severity+access+num+score', '#severity+access+category+num',
        '#access+constraints+into+desc', '#access+constraints+within+desc',
        '#access+impact+desc', '#access+mitigation+desc'
    ]
    # Nine return values: (headers, valuedicts, sources) for global,
    # regional and country levels.
    return [grheaders, grhxltags], gvaluedicts, \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
        [grheaders, grhxltags], rvaluedicts, \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in grhxltags], \
        [headers, hxltags], valuedicts, \
        [(hxltag, datasetinfo['date'], datasetinfo['source'], datasetinfo['source_url']) for hxltag in hxltags]
def get_tabular(basic_auths, configuration, level, maindownloader, scrapers=None, population_lookup=None, **kwargs):
    """Run every tabular scraper configured for the given level.

    For each dataset under 'tabular_<level>': applies the scrapers filter
    (when no filter is given, the 'population' dataset is skipped), creates
    a basic-auth downloader when credentials exist for the dataset, selects
    a reader by format, fills in default 'source_url' and 'date' metadata,
    optionally sorts the rows, and delegates processing to _get_tabular.
    When population_lookup is supplied, population columns are appended to
    the accumulated output.

    Returns (retheaders, retval, sources). Raises ValueError for an
    unrecognized format.
    """
    datasets = configuration['tabular_%s' % level]
    retheaders = [list(), list()]
    retval = list()
    sources = list()
    for name, datasetinfo in datasets.items():
        if scrapers:
            if not any(scraper in name for scraper in scrapers):
                continue
        elif name == 'population':
            # Population is handled separately unless explicitly requested.
            continue
        logger.info('Processing %s' % name)
        basic_auth = basic_auths.get(name)
        if basic_auth is None:
            downloader = maindownloader
        else:
            # Dedicated rate-limited downloader for authenticated sources.
            downloader = Download(basic_auth=basic_auth,
                                  rate_limit={'calls': 1, 'period': 0.1})
        file_format = datasetinfo['format']
        if file_format == 'json':
            headers = None
            iterator = read_json(downloader, datasetinfo, **kwargs)
        elif file_format == 'ole':
            headers, iterator = read_ole(downloader, datasetinfo, **kwargs)
        elif file_format in ('csv', 'xls', 'xlsx'):
            reader = read_hdx if 'dataset' in datasetinfo else read_tabular
            headers, iterator = reader(downloader, datasetinfo, **kwargs)
        else:
            raise ValueError('Invalid format %s for %s!' % (file_format, name))
        # Default the source URL to the download URL when not given.
        if 'source_url' not in datasetinfo:
            datasetinfo['source_url'] = datasetinfo['url']
        # Default (or force) the dataset date to today.
        if 'date' not in datasetinfo or datasetinfo.get('force_date_today', False):
            datasetinfo['date'] = today_str
        sort = datasetinfo.get('sort')
        if sort:
            iterator = sorted(iterator,
                              key=itemgetter(*sort['keys']),
                              reverse=sort.get('reverse', False))
        _get_tabular(level, name, datasetinfo, headers, iterator,
                     population_lookup, retheaders, retval, sources)
        if downloader != maindownloader:
            downloader.close()
    if population_lookup is not None:
        add_population(population_lookup, retheaders, retval)
    return retheaders, retval, sources