def do_jobs(): print "jobs" indID = "reliefweb_jobs" indicator = { 'indID': indID, 'name': "Number of jobs on ReliefWeb at specified time", 'units': 'uno' } orm.Indicator(**indicator).save() for country in countries: url = "http://api.rwlabs.org/v0/job/list" r = requests.get(url, data=get_job_query(country)) if 'data' not in r.json(): print r.json() print country continue value = { 'region': country, 'period': orm.now()[:10], # we don't need sub-day precision. 'value': r.json()['data']['total'], 'dsID': dsID, 'indID': indID, 'source': url, 'is_number': True } orm.Value(**value).save()
def do_indicator(ind="566"):
    """Scrape one MDG series (CSV export) and store its values.

    ind: numeric series id interpolated into the export handler URL.
    Relies on module-level `value_template`, `dl`, `orm`, `messytables`,
    `xypath` and `re`.
    """
    baseurl = "http://mdgs.un.org/unsd/mdg/Handlers/ExportHandler.ashx?Type=Csv&Series=%s"
    url = baseurl % ind
    value_template['source'] = url
    handle = dl.grab(url)
    mt, = messytables.any.any_tableset(handle).tables
    table = xypath.Table.from_messy(mt)
    country_anchor = table.filter("Country").assert_one()
    # Raw string: "\d" is an invalid escape in a plain literal
    # (DeprecationWarning on modern Pythons).
    years = country_anchor.fill(xypath.RIGHT).filter(re.compile(r"\d\d\d\d"))
    countries = country_anchor.fill(xypath.DOWN)
    indicator = table.filter("Series").shift(xypath.DOWN).value
    SEPARATOR = ', '
    if SEPARATOR in indicator:
        # Everything before the LAST separator is the name; the final
        # piece is the unit (e.g. "Population, total, thousands").
        i_name, _, i_unit = indicator.rpartition(SEPARATOR)
    else:
        i_name = indicator
        i_unit = ''
    value_template['indID'] = indicator
    assert i_name
    indicator = {'indID': indicator, 'name': i_name, 'units': i_unit}
    orm.Indicator(**indicator).save()
    # countries also gets some rubbish, but junction will ignore it.
    for c_cell, y_cell, v_cell in countries.junction(years):
        value = dict(value_template)
        value['region'] = c_cell.value
        value['period'] = y_cell.value
        value['value'] = v_cell.value
        orm.Value(**value).save()
def getstats(url, country="PLACEHOLDER"): handle = dl.grab(url) mts = messytables.any.any_tableset(handle) saves = 0 for mt in mts.tables: table = xypath.Table.from_messy(mt) inds = table.filter(lambda b: b.x == 0 and "EPI" in b.value) if not inds: continue assert len(inds) == 1 top, = table.filter(lambda b: 'to the top' in b.value) value, = inds.junction(top) for ind in inds: split = split_ind(ind.value) values_tosave = dict(value_template) values_tosave['source'] = url values_tosave['region'] = country values_tosave['value'] = value[2].value indicator = { 'indID': split['indID'], 'name': split['indID'], 'units': split['units'] } orm.Indicator(**indicator).save() values_tosave['indID'] = split['indID'] orm.Value(**values_tosave).save() saves = saves + 1 if saves != 1: print "huh, %d saves for %r" % (saves, url)
def main(): for sheet in spreadsheets: print sheet shortname = sheet.split('/')[-1].split('.')[0] dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0] year_text, = re.findall('\d{4}', dsID) dataset = { "dsID": dsID, "last_updated": year_text, "last_scraped": orm.now(), "name": "esa-unpd" } orm.DataSet(**dataset).save() indicator = {"indID": shortname, "name": shortname, "units": ''} # we replace the indicator name, so not saving now. # orm.Indicator(**indicator).save() value_template = {"dsID": dsID, "is_number": True, "source": sheet} raw = dl.grab(sheet) mtables = messytables.any.any_tableset(raw) names = [x.name for x in mtables.tables] if 'ESTIMATES' in names: mt = mtables['ESTIMATES'] else: mt = mtables['PROPORTION-URBAN'] table = xypath.Table.from_messy(mt) filestring = table.filter( re.compile("File[^:]*:.*")).assert_one().value indicator['name'], indicator['units'] = parse_file_string(filestring) print indicator['name'] orm.Indicator(**indicator).save() region_header = table.filter( re.compile("Major area, region, country or area.*")).assert_one() ccode_header = table.filter(re.compile("Country.code")).assert_one() regions = region_header.fill(xypath.DOWN) years = ccode_header.fill(xypath.RIGHT) for region_cell, year_cell, value_cell in regions.junction(years): value = dict(value_template) value['indID'] = indicator['indID'] value['region'] = region_cell.value year_value = year_cell.value if isinstance(year_value, basestring) and '-' in year_value: year1, _, year2 = year_value.partition('-') year_count = int(year2) - int(year1) assert year_count == 5 year_value = "%s/P%dY" % (year1, year_count) value['period'] = year_value value['value'] = value_cell.value orm.Value(**value).save() #print value orm.session.commit()
def do_indicator(ind): print "indicator:", ind fh = dl.grab(baseurl % ind) mt, = messytables.commas.CSVTableSet(fh).tables mt_list = list(mt) try: headers = mt_list[0] except IndexError: headers = [] if len(headers) == 0: print "Error getting headers from ", ind raise RuntimeError("No header in {}".format(ind)) logging.warn("headers {!r}".format(headers)) rest = mt_list[1:] for row in rest: if len(row) == 0: continue # skip empty row rowdict = {x[0].value: x[1].value for x in zip(headers, row)} try: name, unit = units(rowdict['GHO (DISPLAY)']) except Exception: fh.seek(0) print fh.read() raise indID = rowdict['GHO (CODE)'] for lookup in [ "SEX", "RESIDENCEAREATYPE", "EDUCATIONLEVEL", "WEALTHQUINTILE" ]: lookup_code = lookup + " (CODE)" lookup_name = lookup + " (DISPLAY)" if lookup_code in rowdict: # header = "SEX (CODE)" if rowdict[lookup_code]: # value != "" indID = indID + "({}={})".format(lookup, rowdict[lookup_code]) name = name + " - " + rowdict[lookup_name] value_dict = { "value": rowdict['Display Value'], "period": rowdict['YEAR (DISPLAY)'], "indID": indID, "region": rowdict["COUNTRY (CODE)"], "dsID": "athena-api", "source": baseurl % ind, "is_number": True } indicator_dict = {'indID': indID, 'name': name, 'units': unit} orm.Indicator(**indicator_dict).save() orm.Value(**value_dict).save()
def doit(targets, names, year): # country_cells: we used to assert_one(), but sometimes there's two! country_cells = table.filter('iso').fill(xypath.DOWN) country_cells = country_cells - country_cells.filter('iso') # remove other if not country_cells: print "no countries" country_year_filter = country_cells.filter( lambda b: b.shift(xypath.RIGHT).value == year) if not country_year_filter: print "no countries for ", year target_cells = table.filter(lambda b: b.value in targets) if not target_cells: print "didn't find ", targets value = { 'dsID': 'emdat', 'period': "%s/P1Y" % (year), 'source': url, 'is_number': True } dataset = { 'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT' } orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_dam': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for cname, one_country_cells in itertools.groupby(country_year_filter, lambda b: b.value): value['region'] = cname one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname) for target_cell in target_cells: j = one_country_bag.junction(target_cell) value['indID'] = 'emdat:%s' % target_cell.value value['value'] = sum(int(x[2].value) for x in j) orm.Value(**value).save() print value orm.session.commit()
def doit(): # country_cells: we used to assert_one(), but sometimes there's two! dataset = { 'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT' } orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_damage': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for country in country_list(): # TODO country_list print country raw = dl.grab(url.format(country)) m_tables = messytables.any.any_tableset(raw) mt, = m_tables.tables table = xypath.Table.from_messy(mt) yr = table.filter('year').assert_one() years = yr.fill(xypath.DOWN) cats = yr.fill(xypath.RIGHT) for year, cat, value in years.junction(cats): value = { 'dsID': 'emdat', 'region': country, 'indID': 'emdat:{}'.format(cat.value), 'period': '{}/P1Y'.format(year.value), 'value': value.value, 'source': url, 'is_number': True } orm.Value(**value).save() orm.session.commit()
def do_products(): for product in ocha_products: print product niceproduct = product.replace(" ", "_") indID = "reliefweb_" + niceproduct indicator = { 'indID': indID, 'name': "Number of ReliefWeb reports flagged with ocha_product: %s" % product, 'units': 'uno' } orm.Indicator(**indicator).save() for country in countries: for year in range(1990, 2014): params = dict() params['PRODUCT'] = product params['COUNTRY'] = country params['FROM'] = 1000 * yeartotimestamp(year) params['TO'] = 1000 * yeartotimestamp(year + 1) - 1 url = "http://api.rwlabs.org/v0/report/list" r = requests.get(url, data=get_product_query(**params)) if 'data' not in r.json(): print r.json() print country continue value = { 'region': country, 'period': str(year), 'value': r.json()['data']['total'], 'dsID': dsID, 'indID': indID, 'source': url, 'is_number': True } orm.Value(**value).save()
def do_indicator(ind): fh = dl.grab(baseurl % ind) mt, = messytables.commas.CSVTableSet(fh).tables mt_list = list(mt) headers = mt_list[0] if len(headers) == 0: print ind exit() rest = mt_list[1:] for row in rest: if len(row) == 0: continue # skip empty row rowdict = {x[0].value: x[1].value for x in zip(headers, row)} name, unit = units(rowdict['GHO (DISPLAY)']) if 'SEX (CODE)' in rowdict: indID = rowdict['GHO (CODE)'] + "-" + rowdict['SEX (CODE)'] name = name + " - " + rowdict['SEX (DISPLAY)'] else: indID = rowdict['GHO (CODE)'] value_dict = { "value": rowdict['Display Value'], "period": rowdict['YEAR (DISPLAY)'], "indID": indID, "region": rowdict["COUNTRY (CODE)"], "dsID": "athena-api", "source": baseurl % ind, "is_number": True } indicator_dict = {'indID': indID, 'name': name, 'units': unit} orm.Indicator(**indicator_dict).save() orm.Value(**value_dict).save() print value_dict
DataSet: dsID, last_updated, last_scraped, name Indicator: indID, name, units """ dataset = {'dsID': 'wikipedia', 'last_updated': None, # TODO 'last_scraped': orm.now(), 'name': 'Wikipedia'} orm.DataSet(**dataset).save() for h in headers: indicator = {'indID': 'wikipedia:' + h, 'name': 'Wikipedia: ' + h, 'units': 'url'} orm.Indicator(**indicator).save() value_template = {'dsID': 'wikipedia', 'period': None, 'is_number': False} def exact_match(level, header, country): matches = [x.lower() for x in country[level] if x == header] if len(matches) > 1: print matches, "\n\n\n\n" if matches: return matches[0] def partial_match(level, header, country):
def save_indicator():
    """Persist the module-level indicator definition (INDID / IND_NAME)
    with fixed units of "count"."""
    orm.Indicator(indID=INDID, name=IND_NAME, units="count").save()
for reg in regions: j = requests.get(baseindexurl % reg).json() for country in j['Countries']: yield { 'region': country['Code'], 'value': baseleafurl % (country['Code'], country['OfficialName']) } print list(accuweather()) orm.DataSet(dsID="accuweather", last_updated=None, last_scraped=orm.now(), name="Accuweather").save() orm.Indicator(indID="accuweather_url", name="AccuWeather URL", units="").save() valuetemplate = { 'dsID': 'accuweather', 'indID': 'accuweather_url', 'period': None, 'source': 'http://www.accuweather.com' } for datarow in accuweather(): olap_row = dict(valuetemplate) olap_row.update(datarow) orm.Value(**olap_row).save()
} indicators = [{ 'indID': 'm49-name', 'name': 'm49-name', 'units': 'string' }, { 'indID': 'm49-num', 'name': 'm49-num', 'units': 'string' }] orm.DataSet(**dataset).save() for x in indicators: orm.Indicator(**x).save() mt = messytables.HTMLTableSet(fh) mt_prune = [x for x in mt.tables if len(list(x)) > 200] assert len(mt_prune) == 1 gb = False for messy in mt_prune: table = xypath.Table.from_messy(messy) alpha_code_header = table.filter( contains_string("ISO ALPHA-3")).assert_one() country_header = table.filter(contains_string("or area name")).assert_one() num_code_header = table.filter(contains_string("Numerical")).assert_one() countries = country_header.fill(xypath.DOWN) num_code_header = countries.shift(x=-1) alpha_j = alpha_code_header.junction(countries)