def do_jobs(): print "jobs" indID = "reliefweb_jobs" indicator = { 'indID': indID, 'name': "Number of jobs on ReliefWeb at specified time", 'units': 'uno' } orm.Indicator(**indicator).save() for country in countries: url = "http://api.rwlabs.org/v0/job/list" r = requests.get(url, data=get_job_query(country)) if 'data' not in r.json(): print r.json() print country continue value = { 'region': country, 'period': orm.now()[:10], # we don't need sub-day precision. 'value': r.json()['data']['total'], 'dsID': dsID, 'indID': indID, 'source': url, 'is_number': True } orm.Value(**value).save()
def do_zip(url): print url fh = dl.grab(url) mt, = list(messytables.zip.ZIPTableSet(fh).tables) fh = None xy = xypath.Table.from_messy(mt) mt = None print "...got" headers = xy.filter(lambda c: c.y == 0) country = headers.filter("Country").assert_one() items = headers.filter("Item").assert_one().fill(xypath.DOWN).filter("Grand Total + (Total)") elements = headers.filter("Element").assert_one().fill(xypath.DOWN).filter("Food supply (kcal/capita/day)") filtered_items = items.select_other(lambda a, b: a.y == b.y, elements) years = country.fill(xypath.RIGHT).filter(re.compile("Y\d\d\d\d$")) for i in filtered_items: values = dict(valuetemplate) values['source'] = url countrycodecell, = i.junction(country) values['region'] = countrycodecell[2].value year_junction = i.junction(years) for _, period, value in year_junction: values['period'] = period.value.replace("Y", "") values['value'] = value.value orm.Value(**values).save() orm.session.commit()
def getstats(url, country="PLACEHOLDER"): handle = dl.grab(url) mts = messytables.any.any_tableset(handle) saves = 0 for mt in mts.tables: table = xypath.Table.from_messy(mt) inds = table.filter(lambda b: b.x == 0 and "EPI" in b.value) if not inds: continue assert len(inds) == 1 top, = table.filter(lambda b: 'to the top' in b.value) value, = inds.junction(top) for ind in inds: split = split_ind(ind.value) values_tosave = dict(value_template) values_tosave['source'] = url values_tosave['region'] = country values_tosave['value'] = value[2].value indicator = { 'indID': split['indID'], 'name': split['indID'], 'units': split['units'] } orm.Indicator(**indicator).save() values_tosave['indID'] = split['indID'] orm.Value(**values_tosave).save() saves = saves + 1 if saves != 1: print "huh, %d saves for %r" % (saves, url)
def do_indicator(ind="566"):
    """Scrape one UN MDG series (CSV export) and save its values.

    Registers an Indicator derived from the sheet's "Series" cell
    (name and units split on the last ", ") and saves one Value per
    country/year junction via the module-level `orm`/`value_template`.
    """
    baseurl = "http://mdgs.un.org/unsd/mdg/Handlers/ExportHandler.ashx?Type=Csv&Series=%s"
    url = baseurl % ind
    value_template['source'] = url
    handle = dl.grab(url)
    mt, = messytables.any.any_tableset(handle).tables
    table = xypath.Table.from_messy(mt)
    country_anchor = table.filter("Country").assert_one()
    # Raw string: "\d\d\d\d" previously relied on unknown-escape passthrough.
    years = country_anchor.fill(xypath.RIGHT).filter(re.compile(r"\d\d\d\d"))
    countries = country_anchor.fill(xypath.DOWN)
    indicator = table.filter("Series").shift(xypath.DOWN).value
    SEPARATOR = ', '
    # Everything before the *last* ", " is the name; the tail is the unit.
    # rpartition is the idiomatic form of join(split[:-1]) / split[-1].
    i_name, sep, i_unit = indicator.rpartition(SEPARATOR)
    if not sep:  # no separator present: whole string is the name
        i_name = indicator
        i_unit = ''
    value_template['indID'] = indicator
    assert i_name
    indicator = {'indID': indicator, 'name': i_name, 'units': i_unit}
    orm.Indicator(**indicator).save()
    # countries also gets some rubbish, but junction will ignore it.
    for c_cell, y_cell, v_cell in countries.junction(years):
        value = dict(value_template)
        value['region'] = c_cell.value
        value['period'] = y_cell.value
        value['value'] = v_cell.value
        orm.Value(**value).save()
def main():
    # Scrape each ESA-UNPD spreadsheet: register the DataSet, derive the
    # indicator name/units from the sheet's "File ...:" cell, then save one
    # Value per (region, period) junction.  Uses the module-level
    # `spreadsheets`, `dl`, `orm`, `messytables`, `xypath`, `re` and
    # `parse_file_string` names.
    for sheet in spreadsheets:
        print sheet
        # e.g. ".../WUP2011-F03-Urban_Population.xls" -> sheet basename
        # without extension.
        shortname = sheet.split('/')[-1].split('.')[0]
        dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0]
        # Exactly one 4-digit year is expected in the dataset id.
        year_text, = re.findall('\d{4}', dsID)
        dataset = {
            "dsID": dsID,
            "last_updated": year_text,
            "last_scraped": orm.now(),
            "name": "esa-unpd"
        }
        orm.DataSet(**dataset).save()
        indicator = {"indID": shortname, "name": shortname, "units": ''}
        # we replace the indicator name, so not saving now.
        # orm.Indicator(**indicator).save()
        value_template = {"dsID": dsID, "is_number": True, "source": sheet}
        raw = dl.grab(sheet)
        mtables = messytables.any.any_tableset(raw)
        names = [x.name for x in mtables.tables]
        # The data sheet is named differently between workbooks; try both.
        if 'ESTIMATES' in names:
            mt = mtables['ESTIMATES']
        else:
            mt = mtables['PROPORTION-URBAN']
        table = xypath.Table.from_messy(mt)
        filestring = table.filter(
            re.compile("File[^:]*:.*")).assert_one().value
        indicator['name'], indicator['units'] = parse_file_string(filestring)
        print indicator['name']
        orm.Indicator(**indicator).save()
        region_header = table.filter(
            re.compile("Major area, region, country or area.*")).assert_one()
        ccode_header = table.filter(re.compile("Country.code")).assert_one()
        regions = region_header.fill(xypath.DOWN)
        years = ccode_header.fill(xypath.RIGHT)
        for region_cell, year_cell, value_cell in regions.junction(years):
            value = dict(value_template)
            value['indID'] = indicator['indID']
            value['region'] = region_cell.value
            year_value = year_cell.value
            # Range headers like "2010-2015" become "2010/P5Y".
            if isinstance(year_value, basestring) and '-' in year_value:
                year1, _, year2 = year_value.partition('-')
                year_count = int(year2) - int(year1)
                # All ranges in these sheets are assumed to span 5 years.
                assert year_count == 5
                year_value = "%s/P%dY" % (year1, year_count)
            value['period'] = year_value
            value['value'] = value_cell.value
            orm.Value(**value).save()
            #print value
        orm.session.commit()
def do_indicator(ind): print "indicator:", ind fh = dl.grab(baseurl % ind) mt, = messytables.commas.CSVTableSet(fh).tables mt_list = list(mt) try: headers = mt_list[0] except IndexError: headers = [] if len(headers) == 0: print "Error getting headers from ", ind raise RuntimeError("No header in {}".format(ind)) logging.warn("headers {!r}".format(headers)) rest = mt_list[1:] for row in rest: if len(row) == 0: continue # skip empty row rowdict = {x[0].value: x[1].value for x in zip(headers, row)} try: name, unit = units(rowdict['GHO (DISPLAY)']) except Exception: fh.seek(0) print fh.read() raise indID = rowdict['GHO (CODE)'] for lookup in [ "SEX", "RESIDENCEAREATYPE", "EDUCATIONLEVEL", "WEALTHQUINTILE" ]: lookup_code = lookup + " (CODE)" lookup_name = lookup + " (DISPLAY)" if lookup_code in rowdict: # header = "SEX (CODE)" if rowdict[lookup_code]: # value != "" indID = indID + "({}={})".format(lookup, rowdict[lookup_code]) name = name + " - " + rowdict[lookup_name] value_dict = { "value": rowdict['Display Value'], "period": rowdict['YEAR (DISPLAY)'], "indID": indID, "region": rowdict["COUNTRY (CODE)"], "dsID": "athena-api", "source": baseurl % ind, "is_number": True } indicator_dict = {'indID': indID, 'name': name, 'units': unit} orm.Indicator(**indicator_dict).save() orm.Value(**value_dict).save()
def do_file(url="http://bit.ly/14FRxGV"): fh = dl.grab(url) mts = messytables.excel.XLSTableSet(fh) xy = discover_table(mts) print "...got" home = xy.filter("(home)") years = home.fill(xypath.RIGHT) countries = home.fill(xypath.DOWN) for country, year, value in countries.junction(years): values = dict(valuetemplate) values['source'] = url values['region'] = country.value values['period'] = niceyear(year.value) # TODO values['value'] = value.value orm.Value(**values).save() orm.session.commit()
def getstats():
    """Scrape the UNDP combined-gross-enrolment page and save one Value
    per (year, country) junction of its first table."""
    url = 'http://hdr.undp.org/en/content/combined-gross-enrolment-education-both-sexes'
    handle = dl.grab(url)
    tableset = messytables.any.any_tableset(handle)
    table = xypath.Table.from_messy(tableset.tables[0])
    anchor, = table.filter(lambda c: 'Country' in c.value)
    year_cells = anchor.fill(xypath.RIGHT)
    country_cells = anchor.fill(xypath.DOWN)
    save_count = 0
    for year_cell, country_cell, value_cell in year_cells.junction(country_cells):
        # NOTE(review): 'period' is never set here — presumably supplied by
        # output_template; confirm.
        row = dict(output_template)
        row['source'] = url
        row['region'] = country_cell.value.strip()
        row['value'] = value_cell.value.strip()
        orm.Value(**row).save()
        save_count += 1
    assert save_count
def doit(targets, names, year):
    # Save EM-DAT disaster totals for `year`: one Value per (country,
    # target column), summing that column's integer cells over every
    # matching row.  Reads the module-level `table` and `url`; writes
    # through `orm`.
    #
    # country_cells: we used to assert_one(), but sometimes there's two!
    country_cells = table.filter('iso').fill(xypath.DOWN)
    country_cells = country_cells - country_cells.filter('iso')  # remove other
    if not country_cells:
        print "no countries"
    # Keep only country cells whose right-hand neighbour holds `year`.
    country_year_filter = country_cells.filter(
        lambda b: b.shift(xypath.RIGHT).value == year)
    if not country_year_filter:
        print "no countries for ", year
    target_cells = table.filter(lambda b: b.value in targets)
    if not target_cells:
        print "didn't find ", targets
    # Shared row template, mutated per country/target in the loop below.
    value = {
        'dsID': 'emdat',
        'period': "%s/P1Y" % (year),
        'source': url,
        'is_number': True
    }
    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()
    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_dam':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()
    # NOTE(review): itertools.groupby only groups *adjacent* items — this
    # assumes the filtered bag yields cells for the same country
    # consecutively; confirm against xypath's iteration order.
    for cname, one_country_cells in itertools.groupby(country_year_filter,
                                                      lambda b: b.value):
        value['region'] = cname
        one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname)
        for target_cell in target_cells:
            j = one_country_bag.junction(target_cell)
            value['indID'] = 'emdat:%s' % target_cell.value
            # Sum the column's values at every junction for this target.
            value['value'] = sum(int(x[2].value) for x in j)
            orm.Value(**value).save()
            print value
    orm.session.commit()
def doit(): # country_cells: we used to assert_one(), but sometimes there's two! dataset = { 'dsID': 'emdat', 'last_updated': None, 'last_scraped': orm.now(), 'name': 'EM-DAT' } orm.DataSet(**dataset).save() for i, t in enumerate(targets): indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'} if t == 'total_damage': indicator['units'] = ",000$ USD" orm.Indicator(**indicator).save() for country in country_list(): # TODO country_list print country raw = dl.grab(url.format(country)) m_tables = messytables.any.any_tableset(raw) mt, = m_tables.tables table = xypath.Table.from_messy(mt) yr = table.filter('year').assert_one() years = yr.fill(xypath.DOWN) cats = yr.fill(xypath.RIGHT) for year, cat, value in years.junction(cats): value = { 'dsID': 'emdat', 'region': country, 'indID': 'emdat:{}'.format(cat.value), 'period': '{}/P1Y'.format(year.value), 'value': value.value, 'source': url, 'is_number': True } orm.Value(**value).save() orm.session.commit()
def do_products(): for product in ocha_products: print product niceproduct = product.replace(" ", "_") indID = "reliefweb_" + niceproduct indicator = { 'indID': indID, 'name': "Number of ReliefWeb reports flagged with ocha_product: %s" % product, 'units': 'uno' } orm.Indicator(**indicator).save() for country in countries: for year in range(1990, 2014): params = dict() params['PRODUCT'] = product params['COUNTRY'] = country params['FROM'] = 1000 * yeartotimestamp(year) params['TO'] = 1000 * yeartotimestamp(year + 1) - 1 url = "http://api.rwlabs.org/v0/report/list" r = requests.get(url, data=get_product_query(**params)) if 'data' not in r.json(): print r.json() print country continue value = { 'region': country, 'period': str(year), 'value': r.json()['data']['total'], 'dsID': dsID, 'indID': indID, 'source': url, 'is_number': True } orm.Value(**value).save()
def do_indicator(ind): fh = dl.grab(baseurl % ind) mt, = messytables.commas.CSVTableSet(fh).tables mt_list = list(mt) headers = mt_list[0] if len(headers) == 0: print ind exit() rest = mt_list[1:] for row in rest: if len(row) == 0: continue # skip empty row rowdict = {x[0].value: x[1].value for x in zip(headers, row)} name, unit = units(rowdict['GHO (DISPLAY)']) if 'SEX (CODE)' in rowdict: indID = rowdict['GHO (CODE)'] + "-" + rowdict['SEX (CODE)'] name = name + " - " + rowdict['SEX (DISPLAY)'] else: indID = rowdict['GHO (CODE)'] value_dict = { "value": rowdict['Display Value'], "period": rowdict['YEAR (DISPLAY)'], "indID": indID, "region": rowdict["COUNTRY (CODE)"], "dsID": "athena-api", "source": baseurl % ind, "is_number": True } indicator_dict = {'indID': indID, 'name': name, 'units': unit} orm.Indicator(**indicator_dict).save() orm.Value(**value_dict).save() print value_dict
def main():
    """Scrape SHEET_URL and save one Value per (region, year) cell of
    its first table, then commit."""
    save_dataset()
    save_indicator()
    raw = dl.grab(SHEET_URL)
    tableset = messytables.any.any_tableset(raw)
    table = xypath.Table.from_messy(tableset.tables[0])
    # Sanity check: the indicator name appears, so we have the right table.
    table.filter(IND_NAME).assert_one()
    header = table.filter(REGION_HEADER_VALUE).assert_one()
    region_cells = header.fill(xypath.DOWN)
    year_cells = header.fill(xypath.RIGHT,
                             stop_before=lambda c: c.value == '')
    assert len(year_cells) < 15  # left side.
    for region_cell, year_cell, value_cell in region_cells.junction(year_cells):
        orm.Value(dsID=DSID,
                  region=region_cell.value,
                  indID=INDID,
                  source=SHEET_URL,
                  is_number=True,
                  period=year_cell.value,
                  value=value_cell.value).save()
    orm.session.commit()
for cand_header in country[level]: if header in cand_header.lower(): matches.append(cand_header) if len(matches) > 1: print matches, "\n\n\n\n" if matches: return matches[0] def best_match(header, country): return exact_match(2, header, country) or \ exact_match(3, header, country) or \ partial_match(2, header, country) or \ partial_match(3, header, country) wikibase = "http://en.wikipedia.org/wiki/%s#%s" headinglist = headings() for country in headinglist: matches = [best_match(header, headinglist[country]) for header in headers] d_matches = dict(zip(headers, matches)) for head in d_matches: if d_matches[head]: value = dict(value_template) value['region'] = country value['indID'] = "wikipedia:" + head value['source'] = wikibase % (country, d_matches[head]) value['value'] = value['source'] print value orm.Value(**value).save()
# Scrape the UN M49 country list: save one 'm49-name' Value (country name,
# keyed by alpha-3 code) and one 'm49-num' Value (numeric code, keyed by
# alpha-3 code) per country.  NOTE(review): `table`, `contains_string`,
# `alpha_code_header` and `updated` are defined earlier, outside this excerpt.
country_header = table.filter(contains_string("or area name")).assert_one()
num_code_header = table.filter(contains_string("Numerical")).assert_one()
countries = country_header.fill(xypath.DOWN)
# NOTE(review): rebinds num_code_header from the header cell to the column
# left of the country names — the "Numerical" filter above becomes unused.
num_code_header = countries.shift(x=-1)
alpha_j = alpha_code_header.junction(countries)
num_j = alpha_code_header.junction(num_code_header)
alphas = [[x.value.strip() for x in row[1:]] for row in alpha_j]
nums = [[x.value.strip() for x in row[1:]] for row in num_j]
# 729 - Sudan - SDN
alphas.extend([['Sudan', 'SDN']])
nums.extend([['729', 'SDN']])
v_template = {
    'dsID': 'm49',
    'period': updated,
    'source': 'http://unstats.un.org/unsd/methods/m49/m49alpha.htm',
    'is_number': False
}
builder = []  # NOTE(review): never used below
for entry in alphas:
    v = dict(v_template)
    v.update({'value': entry[0], 'region': entry[1], 'indID': 'm49-name'})
    orm.Value(**v).save()
    # Flag that the UK was seen; checked by the assert at the bottom.
    if 'GBR' in repr(v):
        gb = True
for entry in nums:
    v = dict(v_template)
    v.update({'value': entry[0], 'region': entry[1], 'indID': 'm49-num'})
    orm.Value(**v).save()
orm.session.commit()
# Deliberate scrape sanity check: raises NameError if no GBR row was seen.
assert gb
"dsID": "echo", "period": 2012, "source": baseurl, "is_number": True } xls_raw = dl.grab(baseurl) mt = messytables.excel.XLSTableSet(xls_raw).tables[0] assert mt.name == "GNA Final Index (rank)" xy = xypath.Table.from_messy(mt) countries = xy.filter("ISO3").assert_one().fill(xypath.DOWN) vuln_h = xy.filter("GNA Vulnerability Index (VI)").assert_one() crisis_h = xy.filter("GNA Crisis Index (CI)").assert_one() headerheader(xy.filter("ISO3").assert_one(), xypath.DOWN, xypath.RIGHT) big = { 'region': headerheader(xy.filter("ISO3").assert_one(), xypath.DOWN, xypath.RIGHT), 'indID': { 'gna-vi': vuln_h.fill(xypath.DOWN), 'gna-ci': crisis_h.fill(xypath.DOWN) } } for olap_row in xypath.xyzzy.xyzzy(xy, big, valuename="value"): print olap_row full_olap = dict(value_template) full_olap.update(olap_row) orm.Value(**full_olap).save()
for reg in regions: j = requests.get(baseindexurl % reg).json() for country in j['Countries']: yield { 'region': country['Code'], 'value': baseleafurl % (country['Code'], country['OfficialName']) } print list(accuweather()) orm.DataSet(dsID="accuweather", last_updated=None, last_scraped=orm.now(), name="Accuweather").save() orm.Indicator(indID="accuweather_url", name="AccuWeather URL", units="").save() valuetemplate = { 'dsID': 'accuweather', 'indID': 'accuweather_url', 'period': None, 'source': 'http://www.accuweather.com' } for datarow in accuweather(): olap_row = dict(valuetemplate) olap_row.update(datarow) orm.Value(**olap_row).save()
root = lxml.html.fromstring(html) root.make_links_absolute(baseurl) return root.xpath( "//div[@id='content']//article//a[contains(text(),'xls')]/@href") def keyfunc(item): return (item['YEAR'], item['COUNTRY']) events = [] for url in geturls(): for row in parsesheet(url): if row['EVENT_TYPE'].strip() in NONVIOLENT: print row['EVENT_TYPE'] ## TODO: not working!!!! continue exit() events.append(row) sorted_e = sorted(events, key=keyfunc) for item in itertools.groupby(sorted_e, keyfunc): value_item = { 'dsID': 'acled', 'indID': 'PVX040', 'period': item[0][0], 'region': item[0][1], 'value': len(list(item[1])), 'source': 'http://www.acleddata.com/data/types-and-groups/' } orm.Value(**value_item).save()