def getcountry(threeletter="PAK"):
    """Scrape one country's World Bank metadata workbook into the database.

    Downloads the ``<code>_Country_MetaData_en_EXCEL.xls`` workbook for
    *threeletter*, reads its first sheet and, for every (indicator, year)
    cell, saves a ``Value`` row unless ``Value.is_blank()`` reports it as
    blank.  The matching ``Indicator`` row is saved once per indicator.

    :param threeletter: country code substituted into the download URL and
        stored as the ``region`` of every value.
    :returns: ``None``.  Returns early, without touching the database, when
        ``dl.grab`` yields a falsy handle.
    """
    print(threeletter)
    baseurl = "http://api.worldbank.org/datafiles/%s_Country_MetaData_en_EXCEL.xls"
    source = baseurl % threeletter
    # Fields shared by every value scraped from this workbook.
    value = {'dsID': 'World Bank',
             'region': threeletter,
             'source': source,
             'is_number': True}

    # NOTE(review): presumably the [404] argument tells dl.grab to tolerate
    # a missing workbook and return a falsy handle -- confirm against dl.
    fh = dl.grab(source, [404])
    if not fh:
        return

    messy = messytables.excel.XLSTableSet(fh)
    table = xypath.Table.from_messy(list(messy.tables)[0])

    # The cells matching indicator_list are located first; the cell one
    # column to their left supplies the identifier that gets stored.
    indicators = table.filter(is_in(indicator_list))
    indname = indicators.shift(x=-1)
    if len(indname) != len(indicator_list):
        print("missing indicators %s" % ([x.value for x in indname],))

    # Year headers are the cells to the right of the 'Indicator Code' header.
    code = table.filter(equal_to('Indicator Code'))
    years = code.fill(xypath.RIGHT)
    junction = indname.junction(years)

    # "Name (units)" -> ("Name ", "units").
    # NOTE(review): the captured name keeps any whitespace that precedes
    # the opening parenthesis; it is stored unstripped.
    name_units_re = re.compile(r'(.*)\((.*)\)')

    # The junction yields one triple per (indicator, year) pair, so each
    # indicator shows up once per year; save its Indicator row only once.
    saved_indicators = set()
    for ind_cell, year_cell, value_cell in junction:
        ind_id = ind_cell.value
        if ind_id not in saved_indicators:
            indicator = {'indID': ind_id}
            nameunits = name_units_re.search(ind_id)
            if nameunits:
                indicator['name'], indicator['units'] = nameunits.groups()
            else:
                indicator['name'] = ind_id
                indicator['units'] = 'uno'
            Indicator(**indicator).save()
            saved_indicators.add(ind_id)

        vdict = dict(value)
        vdict['indID'] = ind_id
        vdict['period'] = year_cell.value
        vdict['value'] = value_cell.value
        v = Value(**vdict)
        if not v.is_blank():
            v.save()

    # Progress trace: number of World Bank values visible to the session.
    print(len(session.query(Value).filter(Value.dsID == 'World Bank').all()))
    session.commit()
def export(self, meta):
    """Save the indicator described by *meta* and all of its values.

    *meta* is read for the keys ``indID``, ``fieldname`` and ``unit``
    (required) and ``period`` and ``is_number`` (optional).  One
    ``Indicator`` is saved, then one ``Value`` for every item returned by
    ``self.extract(meta['fieldname'])`` that has both a region and a
    truthy value.  Items without a region are logged and skipped.

    ``is_number`` defaults to ``True`` only when *meta* does not supply it
    (or supplies ``None``); an explicit ``False`` is preserved.
    """
    ind = {
        'indID': meta['indID'],
        'name': self.name_for_fieldname(meta['fieldname']),
        'units': meta['unit'],
    }
    Indicator(**ind).save()

    # `meta.get('is_number') or True` would turn an explicit False into
    # True, so test for absence instead of truthiness.
    is_number = meta.get('is_number')
    if is_number is None:
        is_number = True

    for item in self.extract(meta['fieldname']):
        if not item.get('region'):
            logging.warning("No region in {}".format(meta))
            continue
        value = {
            'dsID': dsID,
            'region': item['region'],
            # Fall back to a period derived from the field name.
            'period': meta.get('period') or get_period(meta['fieldname']),
            'value': item['value'],
            'indID': meta['indID'],
            'source': self.url,
            'is_number': is_number,
        }
        # The region is known to be truthy here (checked above).
        # NOTE(review): a falsy value such as numeric 0 is skipped as well
        # -- confirm that dropping zeros is intended.
        if value['value']:
            print(value)
            Value(**value).save()
# NOTE(review): the statements down to ``session.commit()`` appear to be the
# tail of a function whose ``def`` line lies outside this view (they use
# ``table``, ``indname`` and ``value``, none of which is bound here).  They
# are indented as a function body on that assumption -- confirm.
    # Year headers: the cells to the right of the 'Indicator Code' header.
    code = table.filter(equal_to('Indicator Code'))
    years = code.fill(xypath.RIGHT)
    # One (indicator cell, year cell, value cell) triple per pairing.
    junction = indname.junction(years)
    for ind_cell, year_cell, value_cell in junction:
        # Start from the shared fields, then add the per-cell ones.
        vdict = dict(value)
        vdict['indID'] = ind_cell.value
        vdict['period'] = year_cell.value
        vdict['value'] = value_cell.value
        indicator = {'indID': vdict['indID']}
        # Split "Name (units)" into its two parts when the text has
        # parentheses; otherwise use the whole text and the units 'uno'.
        nameunits = re.search('(.*)\((.*)\)', vdict['indID'])
        if nameunits:
            (indicator['name'], indicator['units']) = nameunits.groups()
        else:
            indicator['name'] = vdict['indID']
            indicator['units'] = 'uno'
        Indicator(**indicator).save()
        v = Value(**vdict)
        # Values the model reports as blank are not stored.
        if not v.is_blank():
            v.save()
    # Progress trace: number of 'World Bank' values visible to the session.
    print len(session.query(Value).filter(Value.dsID == 'World Bank').all())
    session.commit()

# Driver: scrape every country in turn.  A failure prints the country and
# the exception for context and then re-raises, so the run stops there.
for country in getcountrylist():
    try:
        getcountry(country)
    except Exception, e:
        print country, e
        raise
def parse_rank(socrata_id, countries):
    """Generate an HDI-rank value record for each country that has one.

    Entries of *countries* lacking an ``'hdi_rank'`` key are passed over.
    Each yielded dict carries indicator ``PSE220``, the rank converted with
    ``int`` and a source built from ``data_url`` and *socrata_id*.
    """
    for entry in countries:
        if 'hdi_rank' not in entry:
            continue
        record = {
            "dsID": dsID,
            "region": entry['country'],
            "period": 2012,  # TODO
            "value": int(entry['hdi_rank']),
            "indID": "PSE220",
            "source": data_url.format(socrata_id),
            "is_number": True,
        }
        yield record


# --- script body: store the dataset, then each indicator with its values ---
DataSet(**dataset).save()
maxdate = None

for socrata_code in lookup:
    ind = get_metadata(socrata_code)
    Indicator(**ind).save()
    for value in get_numbers(socrata_code):
        Value(**value).save()

# The HDI rank is stored as an indicator of its own.
print("rank")
ind = {
    "indID": "PSE220",
    "name": "HDI Rank",
    "units": "rank",
}
Indicator(**ind).save()
for rank in get_rank("u2dx-y6wx"):
    Value(**rank).save()
# NOTE(review): the first three statements appear to be the tail of a
# function whose ``def`` line lies outside this view (``html`` and ``url``
# are not bound here, and there is a ``return``).  They are indented as a
# function body on that assumption -- confirm.
    root = lxml.html.fromstring(html)
    # Rewrite relative hrefs against ``url`` so the returned links are absolute.
    root.make_links_absolute(url)
    # Every href of every anchor in the document.
    return root.xpath('//a/@href')

# Scrape each country page: locate the labelled cells listed in
# ``indicators`` and store one Value per label found.
for country in country_urls():
    print country
    html = requests.get(country).content
    root = lxml.html.fromstring(html)
    # Tables that come before a <font> element whose text is "French".
    # NOTE(review): presumably this isolates the English section -- confirm.
    eng_tables = root.xpath('//table[following::font/text()="French"]')
    # Re-serialise those tables into one HTML string for messytables.
    eng_text = ''.join(lxml.html.tostring(table) for table in eng_tables)
    data = {}
    for m_table in messytables.any.any_tableset(
            StringIO.StringIO(eng_text)).tables:
        table = xypath.Table.from_messy(m_table)
        for ind in indicators:
            # ``ind`` is the label to find; ``indicators[ind]`` is passed to
            # shift() to step from the label cell to the cell holding its value.
            target = table.filter(ind)
            if target:
                data[ind] = target.shift(indicators[ind]).value.strip()
    for item in data:
        value_data = dict(value_static)
        value_data['indID'] = 'unterm:' + item
        # Re-encode as latin-1 and decode as UTF-8.
        # NOTE(review): presumably repairs text that was mis-decoded
        # upstream -- confirm.
        value_data['value'] = data[item].encode('latin1').decode('utf-8')
        value_data['region'] = data['ISO Country alpha-3-code']
        value_data['source'] = country
        # First ten characters of the ISO timestamp, i.e. today's YYYY-MM-DD.
        value_data['period'] = datetime.datetime.now().isoformat()[:10]
        # Empty values are not stored.
        if value_data['value']:
            Value(**value_data).save()
    # Every configured label must have been found on the page.
    # NOTE(review): placed inside the per-country loop because ``data`` is
    # rebuilt for each country -- confirm against the original layout.
    assert len(data) == len(indicators)
# NOTE(review): assumed to run once, after all countries -- confirm.
session.commit()
def getindicator(ind="100106", overridefunction=None):
    """Scrape one HDRStats indicator table and store its values.

    Saves the ``HDRStats`` dataset, the fixed ``HDR:HDI Rank`` indicator and
    the indicator ``HDR:<ind>`` (name and units taken from the last ``<h2>``
    on the page).  It then saves one ``Value`` per country for the HDI rank
    column and one per (country, year) cell, skipping cells whose text is
    ``'..'``.  The session is committed at the end.

    :param ind: indicator id; substituted into the download URL and used,
        prefixed with ``HDR:``, as the stored ``indID``.
    :param overridefunction: optional callable returning ``(html, baseurl)``;
        when given, nothing is downloaded and its result is used instead.
    :raises ValueError: if the page does not contain exactly one non-empty
        text following a ``<br>``, or exactly one ``Accessed:...from`` match.
    """
    if not overridefunction:
        baseurl = 'http://hdrstats.undp.org/en/indicators/display_cf_xls_indicator.cfm?indicator_id=%s&lang=en' % ind
        html = requests.get(baseurl).content
    else:
        html, baseurl = overridefunction()

    # Fields shared by every value scraped from this page.
    value = {
        'dsID': 'HDRStats',
        'indID': "HDR:" + ind,
        'source': baseurl,
        'is_number': True,
    }
    dataset = {
        'dsID': 'HDRStats',
        'last_scraped': orm.now(),
        'name': 'Human Development Indicators, UNDP',
    }
    indicator = {'indID': "HDR:" + ind}
    hdi_indicator = {
        'indID': 'HDR:HDI Rank',
        'name': 'Human Development Index rank',
        'units': '',
    }
    Indicator(**hdi_indicator).save()
    DataSet(**dataset).save()

    htmlio = StringIO.StringIO(html)
    messy = messytables.html.HTMLTableSet(htmlio)
    table = xypath.Table.from_messy(list(messy.tables)[0])
    root = lxml.html.fromstring(html)

    # Indicator name / units: the last <h2> reads "Name (units)".  Anything
    # other than exactly one match falls back to the full text, no units.
    indicator_text = root.xpath("//h2/text()")[-1]
    print(indicator_text)
    try:
        indicator_split, = re.findall(r"(.*)\(([^\(\)]+)\)", indicator_text)
    except ValueError:
        indicator_split = [indicator_text, ""]
    indicator['name'], indicator['units'] = indicator_split
    indicator['name'] = indicator['name'].strip()

    # Update time: the single non-empty text that follows a <br>.  Tails are
    # tested directly (not via str()) so that non-ASCII text cannot raise.
    access_text, = [
        x.tail.strip() for x in root.xpath("//br")
        if x.tail and x.tail.strip()
    ]
    access_date_raw, = re.findall('Accessed:(.*)from', access_text)
    # NOTE(review): last_updated is set after DataSet(**dataset).save() has
    # already run, so it is never passed to DataSet -- confirm intent.
    dataset['last_updated'] = dateutil.parser.parse(
        access_date_raw).isoformat()
    print("%s %s * %s" % (dataset['last_updated'],
                          indicator['name'],
                          indicator['units']))
    Indicator(**indicator).save()

    # Table layout: "Country" heads the country column; the non-empty cells
    # to its right are the year headers.
    country_cell = table.filter("Country").assert_one()
    years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '')
    countries = country_cell.fill(xypath.DOWN)
    hdi_rank = table.filter("HDI Rank").assert_one()

    # One HDI-rank value per country.
    for region_cell, _rank_header, rank_cell in countries.junction(hdi_rank):
        newvalue = dict(value)
        newvalue['indID'] = "HDR:HDI Rank"
        newvalue['region'] = get_region(region_cell)
        newvalue['value'] = rank_cell.value.strip()
        # TODO Hard coded for now because year it pertains to is not clear
        newvalue['period'] = 2012
        if newvalue['value'] != '..':
            Value(**newvalue).save()

    # One value per (country, year) cell; '..' marks a cell without data.
    for region_cell, year_cell, data_cell in countries.junction(years):
        newvalue = dict(value)
        newvalue['region'] = get_region(region_cell)
        newvalue['value'] = data_cell.value.strip()
        newvalue['period'] = year_cell.value.strip()
        if newvalue['value'] != '..':
            Value(**newvalue).save()

    # Progress trace: the last record processed.
    print(newvalue)
    session.commit()