"ISO Currency Code": xypath.RIGHT, "Short Name": xypath.RIGHT, "Formal Name": xypath.RIGHT, "Capital City": xypath.RIGHT, "Languages": xypath.RIGHT, "Currency Abbr.": xypath.DOWN } dataset_data = { 'dsID': 'unterm', 'last_updated': "", 'last_scraped': orm.now(), 'name': 'unterm' } DataSet(**dataset_data).save() indicator_data = [{ 'indID': 'unterm:' + i, 'name': i, 'units': '' } for i in indicators] for db_row in indicator_data: Indicator(**db_row).save() """Value: dsID, region, indID, period, value, source, is_number DataSet: dsID, last_updated, last_scraped, name Indicator: indID, name, units """ value_static = {'dsID': 'unterm', 'period': '', 'is_number': False}
logging.basicConfig() """Value: dsID, region, indID, period, value, source, is_number DataSet: dsID, last_updated, last_scraped, name Indicator: indID, name, units """ dsID = "data.undp.org" dataset = { "dsID": dsID, "last_updated": None, # TODO max(pubdate) "last_scraped": orm.now(), "name": "UNDP Open Data" } DataSet(**dataset).save() data_url = "http://data.undp.org/resource/{}.json" lookup = [ { 'soc': 'ku9i-8fxp', 'fieldname': 'gender_inequality_index_value_2013', 'indID': 'HDR:68606', 'unit': 'Index' }, { 'soc': 'myer-egms', 'fieldname': '_2013_gross_national_income_gni_per_capita_2011_ppp', 'indID': 'chd.eco.135', 'unit': '2011 PPP $',
def getindicator(ind="100106", overridefunction=None):
    """Scrape one HDRStats indicator table and store it through the ORM.

    The page is parsed twice: as a table (messytables + xypath) to read the
    per-country values, and as an lxml tree to read the indicator heading
    and the "Accessed: ... from" date.

    Saved records:
      * the fixed 'HDR:HDI Rank' indicator and one value per country for it,
      * the 'HDR:<ind>' indicator, named from the last <h2> on the page,
      * the 'HDRStats' dataset, with ``last_updated`` taken from the page,
      * one value per (country, year) cell whose text is not '..'.

    Args:
        ind: Indicator id, interpolated into the download URL and used to
            build the 'HDR:<ind>' indicator id.
        overridefunction: Optional callable taking no arguments and
            returning ``(html, baseurl)``; when given, nothing is
            downloaded and its result is used instead.

    Raises:
        ValueError: if the page does not contain exactly one non-empty
            text tail after a <br>, or exactly one 'Accessed:...from' match.
    """
    if not overridefunction:
        baseurl = ('http://hdrstats.undp.org/en/indicators/'
                   'display_cf_xls_indicator.cfm?indicator_id=%s&lang=en'
                   % ind)
        html = requests.get(baseurl).content
    else:
        html, baseurl = overridefunction()

    # Template copied for every stored value; region/value/period are
    # filled in per cell below.
    value = {
        'dsID': 'HDRStats',
        'indID': "HDR:" + ind,
        'source': baseurl,
        'is_number': True
    }
    dataset = {
        'dsID': 'HDRStats',
        'last_scraped': orm.now(),
        'name': 'Human Development Indicators, UNDP'
    }
    indicator = {'indID': "HDR:" + ind}
    hdi_indicator = {
        'indID': 'HDR:HDI Rank',
        'name': 'Human Development Index rank',
        'units': ''
    }
    Indicator(**hdi_indicator).save()

    # NOTE: a leftover debugging `print html` / `exit(3)` used to sit here
    # and stopped the process before anything was parsed; it was removed.

    htmlio = StringIO.StringIO(html)
    messy = messytables.html.HTMLTableSet(htmlio)
    table = xypath.Table.from_messy(list(messy.tables)[0])
    root = lxml.html.fromstring(html)

    # Indicator name and units come from the last <h2>, expected to look
    # like "Some name (units)".
    indicator_text = root.xpath("//h2/text()")[-1]
    print(indicator_text)
    try:
        indicator_split, = re.findall(r"(.*)\(([^\(\)]+)\)", indicator_text)
    except ValueError:
        # Zero or several matches: keep the whole heading as the name.
        indicator_split = [indicator_text, ""]
    indicator['name'], indicator['units'] = indicator_split
    indicator['name'] = indicator['name'].strip()

    # The access date lives in the single non-blank text tail after a <br>.
    access_text, = [
        br.tail.strip() for br in root.xpath("//br")
        if br.tail and br.tail.strip()
    ]
    access_date_raw, = re.findall('Accessed:(.*)from', access_text)
    dataset['last_updated'] = dateutil.parser.parse(
        access_date_raw).isoformat()
    print("%s %s * %s" % (dataset['last_updated'], indicator['name'],
                          indicator['units']))

    # Save the dataset only now, so that last_updated is actually stored
    # (it used to be saved before the date had been parsed).
    DataSet(**dataset).save()
    Indicator(**indicator).save()

    country_cell = table.filter("Country").assert_one()
    years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '')
    countries = country_cell.fill(xypath.DOWN)
    hdi_rank = table.filter("HDI Rank").assert_one()

    # Each junction row is indexed as [0] country cell, [1] header cell,
    # [2] the data cell where the two meet.
    for row in countries.junction(hdi_rank):
        newvalue = dict(value)
        newvalue['indID'] = "HDR:HDI Rank"
        newvalue['region'] = get_region(row[0])
        newvalue['value'] = row[2].value.strip()
        # TODO Hard coded for now because year it pertains to is not clear
        newvalue['period'] = 2012
        if newvalue['value'] != '..':  # '..' cells are not stored
            Value(**newvalue).save()

    for row in countries.junction(years):
        newvalue = dict(value)
        newvalue['region'] = get_region(row[0])
        newvalue['value'] = row[2].value.strip()
        newvalue['period'] = row[1].value.strip()
        if newvalue['value'] != '..':  # '..' cells are not stored
            Value(**newvalue).save()
        print(newvalue)

    session.commit()