Example #1
0
    "ISO Currency Code": xypath.RIGHT,
    "Short Name": xypath.RIGHT,
    "Formal Name": xypath.RIGHT,
    "Capital City": xypath.RIGHT,
    "Languages": xypath.RIGHT,
    "Currency Abbr.": xypath.DOWN
}

# Field reference left by the original author:
#   Value:     dsID, region, indID, period, value, source, is_number
#   DataSet:   dsID, last_updated, last_scraped, name
#   Indicator: indID, name, units

# Register the 'unterm' dataset; last_updated is stored as an empty string.
dataset_data = dict(
    dsID='unterm',
    last_updated="",
    last_scraped=orm.now(),
    name='unterm',
)
DataSet(**dataset_data).save()

# Build one indicator record per entry of `indicators`, prefixing the
# indicator id with 'unterm:', then persist each record.
indicator_data = []
for i in indicators:
    indicator_data.append(dict(indID='unterm:' + i, name=i, units=''))
for db_row in indicator_data:
    Indicator(**db_row).save()

# Fields that are identical on every Value row of this dataset.
value_static = dict(dsID='unterm', period='', is_number=False)
Example #2
0
# Set up root logging with the library defaults.
logging.basicConfig()

# Field reference left by the original author:
#   Value:     dsID, region, indID, period, value, source, is_number
#   DataSet:   dsID, last_updated, last_scraped, name
#   Indicator: indID, name, units

dsID = "data.undp.org"

# URL template; the '{}' placeholder sits where the resource id goes.
data_url = "http://data.undp.org/resource/{}.json"

# Dataset record for this scraper run, saved immediately.
dataset = dict(
    dsID=dsID,
    last_updated=None,  # TODO max(pubdate)
    last_scraped=orm.now(),
    name="UNDP Open Data",
)
DataSet(**dataset).save()

lookup = [
    {
        'soc': 'ku9i-8fxp',
        'fieldname': 'gender_inequality_index_value_2013',
        'indID': 'HDR:68606',
        'unit': 'Index'
    },
    {
        'soc': 'myer-egms',
        'fieldname': '_2013_gross_national_income_gni_per_capita_2011_ppp',
        'indID': 'chd.eco.135',
        'unit': '2011 PPP $',
Example #3
0
def getindicator(ind="100106", overridefunction=None):
    """Scrape one HDRStats indicator page and store its contents via the ORM.

    Saves the fixed 'HDR:HDI Rank' indicator, the 'HDRStats' dataset row,
    the indicator described by the page heading, and one Value per
    country/HDI-rank cell and per country/year cell, then commits the
    session.  Cells whose stripped text is '..' are not saved.

    Args:
        ind: indicator id as a string; interpolated into the download URL
            and used to build the 'HDR:<ind>' indicator id.
        overridefunction: optional zero-argument callable returning
            ``(html, baseurl)``.  When supplied, no HTTP request is made.

    Raises:
        ValueError: from the single-element unpacking when the page does
            not contain exactly one non-empty ``<br>`` tail, or that tail
            does not contain exactly one 'Accessed:...from' match.
        IndexError: when the page has no ``<h2>`` text or no table.
        Errors raised by requests, messytables, xypath, dateutil or the
        ORM are not caught here.
    """
    if overridefunction:
        html, baseurl = overridefunction()
    else:
        baseurl = 'http://hdrstats.undp.org/en/indicators/display_cf_xls_indicator.cfm?indicator_id=%s&lang=en' % ind
        html = requests.get(baseurl).content

    # Fields shared by every Value row written below.
    value = {
        'dsID': 'HDRStats',
        'indID': "HDR:" + ind,
        'source': baseurl,
        'is_number': True
    }

    dataset = {
        'dsID': 'HDRStats',
        'last_scraped': orm.now(),
        'name': 'Human Development Indicators, UNDP'
    }

    indicator = {'indID': "HDR:" + ind}
    hdi_indicator = {
        'indID': 'HDR:HDI Rank',
        'name': 'Human Development Index rank',
        'units': ''
    }
    Indicator(**hdi_indicator).save()

    htmlio = StringIO.StringIO(html)
    messy = messytables.html.HTMLTableSet(htmlio)
    # Only the first table on the page is used.
    table = xypath.Table.from_messy(list(messy.tables)[0])
    root = lxml.html.fromstring(html)

    # Indicator name and units are taken from the last <h2> on the page,
    # expected to look like "Name (units)".
    indicator_text = root.xpath("//h2/text()")[-1]
    print(indicator_text)
    try:
        indicator_split, = re.findall(r"(.*)\(([^\(\)]+)\)", indicator_text)
    except ValueError:
        # Zero or several matches: use the whole heading as the name and
        # leave the units empty.
        indicator_split = [indicator_text, ""]
    indicator['name'], indicator['units'] = indicator_split
    indicator['name'] = indicator['name'].strip()

    # The access date is in the text that follows a <br>; exactly one
    # non-blank tail is expected.
    access_text, = [
        br.tail.strip() for br in root.xpath("//br")
        if br.tail and br.tail.strip()
    ]
    access_date_raw, = re.findall('Accessed:(.*)from', access_text)
    dataset['last_updated'] = dateutil.parser.parse(
        access_date_raw).isoformat()
    print("%s %s * %s" % (dataset['last_updated'], indicator['name'],
                          indicator['units']))

    # Saved only once last_updated is known so the stored row carries it.
    DataSet(**dataset).save()
    Indicator(**indicator).save()

    country_cell = table.filter("Country").assert_one()
    years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '')
    countries = country_cell.fill(xypath.DOWN)
    hdi_rank = table.filter("HDI Rank").assert_one()

    # Each junction item is indexed as: [0] country cell, [1] header cell
    # (HDI rank or year), [2] the cell holding the value.
    for i in countries.junction(hdi_rank):
        newvalue = dict(value)
        newvalue['indID'] = "HDR:HDI Rank"
        newvalue['region'] = get_region(i[0])
        newvalue['value'] = i[2].value.strip()
        # TODO Hard coded for now because year it pertains to is not clear
        newvalue['period'] = 2012
        if newvalue['value'] != '..':
            Value(**newvalue).save()

    for i in countries.junction(years):
        newvalue = dict(value)
        newvalue['region'] = get_region(i[0])
        newvalue['value'] = i[2].value.strip()
        newvalue['period'] = i[1].value.strip()
        if newvalue['value'] != '..':
            Value(**newvalue).save()
        print(newvalue)
    session.commit()