Example #1
0
def getcountry(threeletter="PAK"):
    """Scrape the World Bank country metadata workbook for one country.

    Fetches ``<threeletter>_Country_MetaData_en_EXCEL.xls`` through
    ``dl.grab``, reads its first sheet, and for every cell at the junction
    of a matched indicator row and a year column saves an ``Indicator``
    (once per indicator) and a ``Value`` (unless ``Value.is_blank()``).
    Finally prints the number of stored 'World Bank' values and commits
    the session.

    :param threeletter: country code substituted into the workbook URL and
        stored as the ``region`` of every value.
    :returns: ``None``.  Returns early, before saving or committing
        anything, when ``dl.grab`` yields a falsy handle (presumably for
        the 404 status passed to it -- confirm against ``dl.grab``).
    """
    # Single-argument parenthesised print behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    print(threeletter)
    baseurl = "http://api.worldbank.org/datafiles/%s_Country_MetaData_en_EXCEL.xls"
    url = baseurl % threeletter
    # Fields shared by every Value saved for this country.
    value = {'dsID': 'World Bank',
             'region': threeletter,
             'source': url,
             'is_number': True}

    fh = dl.grab(url, [404])
    if not fh:
        return
    messy = messytables.excel.XLSTableSet(fh)
    table = xypath.Table.from_messy(list(messy.tables)[0])
    indicators = table.filter(is_in(indicator_list))
    # The cell immediately to the left of each matched cell is used as the
    # indicator identifier/name below.
    indname = indicators.shift(x=-1)
    if len(indname) != len(indicator_list):
        # Despite the label, this lists the names that WERE found; the
        # missing ones are whatever is absent from this list.
        print("missing indicators %s" % ([x.value for x in indname],))

    code = table.filter(equal_to('Indicator Code'))

    # Header cells to the right of 'Indicator Code' are treated as periods.
    years = code.fill(xypath.RIGHT)
    junction = indname.junction(years)
    # The junction yields one triple per (indicator, year) pair, so the
    # same indicator recurs for every year; parse and save it only once.
    saved_indicators = set()
    for ind_cell, year_cell, value_cell in junction:
        ind_id = ind_cell.value
        if ind_id not in saved_indicators:
            indicator = {'indID': ind_id}
            # Split "Name (units)" into its two parts when it has that form.
            nameunits = re.search(r'(.*)\((.*)\)', ind_id)
            if nameunits:
                indicator['name'], indicator['units'] = nameunits.groups()
            else:
                indicator['name'] = ind_id
                indicator['units'] = 'uno'
            Indicator(**indicator).save()
            saved_indicators.add(ind_id)

        vdict = dict(value)
        vdict['indID'] = ind_id
        vdict['period'] = year_cell.value
        vdict['value'] = value_cell.value
        v = Value(**vdict)
        if not v.is_blank():
            v.save()
    # Let the database count the rows instead of loading every Value
    # object into memory just to take len() of the list.
    print(session.query(Value).filter(Value.dsID == 'World Bank').count())
    session.commit()
Example #2
0
    def export(self, meta):
        """Store the indicator described by ``meta`` and its extracted values.

        Saves one ``Indicator`` built from ``meta``, then one ``Value`` for
        every row yielded by ``self.extract(meta['fieldname'])`` that has
        both a region and a truthy value.  Rows without a region are
        logged and skipped.

        :param meta: mapping with ``indID``, ``fieldname`` and ``unit``
            keys, plus optional ``period`` and ``is_number`` keys.  When
            ``period`` is missing or falsy, ``get_period`` is called with
            the field name instead.  ``is_number`` defaults to ``True``.
        """
        ind = {
            'indID': meta['indID'],
            'name': self.name_for_fieldname(meta['fieldname']),
            'units': meta['unit']
        }
        Indicator(**ind).save()

        # ``meta.get('is_number') or True`` is always True, which silently
        # discards an explicit False; only fall back when nothing was given.
        is_number = meta.get('is_number')
        if is_number is None:
            is_number = True

        for item in self.extract(meta['fieldname']):
            if not item.get('region'):
                logging.warning("No region in {}".format(meta))
                continue
            value = {
                'dsID': dsID,
                'region': item['region'],
                'period': meta.get('period') or get_period(meta['fieldname']),
                'value': item['value'],
                'indID': meta['indID'],
                'source': self.url,
                'is_number': is_number
            }
            # NOTE(review): a truthiness test also skips a numeric 0 --
            # confirm whether extracted values can be numbers, not strings.
            if value['value']:
                print(value)
                Value(**value).save()
    code = table.filter(equal_to('Indicator Code'))

    years = code.fill(xypath.RIGHT)
    junction = indname.junction(years)
    for ind_cell, year_cell, value_cell in junction:
        vdict = dict(value)
        vdict['indID'] = ind_cell.value
        vdict['period'] = year_cell.value
        vdict['value'] = value_cell.value

        indicator = {'indID': vdict['indID']}
        nameunits = re.search('(.*)\((.*)\)', vdict['indID'])
        if nameunits:
            (indicator['name'], indicator['units']) = nameunits.groups()
        else:
            indicator['name'] = vdict['indID']
            indicator['units'] = 'uno'
        Indicator(**indicator).save()
        v = Value(**vdict)
        if not v.is_blank():
            v.save()
    print len(session.query(Value).filter(Value.dsID == 'World Bank').all())
    session.commit()

# Scrape every country in turn; the first failure aborts the run.
for country in getcountrylist():
    try:
        getcountry(country)
    except Exception as e:
        # Report which country failed, then re-raise so the original
        # traceback still reaches the caller.
        print("%s %s" % (country, e))
        raise
Example #4
0
def parse_rank(socrata_id, countries):
    """Yield one HDI-rank value record per country row that has a rank.

    :param socrata_id: identifier formatted into the module-level
        ``data_url`` template to build each record's ``source``.
    :param countries: iterable of mappings; rows without an ``'hdi_rank'``
        key are skipped, the others must also provide ``'country'``.
    :returns: generator of dicts with the keys ``dsID``, ``region``,
        ``period``, ``value``, ``indID``, ``source`` and ``is_number``.
    :raises ValueError: if an ``'hdi_rank'`` entry cannot be converted
        with ``int()``.
    """
    source = data_url.format(socrata_id)
    for country in countries:
        if 'hdi_rank' not in country:
            continue
        yield {"dsID": dsID,
               "region": country['country'],
               "period": 2012,  # TODO
               "value": int(country['hdi_rank']),
               "indID": "PSE220",
               "source": source,
               "is_number": True}
		   
                           
# Script body: store the dataset record, then every indicator listed in
# ``lookup`` together with its values, and finally the HDI rank indicator.
DataSet(**dataset).save()
# NOTE(review): ``maxdate`` is not read anywhere in the visible code --
# confirm whether another part of the file uses it.
maxdate=None
for socrata_code in lookup:
    # Save the indicator before the values that reference its indID.
    ind = get_metadata(socrata_code)
    Indicator(**ind).save()
    for value in get_numbers(socrata_code):
        Value(**value).save()

print "rank"
# The HDI rank is stored as its own indicator, "PSE220".
ind = {"indID": "PSE220",
       "name": "HDI Rank",
       "units": "rank"}
Indicator(**ind).save()
# NOTE(review): ``get_rank`` is not defined in the visible code; presumably
# it fetches the rows for this Socrata id and feeds them to ``parse_rank``
# -- confirm.
for rank in get_rank("u2dx-y6wx"):
    Value(**rank).save()


Example #5
0
    root = lxml.html.fromstring(html)
    root.make_links_absolute(url)
    return root.xpath('//a/@href')


# Script body: for every country page, pull the configured fields out of
# the page's tables and save each one as a Value.
for country in country_urls():
    print country
    html = requests.get(country).content
    root = lxml.html.fromstring(html)
    # Keep only tables that are followed, later in the document, by a
    # <font> element whose text is "French" (presumably the English
    # section that precedes the French one -- confirm against the page).
    eng_tables = root.xpath('//table[following::font/text()="French"]')
    eng_text = ''.join(lxml.html.tostring(table) for table in eng_tables)
    # Maps each key of ``indicators`` to the text found for it on this page.
    data = {}
    for m_table in messytables.any.any_tableset(
            StringIO.StringIO(eng_text)).tables:
        table = xypath.Table.from_messy(m_table)
        for ind in indicators:
            # ``ind`` is the filter used to find the label cell;
            # ``indicators[ind]`` is the shift from that label to the cell
            # whose value is stored.
            target = table.filter(ind)
            if target:
                data[ind] = target.shift(indicators[ind]).value.strip()
    for item in data:
        # Start from the shared static fields and fill in the rest.
        value_data = dict(value_static)
        value_data['indID'] = 'unterm:' + item
        # Re-interpret the text as UTF-8 after a latin-1 round trip
        # (presumably to undo a mis-decoded page -- confirm).
        value_data['value'] = data[item].encode('latin1').decode('utf-8')
        # Raises KeyError when the ISO code field was not found on the page.
        value_data['region'] = data['ISO Country alpha-3-code']
        value_data['source'] = country
        # Today's date as YYYY-MM-DD.
        value_data['period'] = datetime.datetime.now().isoformat()[:10]
        # Empty values are not stored.
        if value_data['value']:
            Value(**value_data).save()
    # Checked only after this country's values were saved: every configured
    # field must have been found on the page.
    assert len(data) == len(indicators)
session.commit()
Example #6
0
def getindicator(ind="100106", overridefunction=None):
    """Scrape one UNDP HDRStats indicator table and store its values.

    Saves the 'HDRStats' ``DataSet``, the fixed 'HDR:HDI Rank'
    ``Indicator`` and the ``Indicator`` described by the page's last
    ``<h2>``, then saves one ``Value`` per country for the HDI rank column
    and one per (country, year) cell.  Cells whose stripped text is '..'
    are skipped.  The session is committed at the end.

    :param ind: HDRStats indicator id; substituted into the download URL
        and stored as ``"HDR:" + ind``.
    :param overridefunction: optional callable taking no arguments and
        returning ``(html, baseurl)``; when given, it replaces the HTTP
        download.
    :raises ValueError: if the page does not contain exactly one non-empty
        ``<br>`` tail, or that text does not contain exactly one
        ``Accessed:...from`` match.
    """
    if not overridefunction:
        baseurl = 'http://hdrstats.undp.org/en/indicators/display_cf_xls_indicator.cfm?indicator_id=%s&lang=en' % ind
        html = requests.get(baseurl).content
    else:
        html, baseurl = overridefunction()
    # Fields shared by every Value saved from this page.
    value = {
        'dsID': 'HDRStats',
        'indID': "HDR:" + ind,
        'source': baseurl,
        'is_number': True
    }

    dataset = {
        'dsID': 'HDRStats',
        'last_scraped': orm.now(),
        'name': 'Human Development Indicators, UNDP'
    }

    indicator = {'indID': "HDR:" + ind}
    hdi_indicator = {
        'indID': 'HDR:HDI Rank',
        'name': 'Human Development Index rank',
        'units': ''
    }
    Indicator(**hdi_indicator).save()
    DataSet(**dataset).save()

    htmlio = StringIO.StringIO(html)
    messy = messytables.html.HTMLTableSet(htmlio)
    table = xypath.Table.from_messy(list(messy.tables)[0])
    root = lxml.html.fromstring(html)

    # get odd indicator / update time
    indicator_text = root.xpath("//h2/text()")[-1]
    print(indicator_text)
    try:
        # Exactly one "name (units)" match is expected; zero or several
        # matches make the one-element unpacking raise ValueError.
        indicator_split, = re.findall(r"(.*)\(([^\(\)]+)\)", indicator_text)
    except ValueError:
        indicator_split = [indicator_text, ""]
    indicator['name'], indicator['units'] = indicator_split
    indicator['name'] = indicator['name'].strip()
    # Test for a missing tail directly: str() on the tail fails for
    # non-ASCII unicode text under Python 2 and would also discard a tail
    # whose text is literally "None".
    access_text, = [
        x.tail.strip() for x in root.xpath("//br")
        if x.tail is not None and x.tail.strip()
    ]
    access_date_raw, = re.findall('Accessed:(.*)from', access_text)
    # NOTE(review): the DataSet row was already saved above, so this
    # timestamp is only printed, never stored -- confirm whether it should
    # be persisted.
    dataset['last_updated'] = dateutil.parser.parse(
        access_date_raw).isoformat()
    print("%s %s * %s" % (
        dataset['last_updated'], indicator['name'], indicator['units']))
    Indicator(**indicator).save()

    country_cell = table.filter("Country").assert_one()
    # Non-empty header cells to the right of "Country" are the periods.
    years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '')
    countries = country_cell.fill(xypath.DOWN)
    hdi_rank = table.filter("HDI Rank").assert_one()

    # One HDI rank value per country row.
    for region_cell, _, rank_cell in countries.junction(hdi_rank):
        newvalue = dict(value)
        newvalue['indID'] = "HDR:HDI Rank"
        newvalue['region'] = get_region(region_cell)
        newvalue['value'] = rank_cell.value.strip()
        # TODO Hard coded for now because year it pertains to is not clear
        newvalue['period'] = 2012
        if newvalue['value'] != '..':
            Value(**newvalue).save()

    # One indicator value per (country, year) cell.
    for region_cell, year_cell, data_cell in countries.junction(years):
        newvalue = dict(value)
        newvalue['region'] = get_region(region_cell)
        newvalue['value'] = data_cell.value.strip()
        newvalue['period'] = year_cell.value.strip()
        if newvalue['value'] != '..':
            Value(**newvalue).save()
        print(newvalue)
    session.commit()