Ejemplo n.º 1
0
def do_jobs():
    print "jobs"
    indID = "reliefweb_jobs"
    indicator = {
        'indID': indID,
        'name': "Number of jobs on ReliefWeb at specified time",
        'units': 'uno'
    }
    orm.Indicator(**indicator).save()
    for country in countries:
        url = "http://api.rwlabs.org/v0/job/list"
        r = requests.get(url, data=get_job_query(country))
        if 'data' not in r.json():
            print r.json()
            print country
            continue
        value = {
            'region': country,
            'period': orm.now()[:10],  # we don't need sub-day precision.
            'value': r.json()['data']['total'],
            'dsID': dsID,
            'indID': indID,
            'source': url,
            'is_number': True
        }
        orm.Value(**value).save()
Ejemplo n.º 2
0
def save_dataset():
    """Record the UNODC dataset row in the local store."""
    orm.DataSet(
        dsID=DSID,
        last_updated=None,
        last_scraped=orm.now(),
        name="United Nations Office on Drugs and Crime",
    ).save()
Ejemplo n.º 3
0
def main():
    for sheet in spreadsheets:
        print sheet
        shortname = sheet.split('/')[-1].split('.')[0]
        dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0]
        year_text, = re.findall('\d{4}', dsID)
        dataset = {
            "dsID": dsID,
            "last_updated": year_text,
            "last_scraped": orm.now(),
            "name": "esa-unpd"
        }

        orm.DataSet(**dataset).save()
        indicator = {"indID": shortname, "name": shortname, "units": ''}
        # we replace the indicator name, so not saving now.
        # orm.Indicator(**indicator).save()
        value_template = {"dsID": dsID, "is_number": True, "source": sheet}

        raw = dl.grab(sheet)
        mtables = messytables.any.any_tableset(raw)
        names = [x.name for x in mtables.tables]
        if 'ESTIMATES' in names:
            mt = mtables['ESTIMATES']
        else:
            mt = mtables['PROPORTION-URBAN']
        table = xypath.Table.from_messy(mt)

        filestring = table.filter(
            re.compile("File[^:]*:.*")).assert_one().value
        indicator['name'], indicator['units'] = parse_file_string(filestring)
        print indicator['name']
        orm.Indicator(**indicator).save()

        region_header = table.filter(
            re.compile("Major area, region, country or area.*")).assert_one()
        ccode_header = table.filter(re.compile("Country.code")).assert_one()
        regions = region_header.fill(xypath.DOWN)
        years = ccode_header.fill(xypath.RIGHT)
        for region_cell, year_cell, value_cell in regions.junction(years):
            value = dict(value_template)
            value['indID'] = indicator['indID']
            value['region'] = region_cell.value
            year_value = year_cell.value
            if isinstance(year_value, basestring) and '-' in year_value:
                year1, _, year2 = year_value.partition('-')
                year_count = int(year2) - int(year1)
                assert year_count == 5
                year_value = "%s/P%dY" % (year1, year_count)
            value['period'] = year_value
            value['value'] = value_cell.value
            orm.Value(**value).save()
            #print value
    orm.session.commit()
Ejemplo n.º 4
0
def main():
  for sheet in spreadsheets:
    shortname = sheet.split('/')[-1].split('.')[0]
    dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0]
    year_text, = re.findall('\d{4}', dsID)
    dataset = {"dsID": dsID,
               "last_updated": year_text,
               "last_scraped": orm.now(),
               "name": "esa-unpd"}

    orm.DataSet(**dataset).save()
    indicator = {"indID": shortname,
                 "name": shortname,
                 "units": ''
                }
    # we replace the indicator name, so not saving now.
    # orm.Indicator(**indicator).save()
    value_template = {"dsID": dsID,
                      "is_number": True,
                      "source": sheet}

    raw = dl.grab(sheet)
    mtables = messytables.any.any_tableset(raw)
    names = [x.name for x in mtables.tables]
    if 'ESTIMATES' in names:
        mt = mtables['ESTIMATES']
    else:
        mt = mtables['PROPORTION-URBAN']
    table = xypath.Table.from_messy(mt)

    filestring = table.filter(re.compile("File[^:]*:.*")).assert_one().value
    indicator['name'], indicator['units'] = parse_file_string(filestring)
    print indicator['name']
    orm.Indicator(**indicator).save()

    region_header = table.filter(re.compile("Major area, region, country or area.*")).assert_one()
    ccode_header = table.filter(re.compile("Country.code")).assert_one()
    regions = region_header.fill(xypath.DOWN)
    years = ccode_header.fill(xypath.RIGHT)
    for region_cell, year_cell, value_cell in regions.junction(years):
        value = dict(value_template)
        value['indID'] = indicator['indID']
        value['region'] = region_cell.value
        year_value = year_cell.value
        if isinstance(year_value, basestring) and '-' in year_value:
            year1, _, year2 = year_value.partition('-')
            year_count = int(year2) - int(year1)
            assert year_count == 5
            year_value = "%s/P%dY" % (year1, year_count)
        value['period'] = year_value
        value['value'] = value_cell.value
        orm.Value(**value).save()
        #print value
  orm.session.commit()
Ejemplo n.º 5
0
def doit(targets, names, year):
    """Store one EM-DAT Value per (country, target) pair for `year`.

    targets -- cell labels to look up (e.g. 'total_dam');
    names   -- human-readable indicator names, parallel to `targets`;
    year    -- the year whose rows are selected and recorded.
    Relies on module-level `table`, `url`, `orm`, `xypath`, `itertools`.
    """
    # country_cells: we used to assert_one(), but sometimes there's two!
    country_cells = table.filter('iso').fill(xypath.DOWN)
    country_cells = country_cells - country_cells.filter('iso')  # remove other
    if not country_cells: print "no countries"
    # Keep only country cells whose right-hand neighbour equals `year`.
    country_year_filter = country_cells.filter(
        lambda b: b.shift(xypath.RIGHT).value == year)
    if not country_year_filter: print "no countries for ", year
    target_cells = table.filter(lambda b: b.value in targets)
    if not target_cells: print "didn't find ", targets

    # Template reused (and mutated in place) for every Value saved below.
    value = {
        'dsID': 'emdat',
        'period': "%s/P1Y" % (year),
        'source': url,
        'is_number': True
    }

    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_dam':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()
    # NOTE(review): itertools.groupby only groups *adjacent* items — this
    # assumes country_year_filter iterates countries contiguously; confirm.
    for cname, one_country_cells in itertools.groupby(country_year_filter,
                                                      lambda b: b.value):
        value['region'] = cname
        one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname)
        for target_cell in target_cells:
            j = one_country_bag.junction(target_cell)
            value['indID'] = 'emdat:%s' % target_cell.value
            # Sum every cell at the junction of this country and target.
            value['value'] = sum(int(x[2].value) for x in j)
            orm.Value(**value).save()
            print value
    orm.session.commit()
Ejemplo n.º 6
0
def do_jobs():
    print "jobs"
    indID = "reliefweb_jobs"
    indicator = {'indID': indID,
                 'name': "Number of jobs on ReliefWeb at specified time",
                 'units': 'uno'}
    orm.Indicator(**indicator).save()
    for country in countries:
        url = "http://api.rwlabs.org/v0/job/list"
        r = requests.get(url, data=get_job_query(country))
        if 'data' not in r.json():
            print r.json()
            print country
            continue
        value = {'region': country,
                 'period': orm.now()[:10],  # we don't need sub-day precision.
                 'value': r.json()['data']['total'],
                 'dsID': dsID,
                 'indID': indID,
                 'source': url,
                 'is_number': True}
        orm.Value(**value).save()
Ejemplo n.º 7
0
def doit():
    # country_cells: we used to assert_one(), but sometimes there's two!

    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_damage':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()

    for country in country_list():  # TODO country_list
        print country
        raw = dl.grab(url.format(country))
        m_tables = messytables.any.any_tableset(raw)
        mt, = m_tables.tables
        table = xypath.Table.from_messy(mt)
        yr = table.filter('year').assert_one()
        years = yr.fill(xypath.DOWN)
        cats = yr.fill(xypath.RIGHT)
        for year, cat, value in years.junction(cats):
            value = {
                'dsID': 'emdat',
                'region': country,
                'indID': 'emdat:{}'.format(cat.value),
                'period': '{}/P1Y'.format(year.value),
                'value': value.value,
                'source': url,
                'is_number': True
            }
            orm.Value(**value).save()
    orm.session.commit()
Ejemplo n.º 8
0
def doit(targets, names, year):
    """Store one EM-DAT Value per (country, target) pair for `year`.

    targets -- cell labels to look up (e.g. 'total_dam');
    names   -- human-readable indicator names, parallel to `targets`.
    Relies on module-level `table`, `url`, `orm`, `xypath`, `itertools`.
    """
    # country_cells: we used to assert_one(), but sometimes there's two!
    country_cells = table.filter('iso').fill(xypath.DOWN)
    country_cells = country_cells - country_cells.filter('iso')  # remove other
    if not country_cells: print "no countries"
    # Keep only country cells whose right-hand neighbour equals `year`.
    country_year_filter = country_cells.filter(lambda b: b.shift(xypath.RIGHT).value == year)
    if not country_year_filter: print "no countries for ", year
    target_cells = table.filter(lambda b: b.value in targets)
    if not target_cells: print "didn't find ", targets

    # Template reused (and mutated in place) for every Value saved below.
    value = {'dsID': 'emdat',
             'period': "%s/P1Y" % (year),
             'source': url,
             'is_number': True}

    dataset = {'dsID': 'emdat',
               'last_updated': None,
               'last_scraped': orm.now(),
               'name': 'EM-DAT'}
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t,
                     'name': names[i],
                     'units': 'uno'}
        if t == 'total_dam':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()
    # NOTE(review): itertools.groupby only groups *adjacent* items — this
    # assumes country_year_filter iterates countries contiguously; confirm.
    for cname, one_country_cells in itertools.groupby(country_year_filter, lambda b: b.value):
        value['region'] = cname
        one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname)
        for target_cell in target_cells:
            j = one_country_bag.junction(target_cell)
            value['indID'] = 'emdat:%s' % target_cell.value
            # Sum every cell at the junction of this country and target.
            value['value'] = sum(int(x[2].value) for x in j)
            orm.Value(**value).save()
            print value
    orm.session.commit()
Ejemplo n.º 9
0
def doit():
    # country_cells: we used to assert_one(), but sometimes there's two!

    dataset = {'dsID': 'emdat',
               'last_updated': None,
               'last_scraped': orm.now(),
               'name': 'EM-DAT'}
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t,
                     'name': names[i],
                     'units': 'uno'}
        if t == 'total_damage':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()
    
    for country in country_list():  # TODO country_list
        print country
        raw = dl.grab(url.format(country))
        m_tables = messytables.any.any_tableset(raw)
        mt, = m_tables.tables
        table = xypath.Table.from_messy(mt)
        yr = table.filter('year').assert_one()
        years = yr.fill(xypath.DOWN)
        cats = yr.fill(xypath.RIGHT)
        for year, cat, value in years.junction(cats):
            value = {'dsID': 'emdat',
                     'region': country,
                     'indID': 'emdat:{}'.format(cat.value),
                     'period': '{}/P1Y'.format(year.value),
                     'value': value.value,
                     'source': url,
                     'is_number': True}
            orm.Value(**value).save()
    orm.session.commit()
Ejemplo n.º 10
0
    "665",  # improved water
    "668",  # improved sanitation
    "553",  # maternal mortality
    "561",  # under 5 mortality
    "589",  # primary education ratio
    "559",  # severely underweight
    "755",  # }
    "756",  # } telecoms x3
    "605",  # }
    "640",  # energy consumption
]

# Dataset metadata for the MDG scrape; scraped-at timestamp taken now.
dataset = dict(
    dsID="mdgs",
    last_updated=None,
    last_scraped=orm.now(),
    name="Millennium Development Goals",
)

# Fields shared by every Value row this scraper writes.
value_template = dict(dsID="mdgs", is_number=True)


def do_indicator(ind="566"):
    """Fetch the MDG CSV export for indicator `ind` and locate its
    header cells.

    NOTE(review): this function appears truncated in this excerpt —
    `years` is computed but never used in the visible lines.
    """
    baseurl = "http://mdgs.un.org/unsd/mdg/Handlers/ExportHandler.ashx?Type=Csv&Series=%s"
    url = baseurl % ind
    # Mutates the module-level value_template so saved rows carry the URL.
    value_template['source'] = url
    handle = dl.grab(url)
    # Exactly one table is expected in the CSV export.
    mt, = messytables.any.any_tableset(handle).tables
    table = xypath.Table.from_messy(mt)
    country_anchor = table.filter("Country").assert_one()
    # Year headers sit to the right of the "Country" anchor cell.
    years = country_anchor.fill(xypath.RIGHT).filter(re.compile("\d\d\d\d"))
Ejemplo n.º 11
0
Economy
Transport
Education
Demographics
Religion
""".strip().lower().split('\n')

import orm
"""Value: dsID, region, indID, period, value, source, is_number
   DataSet: dsID, last_updated, last_scraped, name
      Indicator: indID, name, units
         """

# Register the Wikipedia pseudo-dataset.
dataset = {
    'dsID': 'wikipedia',
    'last_updated': None,  # TODO
    'last_scraped': orm.now(),
    'name': 'Wikipedia',
}
orm.DataSet(**dataset).save()

# One URL-valued indicator per section header.
for h in headers:
    orm.Indicator(
        indID='wikipedia:' + h,
        name='Wikipedia: ' + h,
        units='url',
    ).save()

# Fields shared by every Value row written later.
value_template = {'dsID': 'wikipedia', 'period': None, 'is_number': False}

Ejemplo n.º 12
0
import re
import datetime
import requests
from orm import session, Value, DataSet, Indicator
import orm
"""Value: dsID, region, indID, period, value, source, is_number
   DataSet: dsID, last_updated, last_scraped, name
   Indicator: indID, name, units
   """

dsID = "data.undp.org"

# Dataset metadata for the UNDP Open Data scrape.
dataset = dict(
    dsID=dsID,
    last_updated=None,  # TODO max(pubdate)
    last_scraped=orm.now(),
    name="UNDP Open Data",
)

# Socrata endpoints: per-view metadata and row data, keyed by view id.
metadata_url = "https://data.undp.org/api/views/{}/rows.json?accessType=DOWNLOAD"
data_url = "http://data.undp.org/resource/{}.json"
lookup = {"u2dx-y6wx": "PSE110",  # GNI per capita in PPP terms (constant 2005 international $)
          "bkr7-unqh": "PVE010",  # Public expenditure on education (% of GDP) (%)
          "m67k-vi5c": "PVE110",  # Mean years of schooling (of adults)|years
          "jbhn-xkjv": "PVE120",  # Combined gross enrolment in education (both sexes)
          "ehe9-pgud": "PSE160",  # MPI: Population living below $1.25 PPP per day (%)
          "a4ay-qce2": "PVH120",  # Under-five mortality
          "bh77-rzbn": "HDR:68606",  # GII: Gender Inequality Index, value
          "qnam-f624": "PVE030",  # Expected Year of Schooling (of children)
          "4gkx-mq89": "PVH180",  # Maternal mortality ratio
          "x22y-8m6h": "PVE040",  # Adult literacy rate, both sexes (% aged 15 and above)

         # "---------": "------",  # Impact of natural disasters: number of deaths
Ejemplo n.º 13
0
def getindicator(ind="100106", overridefunction=None):
    if not overridefunction:
        baseurl = 'http://hdrstats.undp.org/en/indicators/display_cf_xls_indicator.cfm?indicator_id=%s&lang=en' % ind
        html = requests.get(baseurl).content
    else:
        html, baseurl = overridefunction()
    value = {
        'dsID': 'HDRStats',
        'indID': "HDR:" + ind,
        'source': baseurl,
        'is_number': True
    }

    dataset = {
        'dsID': 'HDRStats',
        'last_scraped': orm.now(),
        'name': 'Human Development Indicators, UNDP'
    }

    indicator = {'indID': "HDR:" + ind}
    hdi_indicator = {
        'indID': 'HDR:HDI Rank',
        'name': 'Human Development Index rank',
        'units': ''
    }
    Indicator(**hdi_indicator).save()
    DataSet(**dataset).save()
    print html
    exit(3)
    htmlio = StringIO.StringIO(html)
    messy = messytables.html.HTMLTableSet(htmlio)
    table = xypath.Table.from_messy(list(messy.tables)[0])
    root = lxml.html.fromstring(html)

    "get odd indicator / update time"
    indicator_text = root.xpath("//h2/text()")[-1]
    print indicator_text
    try:
        indicator_split, = re.findall("(.*)\(([^\(\)]+)\)", indicator_text)
    except ValueError:
        indicator_split = [indicator_text, ""]
    indicator['name'], indicator['units'] = indicator_split
    indicator['name'] = indicator['name'].strip()
    access_text, = [
        x.tail.strip() for x in root.xpath("//br")
        if str(x.tail) != "None" and x.tail.strip()
    ]
    access_date_raw, = re.findall('Accessed:(.*)from', access_text)
    dataset['last_updated'] = dateutil.parser.parse(
        access_date_raw).isoformat()
    print dataset['last_updated'], indicator['name'], "*", indicator['units']
    Indicator(**indicator).save()

    country_cell = table.filter("Country").assert_one()
    years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '')
    countries = country_cell.fill(xypath.DOWN)
    hdi_rank = table.filter("HDI Rank").assert_one()
    max_year = max(year.value for year in years)

    for i in countries.junction(hdi_rank):
        newvalue = dict(value)
        newvalue['indID'] = "HDR:HDI Rank"
        newvalue['region'] = get_region(i[0])
        newvalue['value'] = i[2].value.strip()
        newvalue[
            'period'] = 2012  # TODO Hard coded for now because year it pertains to is not clear
        if newvalue['value'].strip() != '..':
            Value(**newvalue).save()

    for i in countries.junction(years):
        newvalue = dict(value)
        newvalue['region'] = get_region(i[0])
        newvalue['value'] = i[2].value.strip()
        newvalue['period'] = i[1].value.strip()
        if newvalue['value'].strip() != '..':
            Value(**newvalue).save()
        print newvalue
    session.commit()
Ejemplo n.º 14
0
def yeartotimestamp(year):
    """Return the Unix timestamp (local time) of midnight, 1 January `year`.

    Uses time.mktime instead of strftime('%s'): '%s' is a non-standard
    platform extension (absent e.g. on Windows), while mktime is
    portable and has the same local-time semantics.
    """
    import time
    d = datetime.datetime(year=year, month=1, day=1)
    return int(time.mktime(d.timetuple()))


def getcountrylist():
    """Yield the region of every stored CG060 (country-list) value."""
    query = orm.session.query(orm.Value).filter(orm.Value.indID == "CG060")
    for row in query.all():
        yield row.region


dsID = "reliefweb-api"

# Register the ReliefWeb API dataset; both timestamps taken now.
dataset = dict(
    dsID=dsID,
    last_updated=orm.now(),
    last_scraped=orm.now(),
    name="ReliefWeb API",
)

orm.DataSet(**dataset).save()

ocha_products = """Situation Report
Humanitarian Bulletin
Humanitarian Dashboard
Humanitarian Snapshot
Key Messages
Press Release
Press Review
Statement/Speech
Other
Ejemplo n.º 15
0
    baseindexurl = "http://www.accuweather.com/ajax-service/getcountrylist?region=%s&languageID=1"
    baseleafurl = "http://www.accuweather.com/en/%s/%s-weather"
    regions = "afr ant arc asi cac eur mea nam ocn sam".split(" ")

    for reg in regions:
        j = requests.get(baseindexurl % reg).json()
        for country in j['Countries']:
            yield {'region': country['Code'],
                   'value': baseleafurl % (country['Code'],
                                          country['OfficialName'])}

print list(accuweather())

orm.DataSet(dsID="accuweather",
            last_updated=None,
            last_scraped=orm.now(),
            name="Accuweather").save()

orm.Indicator(indID="accuweather_url",
              name="AccuWeather URL",
              units="").save()

valuetemplate = {'dsID': 'accuweather',
                 'indID': 'accuweather_url',
                 'period': None,
                 'source': 'http://www.accuweather.com'}

for datarow in accuweather():
    olap_row = dict(valuetemplate)
    olap_row.update(datarow)
    orm.Value(**olap_row).save()
Ejemplo n.º 16
0
    for reg in regions:
        j = requests.get(baseindexurl % reg).json()
        for country in j['Countries']:
            yield {
                'region': country['Code'],
                'value':
                baseleafurl % (country['Code'], country['OfficialName'])
            }


print list(accuweather())

orm.DataSet(dsID="accuweather",
            last_updated=None,
            last_scraped=orm.now(),
            name="Accuweather").save()

orm.Indicator(indID="accuweather_url", name="AccuWeather URL", units="").save()

valuetemplate = {
    'dsID': 'accuweather',
    'indID': 'accuweather_url',
    'period': None,
    'source': 'http://www.accuweather.com'
}

for datarow in accuweather():
    olap_row = dict(valuetemplate)
    olap_row.update(datarow)
    orm.Value(**olap_row).save()
Ejemplo n.º 17
0
"""


def yeartotimestamp(year):
    """Return the Unix timestamp (local time) of midnight, 1 January `year`.

    Uses time.mktime instead of strftime('%s'): '%s' is a non-standard
    platform extension (absent e.g. on Windows), while mktime is
    portable and has the same local-time semantics.
    """
    import time
    d = datetime.datetime(year=year, month=1, day=1)
    return int(time.mktime(d.timetuple()))


def getcountrylist():
    """Yield the region of every stored CG060 (country-list) value."""
    cg060 = orm.Value.indID == "CG060"
    for row in orm.session.query(orm.Value).filter(cg060).all():
        yield row.region

dsID = "reliefweb-api"

# Register the ReliefWeb API dataset; both timestamps taken now.
orm.DataSet(
    dsID=dsID,
    last_updated=orm.now(),
    last_scraped=orm.now(),
    name="ReliefWeb API",
).save()

ocha_products = """Situation Report
Humanitarian Bulletin
Humanitarian Dashboard
Humanitarian Snapshot
Key Messages
Press Release
Press Review
Statement/Speech
Other
Thematic Map
Ejemplo n.º 18
0
def getindicator(ind="100106", overridefunction=None):
    if not overridefunction:
        baseurl = 'http://hdrstats.undp.org/en/indicators/display_cf_xls_indicator.cfm?indicator_id=%s&lang=en' % ind
        html = requests.get(baseurl).content
    else:
        html, baseurl = overridefunction()
    value = {'dsID': 'HDRStats',
             'indID': "HDR:"+ind,
             'source': baseurl,
             'is_number': True}

    dataset = {'dsID': 'HDRStats',
               'last_scraped': orm.now(),
               'name': 'Human Development Indicators, UNDP'}

    indicator = {'indID': "HDR:"+ind}
    hdi_indicator = {'indID': 'HDR:HDI Rank',
                     'name': 'Human Development Index rank',
                     'units': ''}
    Indicator(**hdi_indicator).save()
    DataSet(**dataset).save()
    print html
    exit(3)
    htmlio = StringIO.StringIO(html)
    messy = messytables.html.HTMLTableSet(htmlio)
    table = xypath.Table.from_messy(list(messy.tables)[0])
    root = lxml.html.fromstring(html)

    "get odd indicator / update time"
    indicator_text = root.xpath("//h2/text()")[-1]
    print indicator_text
    try:
        indicator_split, = re.findall("(.*)\(([^\(\)]+)\)", indicator_text)
    except ValueError:
        indicator_split = [indicator_text, ""]
    indicator['name'], indicator['units'] = indicator_split
    indicator['name'] = indicator['name'].strip()
    access_text, = [x.tail.strip() for x in root.xpath("//br") if str(x.tail) != "None" and x.tail.strip()]
    access_date_raw, = re.findall('Accessed:(.*)from', access_text)
    dataset['last_updated'] = dateutil.parser.parse(access_date_raw).isoformat()
    print dataset['last_updated'], indicator['name'], "*", indicator['units']
    Indicator(**indicator).save()

    country_cell = table.filter("Country").assert_one()
    years = country_cell.fill(xypath.RIGHT).filter(lambda b: b.value != '')
    countries = country_cell.fill(xypath.DOWN)
    hdi_rank = table.filter("HDI Rank").assert_one()
    max_year = max(year.value for year in years)

    for i in countries.junction(hdi_rank):
        newvalue = dict(value)    
        newvalue['indID'] = "HDR:HDI Rank"
        newvalue['region'] = get_region(i[0])
        newvalue['value'] = i[2].value.strip()
        newvalue['period'] = 2012 # TODO Hard coded for now because year it pertains to is not clear 
        if newvalue['value'].strip() != '..':
            Value(**newvalue).save()
  
    for i in countries.junction(years):
        newvalue = dict(value)
        newvalue['region'] = get_region(i[0])
        newvalue['value'] = i[2].value.strip()
        newvalue['period'] =i[1].value.strip()
        if newvalue['value'].strip() != '..':
            Value(**newvalue).save()
        print newvalue
    session.commit()