Code example #1
def do_jobs():
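    # For each country, query the ReliefWeb v0 job/list API and save the current total number of job postings as a Value.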
    print "jobs"
    indID = "reliefweb_jobs"
    indicator = {
        'indID': indID,
        'name': "Number of jobs on ReliefWeb at specified time",
        'units': 'uno'
    }
    orm.Indicator(**indicator).save()
    for country in countries:
        url = "http://api.rwlabs.org/v0/job/list"
        r = requests.get(url, data=get_job_query(country))
        if 'data' not in r.json():
            print r.json()
            print country
            continue
        value = {
            'region': country,
            'period': orm.now()[:10],  # we don't need sub-day precision.
            'value': r.json()['data']['total'],
            'dsID': dsID,
            'indID': indID,
            'source': url,
            'is_number': True
        }
        orm.Value(**value).save()
Code example #2
def do_zip(url):
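    # Pull a zipped spreadsheet, find the "Grand Total + (Total)" rows for "Food supply (kcal/capita/day)", and save one value per country and year (columns named Y<year>).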
    print url
    fh = dl.grab(url)
    mt, = list(messytables.zip.ZIPTableSet(fh).tables)
    fh = None
    xy = xypath.Table.from_messy(mt)
    mt = None
    print "...got"
    headers = xy.filter(lambda c: c.y == 0)
    country = headers.filter("Country").assert_one()
    items = headers.filter("Item").assert_one().fill(xypath.DOWN).filter("Grand Total + (Total)")
    elements = headers.filter("Element").assert_one().fill(xypath.DOWN).filter("Food supply (kcal/capita/day)")
    filtered_items = items.select_other(lambda a, b: a.y == b.y, elements)

    years = country.fill(xypath.RIGHT).filter(re.compile("Y\d\d\d\d$"))

    for i in filtered_items:
        values = dict(valuetemplate)
        values['source'] = url
        countrycodecell, = i.junction(country)
        values['region'] = countrycodecell[2].value
        year_junction = i.junction(years)
        for _, period, value in year_junction:
            values['period'] = period.value.replace("Y", "")
            values['value'] = value.value
            orm.Value(**values).save()
    orm.session.commit()
Code example #3
def getstats(url, country="PLACEHOLDER"):
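    # Look through every table in the file for an "EPI" label in the first column, take its value at the junction with the "to the top" cell, and save it for the given country.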
    handle = dl.grab(url)
    mts = messytables.any.any_tableset(handle)
    saves = 0
    for mt in mts.tables:
        table = xypath.Table.from_messy(mt)
        inds = table.filter(lambda b: b.x == 0 and "EPI" in b.value)
        if not inds:
            continue
        assert len(inds) == 1
        top, = table.filter(lambda b: 'to the top' in b.value)
        value, = inds.junction(top)
        for ind in inds:
            split = split_ind(ind.value)
            values_tosave = dict(value_template)
            values_tosave['source'] = url
            values_tosave['region'] = country
            values_tosave['value'] = value[2].value
            indicator = {
                'indID': split['indID'],
                'name': split['indID'],
                'units': split['units']
            }
            orm.Indicator(**indicator).save()
            values_tosave['indID'] = split['indID']
            orm.Value(**values_tosave).save()
            saves = saves + 1
    if saves != 1:
        print "huh, %d saves for %r" % (saves, url)
Code example #4
def do_indicator(ind="566"):
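    # Download one MDG series as CSV, split the series string into name and units on its last comma, register the indicator, and save a value for every country/year junction.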
    baseurl = "http://mdgs.un.org/unsd/mdg/Handlers/ExportHandler.ashx?Type=Csv&Series=%s"
    url = baseurl % ind
    value_template['source'] = url
    handle = dl.grab(url)
    mt, = messytables.any.any_tableset(handle).tables
    table = xypath.Table.from_messy(mt)
    country_anchor = table.filter("Country").assert_one()
    years = country_anchor.fill(xypath.RIGHT).filter(re.compile("\d\d\d\d"))
    countries = country_anchor.fill(xypath.DOWN)
    indicator = table.filter("Series").shift(xypath.DOWN).value
    SEPARATOR = ', '
    if SEPARATOR in indicator:
        i_name = SEPARATOR.join(indicator.split(SEPARATOR)[:-1])
        i_unit = indicator.split(SEPARATOR)[-1]
    else:
        i_name = indicator
        i_unit = ''
    value_template['indID'] = indicator
    assert i_name
    indicator = {'indID': indicator, 'name': i_name, 'units': i_unit}
    orm.Indicator(**indicator).save()
    # countries also gets some rubbish, but junction will ignore it.
    for c_cell, y_cell, v_cell in countries.junction(years):
        value = dict(value_template)
        value['region'] = c_cell.value
        value['period'] = y_cell.value
        value['value'] = v_cell.value
        orm.Value(**value).save()
Code example #5
def main():
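    # For each ESA-UNPD spreadsheet: register the dataset, pick the ESTIMATES (or PROPORTION-URBAN) sheet, read the indicator name and units from the "File..." cell, and save one value per region/year junction; year ranges such as 1990-1995 are stored as 1990/P5Y.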
    for sheet in spreadsheets:
        print sheet
        shortname = sheet.split('/')[-1].split('.')[0]
        dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0]
        year_text, = re.findall('\d{4}', dsID)
        dataset = {
            "dsID": dsID,
            "last_updated": year_text,
            "last_scraped": orm.now(),
            "name": "esa-unpd"
        }

        orm.DataSet(**dataset).save()
        indicator = {"indID": shortname, "name": shortname, "units": ''}
        # we replace the indicator name, so not saving now.
        # orm.Indicator(**indicator).save()
        value_template = {"dsID": dsID, "is_number": True, "source": sheet}

        raw = dl.grab(sheet)
        mtables = messytables.any.any_tableset(raw)
        names = [x.name for x in mtables.tables]
        if 'ESTIMATES' in names:
            mt = mtables['ESTIMATES']
        else:
            mt = mtables['PROPORTION-URBAN']
        table = xypath.Table.from_messy(mt)

        filestring = table.filter(
            re.compile("File[^:]*:.*")).assert_one().value
        indicator['name'], indicator['units'] = parse_file_string(filestring)
        print indicator['name']
        orm.Indicator(**indicator).save()

        region_header = table.filter(
            re.compile("Major area, region, country or area.*")).assert_one()
        ccode_header = table.filter(re.compile("Country.code")).assert_one()
        regions = region_header.fill(xypath.DOWN)
        years = ccode_header.fill(xypath.RIGHT)
        for region_cell, year_cell, value_cell in regions.junction(years):
            value = dict(value_template)
            value['indID'] = indicator['indID']
            value['region'] = region_cell.value
            year_value = year_cell.value
            if isinstance(year_value, basestring) and '-' in year_value:
                year1, _, year2 = year_value.partition('-')
                year_count = int(year2) - int(year1)
                assert year_count == 5
                year_value = "%s/P%dY" % (year1, year_count)
            value['period'] = year_value
            value['value'] = value_cell.value
            orm.Value(**value).save()
            #print value
    orm.session.commit()
Code example #6
def do_indicator(ind):
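    # Download one GHO indicator as CSV from the Athena API; each row's indID is qualified with any SEX / RESIDENCEAREATYPE / EDUCATIONLEVEL / WEALTHQUINTILE breakdown before the indicator and value are saved.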
    print "indicator:", ind
    fh = dl.grab(baseurl % ind)
    mt, = messytables.commas.CSVTableSet(fh).tables
    mt_list = list(mt)
    try:
        headers = mt_list[0]
    except IndexError:
        headers = []
    if len(headers) == 0:
        print "Error getting headers from ", ind
        raise RuntimeError("No header in {}".format(ind))
    logging.warn("headers {!r}".format(headers))
    rest = mt_list[1:]
    for row in rest:
        if len(row) == 0:
            continue  # skip empty row

        rowdict = {x[0].value: x[1].value for x in zip(headers, row)}
        try:
            name, unit = units(rowdict['GHO (DISPLAY)'])
        except Exception:
            fh.seek(0)
            print fh.read()
            raise
        indID = rowdict['GHO (CODE)']
        for lookup in [
                "SEX", "RESIDENCEAREATYPE", "EDUCATIONLEVEL", "WEALTHQUINTILE"
        ]:
            lookup_code = lookup + " (CODE)"
            lookup_name = lookup + " (DISPLAY)"
            if lookup_code in rowdict:  # header = "SEX (CODE)"
                if rowdict[lookup_code]:  # value != ""
                    indID = indID + "({}={})".format(lookup,
                                                     rowdict[lookup_code])
                    name = name + " - " + rowdict[lookup_name]

        value_dict = {
            "value": rowdict['Display Value'],
            "period": rowdict['YEAR (DISPLAY)'],
            "indID": indID,
            "region": rowdict["COUNTRY (CODE)"],
            "dsID": "athena-api",
            "source": baseurl % ind,
            "is_number": True
        }

        indicator_dict = {'indID': indID, 'name': name, 'units': unit}

        orm.Indicator(**indicator_dict).save()
        orm.Value(**value_dict).save()
Code example #7
def do_file(url="http://bit.ly/14FRxGV"):
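    # Download an Excel workbook, anchor on the "(home)" cell, and save one value for every country/year junction.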
    fh = dl.grab(url)
    mts = messytables.excel.XLSTableSet(fh)
    xy = discover_table(mts)
    print "...got"
    home = xy.filter("(home)")
    years = home.fill(xypath.RIGHT)
    countries = home.fill(xypath.DOWN)
    for country, year, value in countries.junction(years):
        values = dict(valuetemplate)
        values['source'] = url
        values['region'] = country.value
        values['period'] = niceyear(year.value)  # TODO
        values['value'] = value.value
        orm.Value(**values).save()
    orm.session.commit()
Code example #8
def getstats():
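    # Scrape the UNDP HDR combined gross enrolment table: anchor on the "Country" header cell and save one value per year/country junction.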
    url = 'http://hdr.undp.org/en/content/combined-gross-enrolment-education-both-sexes'
    handle = dl.grab(url)
    mts = messytables.any.any_tableset(handle)
    saves = 0
    mt = mts.tables[0]
    table = xypath.Table.from_messy(mt)

    pivot, = table.filter(lambda c: 'Country' in c.value)
    years = pivot.fill(xypath.RIGHT)
    countries = pivot.fill(xypath.DOWN)
    for year, country, value in years.junction(countries):
        output = dict(output_template)
        output['source'] = url
        output['region'] = country.value.strip()
        output['value'] = value.value.strip()
        orm.Value(**output).save()
        saves = saves + 1
    assert saves
Code example #9
File: emdat.py  Project: luiscape/dap-scrapers
def doit(targets, names, year):
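    # Sum EM-DAT totals for the given year: group country rows by ISO code, junction each group with the target columns, and save one summed value per country and target (relies on module-level table and url).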
    # country_cells: we used to assert_one(), but sometimes there's two!
    country_cells = table.filter('iso').fill(xypath.DOWN)
    country_cells = country_cells - country_cells.filter('iso')  # remove other
    if not country_cells: print "no countries"
    country_year_filter = country_cells.filter(
        lambda b: b.shift(xypath.RIGHT).value == year)
    if not country_year_filter: print "no countries for ", year
    target_cells = table.filter(lambda b: b.value in targets)
    if not target_cells: print "didn't find ", targets

    value = {
        'dsID': 'emdat',
        'period': "%s/P1Y" % (year),
        'source': url,
        'is_number': True
    }

    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_dam':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()
    for cname, one_country_cells in itertools.groupby(country_year_filter,
                                                      lambda b: b.value):
        value['region'] = cname
        one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname)
        for target_cell in target_cells:
            j = one_country_bag.junction(target_cell)
            value['indID'] = 'emdat:%s' % target_cell.value
            value['value'] = sum(int(x[2].value) for x in j)
            orm.Value(**value).save()
            print value
    orm.session.commit()
Code example #10
def doit():
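    # Per-country EM-DAT scrape: download one table per country, anchor on the "year" header, and save a value for every year/category junction.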
    # country_cells: we used to assert_one(), but sometimes there's two!

    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_damage':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()

    for country in country_list():  # TODO country_list
        print country
        raw = dl.grab(url.format(country))
        m_tables = messytables.any.any_tableset(raw)
        mt, = m_tables.tables
        table = xypath.Table.from_messy(mt)
        yr = table.filter('year').assert_one()
        years = yr.fill(xypath.DOWN)
        cats = yr.fill(xypath.RIGHT)
        for year, cat, value in years.junction(cats):
            value = {
                'dsID': 'emdat',
                'region': country,
                'indID': 'emdat:{}'.format(cat.value),
                'period': '{}/P1Y'.format(year.value),
                'value': value.value,
                'source': url,
                'is_number': True
            }
            orm.Value(**value).save()
    orm.session.commit()
Code example #11
def do_products():
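    # For each OCHA product, country and year from 1990 to 2013, query the ReliefWeb v0 report/list API and save the reported total as a value.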
    for product in ocha_products:
        print product
        niceproduct = product.replace(" ", "_")
        indID = "reliefweb_" + niceproduct
        indicator = {
            'indID': indID,
            'name':
            "Number of ReliefWeb reports flagged with ocha_product: %s" %
            product,
            'units': 'uno'
        }
        orm.Indicator(**indicator).save()

        for country in countries:
            for year in range(1990, 2014):
                params = dict()
                params['PRODUCT'] = product
                params['COUNTRY'] = country
                params['FROM'] = 1000 * yeartotimestamp(year)
                params['TO'] = 1000 * yeartotimestamp(year + 1) - 1
                url = "http://api.rwlabs.org/v0/report/list"
                r = requests.get(url, data=get_product_query(**params))
                if 'data' not in r.json():
                    print r.json()
                    print country
                    continue
                value = {
                    'region': country,
                    'period': str(year),
                    'value': r.json()['data']['total'],
                    'dsID': dsID,
                    'indID': indID,
                    'source': url,
                    'is_number': True
                }
                orm.Value(**value).save()
Code example #12
File: athena.py  Project: luiscape/dap-scrapers
def do_indicator(ind):
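    # Simpler variant of the Athena CSV scraper: the indID is qualified only by the SEX breakdown, and an empty header row aborts the run.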
    fh = dl.grab(baseurl % ind)
    mt, = messytables.commas.CSVTableSet(fh).tables
    mt_list = list(mt)
    headers = mt_list[0]
    if len(headers) == 0:
        print ind
        exit()
    rest = mt_list[1:]
    for row in rest:
        if len(row) == 0:
            continue  # skip empty row

        rowdict = {x[0].value: x[1].value for x in zip(headers, row)}
        name, unit = units(rowdict['GHO (DISPLAY)'])
        if 'SEX (CODE)' in rowdict:
            indID = rowdict['GHO (CODE)'] + "-" + rowdict['SEX (CODE)']
            name = name + " - " + rowdict['SEX (DISPLAY)']
        else:
            indID = rowdict['GHO (CODE)']

        value_dict = {
            "value": rowdict['Display Value'],
            "period": rowdict['YEAR (DISPLAY)'],
            "indID": indID,
            "region": rowdict["COUNTRY (CODE)"],
            "dsID": "athena-api",
            "source": baseurl % ind,
            "is_number": True
        }

        indicator_dict = {'indID': indID, 'name': name, 'units': unit}

        orm.Indicator(**indicator_dict).save()
        orm.Value(**value_dict).save()
        print value_dict
Code example #13
def main():
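    # Download SHEET_URL, check the table contains IND_NAME, then save one value per region/year junction under DSID/INDID.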
    save_dataset()
    save_indicator()
    raw = dl.grab(SHEET_URL)
    mtables = messytables.any.any_tableset(raw)
    table = xypath.Table.from_messy(mtables.tables[0])
    table.filter(IND_NAME).assert_one()  # we have the right table
    region_header = table.filter(REGION_HEADER_VALUE).assert_one()
    regions = region_header.fill(xypath.DOWN)
    years = region_header.fill(xypath.RIGHT,
                               stop_before=lambda c: c.value == '')
    assert len(years) < 15  # left side.
    for region_cell, year_cell, value_cell in regions.junction(years):
        value = {
            "dsID": DSID,
            'region': region_cell.value,
            'indID': INDID,
            'source': SHEET_URL,
            'is_number': True,
            'period': year_cell.value,
            'value': value_cell.value
        }
        orm.Value(**value).save()
    orm.session.commit()
Code example #14
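    # (Excerpt begins mid-function.) Match wanted headers against each country's Wikipedia section headings and save the matching section URL as the value.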
    for cand_header in country[level]:
        if header in cand_header.lower():
            matches.append(cand_header)
    if len(matches) > 1:
        print matches, "\n\n\n\n"
    if matches:
        return matches[0]


def best_match(header, country):
    return exact_match(2, header, country) or \
        exact_match(3, header, country) or \
        partial_match(2, header, country) or \
        partial_match(3, header, country)


wikibase = "http://en.wikipedia.org/wiki/%s#%s"
headinglist = headings()
for country in headinglist:
    matches = [best_match(header, headinglist[country]) for header in headers]
    d_matches = dict(zip(headers, matches))
    for head in d_matches:
        if d_matches[head]:
            value = dict(value_template)
            value['region'] = country
            value['indID'] = "wikipedia:" + head
            value['source'] = wikibase % (country, d_matches[head])
            value['value'] = value['source']
            print value
            orm.Value(**value).save()
Code example #15
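    # (Excerpt begins mid-function; alpha_code_header is defined earlier in the script.) Save UN M49 country names and numeric codes keyed by alpha-3 code, then assert that GBR was among them.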
    country_header = table.filter(contains_string("or area name")).assert_one()
    num_code_header = table.filter(contains_string("Numerical")).assert_one()
    countries = country_header.fill(xypath.DOWN)
    num_code_header = countries.shift(x=-1)
    alpha_j = alpha_code_header.junction(countries)
    num_j = alpha_code_header.junction(num_code_header)
    alphas = [[x.value.strip() for x in row[1:]] for row in alpha_j]
    nums = [[x.value.strip() for x in row[1:]] for row in num_j]
    # 729 - Sudan - SDN
    alphas.extend([['Sudan', 'SDN']])
    nums.extend([['729', 'SDN']])
    v_template = {
        'dsID': 'm49',
        'period': updated,
        'source': 'http://unstats.un.org/unsd/methods/m49/m49alpha.htm',
        'is_number': False
    }
    builder = []
    for entry in alphas:
        v = dict(v_template)
        v.update({'value': entry[0], 'region': entry[1], 'indID': 'm49-name'})
        orm.Value(**v).save()
        if 'GBR' in repr(v):
            gb = True
    for entry in nums:
        v = dict(v_template)
        v.update({'value': entry[0], 'region': entry[1], 'indID': 'm49-num'})
        orm.Value(**v).save()
orm.session.commit()
assert gb
Code example #16
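# (Excerpt begins mid-dict; the opening of the value template is cut off.) Scrape the ECHO GNA index workbook and save the vulnerability (gna-vi) and crisis (gna-ci) index values per ISO3 country code.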
    "dsID": "echo",
    "period": 2012,
    "source": baseurl,
    "is_number": True
}

xls_raw = dl.grab(baseurl)
mt = messytables.excel.XLSTableSet(xls_raw).tables[0]
assert mt.name == "GNA Final Index (rank)"
xy = xypath.Table.from_messy(mt)
countries = xy.filter("ISO3").assert_one().fill(xypath.DOWN)
vuln_h = xy.filter("GNA Vulnerability Index (VI)").assert_one()
crisis_h = xy.filter("GNA Crisis Index (CI)").assert_one()

headerheader(xy.filter("ISO3").assert_one(), xypath.DOWN, xypath.RIGHT)

big = {
    'region':
    headerheader(xy.filter("ISO3").assert_one(), xypath.DOWN, xypath.RIGHT),
    'indID': {
        'gna-vi': vuln_h.fill(xypath.DOWN),
        'gna-ci': crisis_h.fill(xypath.DOWN)
    }
}

for olap_row in xypath.xyzzy.xyzzy(xy, big, valuename="value"):
    print olap_row
    full_olap = dict(value_template)
    full_olap.update(olap_row)
    orm.Value(**full_olap).save()
Code example #17
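    # (Excerpt begins mid-generator.) Yield an AccuWeather country page URL per country code, then save each URL as an "accuweather_url" value.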
    for reg in regions:
        j = requests.get(baseindexurl % reg).json()
        for country in j['Countries']:
            yield {
                'region': country['Code'],
                'value':
                baseleafurl % (country['Code'], country['OfficialName'])
            }


print list(accuweather())

orm.DataSet(dsID="accuweather",
            last_updated=None,
            last_scraped=orm.now(),
            name="Accuweather").save()

orm.Indicator(indID="accuweather_url", name="AccuWeather URL", units="").save()

valuetemplate = {
    'dsID': 'accuweather',
    'indID': 'accuweather_url',
    'period': None,
    'source': 'http://www.accuweather.com'
}

for datarow in accuweather():
    olap_row = dict(valuetemplate)
    olap_row.update(datarow)
    orm.Value(**olap_row).save()
Code example #18
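    # (Excerpt begins mid-function.) Collect ACLED event rows from the linked spreadsheets, skip non-violent event types, and save the number of events per (year, country) as indicator PVX040.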
    root = lxml.html.fromstring(html)
    root.make_links_absolute(baseurl)
    return root.xpath(
        "//div[@id='content']//article//a[contains(text(),'xls')]/@href")


def keyfunc(item):
    return (item['YEAR'], item['COUNTRY'])


events = []
for url in geturls():
    for row in parsesheet(url):
        if row['EVENT_TYPE'].strip() in NONVIOLENT:
            print row['EVENT_TYPE']  ## TODO: not working!!!!
            continue
        # exit()  # leftover debug call; with it in place, events.append(row) was never reached
        events.append(row)
sorted_e = sorted(events, key=keyfunc)

for item in itertools.groupby(sorted_e, keyfunc):
    value_item = {
        'dsID': 'acled',
        'indID': 'PVX040',
        'period': item[0][0],
        'region': item[0][1],
        'value': len(list(item[1])),
        'source': 'http://www.acleddata.com/data/types-and-groups/'
    }
    orm.Value(**value_item).save()