def save_dataset():
    dataset = {
        "dsID": DSID,
        "last_updated": None,
        "last_scraped": orm.now(),
        "name": "United Nations Office on Drugs and Crime"
    }
    orm.DataSet(**dataset).save()
Example #2
0
def main():
    for sheet in spreadsheets:
        print sheet
        shortname = sheet.split('/')[-1].split('.')[0]
        dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0]
        year_text, = re.findall('\d{4}', dsID)
        dataset = {
            "dsID": dsID,
            "last_updated": year_text,
            "last_scraped": orm.now(),
            "name": "esa-unpd"
        }

        orm.DataSet(**dataset).save()
        indicator = {"indID": shortname, "name": shortname, "units": ''}
        # we replace the indicator name, so not saving now.
        # orm.Indicator(**indicator).save()
        value_template = {"dsID": dsID, "is_number": True, "source": sheet}

        raw = dl.grab(sheet)
        mtables = messytables.any.any_tableset(raw)
        names = [x.name for x in mtables.tables]
        if 'ESTIMATES' in names:
            mt = mtables['ESTIMATES']
        else:
            mt = mtables['PROPORTION-URBAN']
        table = xypath.Table.from_messy(mt)

        filestring = table.filter(
            re.compile("File[^:]*:.*")).assert_one().value
        indicator['name'], indicator['units'] = parse_file_string(filestring)
        print indicator['name']
        orm.Indicator(**indicator).save()

        region_header = table.filter(
            re.compile("Major area, region, country or area.*")).assert_one()
        ccode_header = table.filter(re.compile("Country.code")).assert_one()
        regions = region_header.fill(xypath.DOWN)
        years = ccode_header.fill(xypath.RIGHT)
        for region_cell, year_cell, value_cell in regions.junction(years):
            value = dict(value_template)
            value['indID'] = indicator['indID']
            value['region'] = region_cell.value
            year_value = year_cell.value
            if isinstance(year_value, basestring) and '-' in year_value:
                year1, _, year2 = year_value.partition('-')
                year_count = int(year2) - int(year1)
                assert year_count == 5
                year_value = "%s/P%dY" % (year1, year_count)
            value['period'] = year_value
            value['value'] = value_cell.value
            orm.Value(**value).save()
            #print value
    orm.session.commit()
Example #3
0
def doit(targets, names, year):
    # country_cells: we used to assert_one(), but sometimes there's two!
    country_cells = table.filter('iso').fill(xypath.DOWN)
    country_cells = country_cells - country_cells.filter('iso')  # remove other
    if not country_cells: print "no countries"
    country_year_filter = country_cells.filter(
        lambda b: b.shift(xypath.RIGHT).value == year)
    if not country_year_filter: print "no countries for ", year
    target_cells = table.filter(lambda b: b.value in targets)
    if not target_cells: print "didn't find ", targets

    value = {
        'dsID': 'emdat',
        'period': "%s/P1Y" % (year),
        'source': url,
        'is_number': True
    }

    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_dam':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()
    for cname, one_country_cells in itertools.groupby(country_year_filter,
                                                      lambda b: b.value):
        value['region'] = cname
        one_country_bag = xypath.Bag.from_list(one_country_cells, name=cname)
        for target_cell in target_cells:
            j = one_country_bag.junction(target_cell)
            value['indID'] = 'emdat:%s' % target_cell.value
            value['value'] = sum(int(x[2].value) for x in j)
            orm.Value(**value).save()
            print value
    orm.session.commit()
Example #4
0
def doit():
    # country_cells: we used to assert_one(), but sometimes there's two!

    dataset = {
        'dsID': 'emdat',
        'last_updated': None,
        'last_scraped': orm.now(),
        'name': 'EM-DAT'
    }
    orm.DataSet(**dataset).save()

    for i, t in enumerate(targets):
        indicator = {'indID': "emdat:%s" % t, 'name': names[i], 'units': 'uno'}
        if t == 'total_damage':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()

    for country in country_list():  # TODO country_list
        print country
        raw = dl.grab(url.format(country))
        m_tables = messytables.any.any_tableset(raw)
        mt, = m_tables.tables
        table = xypath.Table.from_messy(mt)
        yr = table.filter('year').assert_one()
        years = yr.fill(xypath.DOWN)
        cats = yr.fill(xypath.RIGHT)
        for year, cat, value in years.junction(cats):
            value = {
                'dsID': 'emdat',
                'region': country,
                'indID': 'emdat:{}'.format(cat.value),
                'period': '{}/P1Y'.format(year.value),
                'value': value.value,
                'source': url,
                'is_number': True
            }
            orm.Value(**value).save()
    orm.session.commit()
    mt, = messytables.any.any_tableset(handle).tables
    table = xypath.Table.from_messy(mt)
    country_anchor = table.filter("Country").assert_one()
    years = country_anchor.fill(xypath.RIGHT).filter(re.compile("\d\d\d\d"))
    countries = country_anchor.fill(xypath.DOWN)
    indicator = table.filter("Series").shift(xypath.DOWN).value
    SEPARATOR = ', '
    if SEPARATOR in indicator:
        i_name = SEPARATOR.join(indicator.split(SEPARATOR)[:-1])
        i_unit = indicator.split(SEPARATOR)[-1]
    else:
        i_name = indicator
        i_unit = ''
    value_template['indID'] = indicator
    assert i_name
    indicator = {'indID': indicator, 'name': i_name, 'units': i_unit}
    orm.Indicator(**indicator).save()
    # countries also gets some rubbish, but junction will ignore it.
    for c_cell, y_cell, v_cell in countries.junction(years):
        value = dict(value_template)
        value['region'] = c_cell.value
        value['period'] = y_cell.value
        value['value'] = v_cell.value
        orm.Value(**value).save()


orm.DataSet(**dataset).save()
for ind in indicators:
    do_indicator(ind)
orm.session.commit()
Example #6
0
    'REGION (DISPLAY)': 'x_supregion',
    'GHO (DISPLAY)': 'indname'
}
"""Value: dsID, region, indID, period, value, source, is_number
   DataSet: dsID, last_updated, last_scraped, name
   Indicator: indID, name, units
   """

dataset_template = {
    "dsID": "athena-api",
    "last_updated": None,
    "last_scraped": orm.now(),
    "name": "WHO Athena API"
}

orm.DataSet(**dataset_template).save()

indicators = list(
    "CM_01,CM_02,CM_03,DEVICES09,DEVICES22,HIV_0000000020,MALARIA001,MDG_0000000001,MDG_0000000003,MDG_0000000005,MDG_0000000005_AGE1519,MDG_0000000006,MDG_0000000006_AGE1519,MDG_0000000007,MDG_0000000010,MDG_0000000011,MDG_0000000013,MDG_0000000014,MDG_0000000017,MDG_0000000020,MDG_0000000023,MDG_0000000025,MDG_0000000025_AGE1519,MDG_0000000026,MDG_0000000029,MDG_0000000031,MDG_0000000033,MDG_0000000034,MH_17,MH_18,MH_6,SA_0000001688,TB_1,TB_tot_newrel,TOBACCO_0000000192,WHOSIS_000001,WHOSIS_000003,WHOSIS_000005,WHOSIS_000011,WHOSIS_000015,WHS10_1,WHS10_8,WHS10_9,WHS2_138,WHS2_162,WHS2_163,WHS2_164,WHS2_165,WHS2_166,WHS2_167,WHS2_168,WHS2_170,WHS2_171,WHS2_172,WHS2_173,WHS2_174,WHS2_3070_all,WHS2_3070_cancer,WHS2_3070_cdd,WHS2_3070_chronic,WHS2_513,WHS2_514,WHS2_515,WHS2_516,WHS2_523,WHS3_40,WHS3_41,WHS3_42,WHS3_43,WHS3_45,WHS3_46,WHS3_47,WHS3_48,WHS3_49,WHS3_50,WHS3_51,WHS3_52,WHS3_53,WHS3_55,WHS3_56,WHS3_57,WHS3_62,WHS4_100,WHS4_106,WHS4_107,WHS4_108,WHS4_111,WHS4_111_AGE1519,WHS4_115,WHS4_117,WHS4_124,WHS4_128,WHS4_129,WHS4_154,WHS4_2530,WHS6_101,WHS6_102,WHS6_116,WHS6_123,WHS6_125,WHS6_127,WHS6_136,WHS6_140,WHS6_144,WHS6_148,WHS6_150,WHS6_517,WHS6_518,WHS6_519,WHS6_520,WHS7_103,WHS7_104,WHS7_105,WHS7_108,WHS7_113,WHS7_120,WHS7_134,WHS7_139,WHS7_143,WHS7_147,WHS7_149,WHS7_156,WHS8_110,WHS9_85,WHS9_86,WHS9_88,WHS9_89,WHS9_90,WHS9_91,WHS9_92,WHS9_93,WHS9_95,WHS9_96,WHS9_97,WHS9_CBR,WHS9_CDR,WHS9_CS,CM_01,CM_02,CM_03,DEVICES09,DEVICES22,HIV_0000000020,MALARIA001,MDG_0000000001,MDG_0000000003,MDG_0000000005,MDG_0000000005_AGE1519,MDG_0000000006,MDG_0000000006_AGE1519,MDG_0000000007,MDG_0000000010,MDG_0000000011,MDG_0000000013,MDG_0000000014,MDG_0000000017,MDG_0000000020,MDG_0000000023,MDG_0000000025,MDG_0000000025_AGE1519,MDG_0000000026,MDG_0000000029,MDG_0000000031,MDG_0000000033,MDG_0000000034,MH_17,MH_18,MH_6,SA_0000001688,TB_1,TB_tot_newrel,TOBACCO_0000000192,WHOSIS_000001,WHOSIS_000003,WHOSIS_000005,WHOSIS_000011,WHOSIS_000015,WHS10_1,WHS10_8,WHS10_9,WHS2_138,WHS2_162,WHS2_163,WHS2_164,WHS2_165,WHS2_166,WHS2_167,WHS2_168,WHS2_170,WHS2_171,WHS2_172,WHS2_173,WHS2_174,WHS2_3070_all,WHS2_3070_cancer,WHS2_3070_cdd,WHS2_3070_chronic,WHS2_513,WHS2_514,WHS2_515,WHS2_516,WHS2_523,WHS3_40,WHS3_41,WHS3_42,WHS3_43,WHS3_45,WHS3_46,WHS3_47,WHS3_48,WHS3_49,WHS3_50,WHS3_51,WHS3_52,WHS3_53,WHS3_55,WHS3_56,WHS3_57,WHS3_62,WHS4_100,WHS4_106,WHS4_107,WHS4_108,WHS4_111,WHS4_111_AGE1519,WHS4_115,WHS4_117,WHS4_124,WHS4_128,WHS4_129,WHS4_154,WHS4_2530,WHS6_101,WHS6_102,WHS6_116,WHS6_123,WHS6_125,WHS6_127,WHS6_136,WHS6_140,WHS6_144,WHS6_148,WHS6_150,WHS6_517,WHS6_518,WHS6_519,WHS6_520,WHS7_103,WHS7_104,WHS7_105,WHS7_108,WHS7_113,WHS7_120,WHS7_134,WHS7_139,WHS7_143,WHS7_147,WHS7_149,WHS7_156,WHS8_110,WHS9_85,WHS9_86,WHS9_88,WHS9_89,WHS9_90,WHS9_91,WHS9_92,WHS9_93,WHS9_95,WHS9_96,WHS9_97,WHS9_CBR,WHS9_CDR,WHS9_CS,WHS9_86,WHS9_88,WHS9_89,WHS9_92,WHS9_96,WHS9_97,WHS9_90,WHS3_48,MALARIA002,MALARIA001,MALARIA003,MDG_0000000013,MDG_0000000014,MENING_2,MENING_1,MENING_3,CHOLERA_0000000001,CHOLERA_0000000002,CHOLERA_0000000003,MDG_0000000027,WHOSIS_000009,NUTRITION_564,sba,sba3,sba5,anc1,anc13,MDG_0000000015,MDG_0000000021,anc4,anc43,anc45"
    .split(","))

indicators = reversed(indicators)

baseurl = "http://apps.who.int/gho/athena/data/GHO/%s.csv?profile=verbose"


def units(s):
    # print repr(s)
    z = re.findall("(.*)\(([^)]*)\)$", s)
    if z:
        assert len(z) == 1
    regions = "afr ant arc asi cac eur mea nam ocn sam".split(" ")

    for reg in regions:
        j = requests.get(baseindexurl % reg).json()
        for country in j['Countries']:
            yield {
                'region': country['Code'],
                'value':
                baseleafurl % (country['Code'], country['OfficialName'])
            }


print list(accuweather())

orm.DataSet(dsID="accuweather",
            last_updated=None,
            last_scraped=orm.now(),
            name="Accuweather").save()

orm.Indicator(indID="accuweather_url", name="AccuWeather URL", units="").save()

valuetemplate = {
    'dsID': 'accuweather',
    'indID': 'accuweather_url',
    'period': None,
    'source': 'http://www.accuweather.com'
}

for datarow in accuweather():
    olap_row = dict(valuetemplate)
    olap_row.update(datarow)
    orm.Value(**olap_row).save()