Ejemplos de grab en Python, ejemplos de dl.grab en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: faostat.py Proyecto: OCHA-DAP/dap-scrapers

def do_zip(url):
    """Load the single table inside the zip archive at *url* and save values.

    The first row (``y == 0``) is used as the header row.  Cells under
    "Item" equal to "Grand Total + (Total)" are paired (via
    ``select_other`` on equal ``y``) with cells under "Unit" equal to
    "kcal/capita/day".  For every selected item cell, one ``orm.Value`` is
    saved per year header (cells right of "Country" matching ``Y`` + four
    digits), and the session is committed once at the end.

    Raises ``AssertionError`` if any of the selections is empty; unpacking
    fails if the archive does not contain exactly one table.
    """
    print(url)
    # Unpacking enforces that the archive holds exactly one table.
    (messy_table,) = list(messytables.zip.ZIPTableSet(dl.grab(url)).tables)
    xy = xypath.Table.from_messy(messy_table)
    del messy_table  # drop the reference once the xypath table exists
    print("...got")

    header_row = xy.filter(lambda cell: cell.y == 0)
    country = header_row.filter("Country").assert_one()

    item_column = header_row.filter("Item").assert_one().fill(xypath.DOWN)
    items = item_column.filter("Grand Total + (Total)")
    unit_column = header_row.filter("Unit").assert_one().fill(xypath.DOWN)
    units = unit_column.filter("kcal/capita/day")
    # Presumably keeps the item cells that share a row with a unit cell.
    filtered_items = items.select_other(lambda a, b: a.y == b.y, units)

    year_pattern = re.compile("Y\d\d\d\d$")
    years = country.fill(xypath.RIGHT).filter(year_pattern)

    # Every selection must be non-empty before anything is written.
    for selection in (items, units, filtered_items, years):
        assert selection

    for item_cell in filtered_items:
        values = dict(valuetemplate)
        values['source'] = url
        # junction() yields triples; the third element supplies the region.
        (country_triple,) = item_cell.junction(country)
        values['region'] = country_triple[2].value
        for _, year_cell, value_cell in item_cell.junction(years):
            values['period'] = year_cell.value.replace("Y", "")
            values['value'] = value_cell.value
            orm.Value(**values).save()
        print(values)
    orm.session.commit()

Ejemplo n.º 2

0

Mostrar archivo

def parsesheet(url):
    """Yield one ``OrderedDict`` per data row of the first sheet at *url*.

    The first row supplies the keys; each later row is zipped against those
    keys (``zip`` truncates to the shorter of the two).
    """
    workbook = messytables.excel.XLSTableSet(dl.grab(url))
    first_sheet = workbook.tables[0]
    column_names = None
    for position, cells in enumerate(first_sheet):
        cell_values = [cell.value for cell in cells]
        if position == 0:
            # Header row: remember the names, emit nothing.
            column_names = cell_values
        else:
            yield OrderedDict(zip(column_names, cell_values))

Ejemplo n.º 3

0

Mostrar archivo

Archivo: unicef.py Proyecto: OCHA-DAP/dap-scrapers

def getstats(url, country="PLACEHOLDER"):
    """Save the "EPI" indicator found in the tables downloaded from *url*.

    Tables without a first-column (``x == 0``) cell containing "EPI" are
    skipped.  In the others exactly one such cell is required, along with
    exactly one cell containing 'to the top'; the third element of their
    junction triple provides the value.  An ``orm.Indicator`` and an
    ``orm.Value`` (with ``region`` set to *country*) are saved per indicator
    cell.  A message is printed when the total number of saves is not 1.
    """
    tableset = messytables.any.any_tableset(dl.grab(url))
    save_count = 0
    for messy in tableset.tables:
        table = xypath.Table.from_messy(messy)
        epi_cells = table.filter(
            lambda cell: cell.x == 0 and "EPI" in cell.value)
        if not epi_cells:
            continue
        assert len(epi_cells) == 1
        (top_cell,) = table.filter(lambda cell: 'to the top' in cell.value)
        (junction,) = epi_cells.junction(top_cell)
        for epi_cell in epi_cells:
            parts = split_ind(epi_cell.value)
            record = dict(value_template)
            record.update(source=url,
                          region=country,
                          value=junction[2].value)
            # The indicator's name is stored as its ID, as in the original.
            orm.Indicator(indID=parts['indID'],
                          name=parts['indID'],
                          units=parts['units']).save()
            record['indID'] = parts['indID']
            orm.Value(**record).save()
            save_count += 1
    if save_count != 1:
        print("huh, %d saves for %r" % (save_count, url))

Ejemplo n.º 4

0

Mostrar archivo

Archivo: mdg.py Proyecto: luiscape/dap-scrapers

def do_indicator(ind="566"):
    """Download series *ind* from the mdgs.un.org CSV export and save it.

    The cell below "Series" is used as the indicator ID.  If it contains
    ', ', the text after the last ', ' becomes the units and the rest the
    name; otherwise the whole text is the name and the units are empty.
    One ``orm.Value`` is saved per (country, year) junction.

    Side effect: the module-level ``value_template`` has its 'source' and
    'indID' entries overwritten.
    """
    baseurl = "http://mdgs.un.org/unsd/mdg/Handlers/ExportHandler.ashx?Type=Csv&Series=%s"
    url = baseurl % ind
    value_template['source'] = url
    # Unpacking enforces a single table in the download.
    (messy,) = messytables.any.any_tableset(dl.grab(url)).tables
    table = xypath.Table.from_messy(messy)

    country_anchor = table.filter("Country").assert_one()
    years = country_anchor.fill(xypath.RIGHT).filter(re.compile("\d\d\d\d"))
    countries = country_anchor.fill(xypath.DOWN)

    series_label = table.filter("Series").shift(xypath.DOWN).value
    SEPARATOR = ', '
    if SEPARATOR in series_label:
        # Split on the last separator only: "name, more, unit".
        i_name, _, i_unit = series_label.rpartition(SEPARATOR)
    else:
        i_name, i_unit = series_label, ''
    value_template['indID'] = series_label
    assert i_name
    orm.Indicator(indID=series_label, name=i_name, units=i_unit).save()

    # countries also gets some rubbish, but junction will ignore it.
    for c_cell, y_cell, v_cell in countries.junction(years):
        row = dict(value_template)
        row.update(region=c_cell.value,
                   period=y_cell.value,
                   value=v_cell.value)
        orm.Value(**row).save()

Ejemplo n.º 5

0

Mostrar archivo

Archivo: acled.py Proyecto: OCHA-DAP/dap-scrapers

def parsesheet(url):
    """Generate an ``OrderedDict`` for each non-header row of the first sheet.

    Row 0 of the first table in the Excel file at *url* provides the keys;
    every following row is paired with them positionally.
    """
    sheet = messytables.excel.XLSTableSet(dl.grab(url)).tables[0]
    keys = None
    for row_number, row in enumerate(sheet):
        values = [cell.value for cell in row]
        if row_number > 0:
            yield OrderedDict(zip(keys, values))
        else:
            # First row holds the column headers.
            keys = values

Ejemplo n.º 6

0

Mostrar archivo

Archivo: unicef.py Proyecto: timofonic-otherdevstuff/dap-scrapers

def getstats(url, country="PLACEHOLDER"):
    """Store the "EPI" indicator value from the tables fetched from *url*.

    For each table that has a cell in column 0 containing "EPI" (exactly
    one is asserted), the value is read from the third element of the
    junction between that cell and the single cell containing
    'to the top'.  An ``orm.Indicator`` and an ``orm.Value`` are saved,
    with *country* as the region.  Prints a notice unless exactly one
    save happened overall.
    """
    tables = messytables.any.any_tableset(dl.grab(url)).tables
    saved = 0
    for messy_table in tables:
        table = xypath.Table.from_messy(messy_table)
        indicator_cells = table.filter(
            lambda c: c.x == 0 and "EPI" in c.value)
        if not indicator_cells:
            continue
        assert len(indicator_cells) == 1
        (top,) = table.filter(lambda c: 'to the top' in c.value)
        (triple,) = indicator_cells.junction(top)
        for cell in indicator_cells:
            pieces = split_ind(cell.value)
            to_save = dict(value_template)
            to_save['source'] = url
            to_save['region'] = country
            to_save['value'] = triple[2].value
            # Name mirrors the ID, exactly as the original stored it.
            orm.Indicator(indID=pieces['indID'],
                          name=pieces['indID'],
                          units=pieces['units']).save()
            to_save['indID'] = pieces['indID']
            orm.Value(**to_save).save()
            saved += 1
    if saved != 1:
        print("huh, %d saves for %r" % (saved, url))

Ejemplo n.º 7

0

Mostrar archivo

def do_zip(url):
    """Load the single table inside the zip archive at *url* and save values.

    Header cells are those with ``y == 0``.  Cells under "Item" equal to
    "Grand Total + (Total)" are paired (``select_other`` on equal ``y``)
    with cells under "Element" equal to "Food supply (kcal/capita/day)".
    For each selected item cell one ``orm.Value`` is saved per year header
    (cells right of "Country" matching ``Y`` + four digits); the session is
    committed at the end.
    """
    print(url)
    # Unpacking enforces that the archive holds exactly one table.
    (messy_table,) = list(messytables.zip.ZIPTableSet(dl.grab(url)).tables)
    xy = xypath.Table.from_messy(messy_table)
    del messy_table  # drop the reference once the xypath table exists
    print("...got")

    header_row = xy.filter(lambda cell: cell.y == 0)
    country = header_row.filter("Country").assert_one()

    item_column = header_row.filter("Item").assert_one().fill(xypath.DOWN)
    items = item_column.filter("Grand Total + (Total)")
    element_column = header_row.filter("Element").assert_one().fill(xypath.DOWN)
    elements = element_column.filter("Food supply (kcal/capita/day)")
    filtered_items = items.select_other(lambda a, b: a.y == b.y, elements)

    years = country.fill(xypath.RIGHT).filter(re.compile("Y\d\d\d\d$"))

    for item_cell in filtered_items:
        values = dict(valuetemplate)
        values['source'] = url
        # junction() yields triples; the third element supplies the region.
        (country_triple,) = item_cell.junction(country)
        values['region'] = country_triple[2].value
        for _, year_cell, value_cell in item_cell.junction(years):
            values['period'] = year_cell.value.replace("Y", "")
            values['value'] = value_cell.value
            orm.Value(**values).save()
    orm.session.commit()

Ejemplo n.º 8

0

Mostrar archivo

Archivo: mdg.py Proyecto: timofonic-otherdevstuff/dap-scrapers

def do_indicator(ind="566"):
    """Fetch CSV series *ind* from mdgs.un.org and persist indicator + values.

    The indicator ID is the text of the cell below "Series".  Its name and
    units are obtained by splitting that text at the last ', ' (no
    separator: name is the whole text, units are '').  Each junction of a
    cell below "Country" with a four-digit year header to its right is
    saved as an ``orm.Value``.

    Side effect: overwrites 'source' and 'indID' in the module-level
    ``value_template``.
    """
    baseurl = "http://mdgs.un.org/unsd/mdg/Handlers/ExportHandler.ashx?Type=Csv&Series=%s"
    url = baseurl % ind
    value_template['source'] = url
    tableset = messytables.any.any_tableset(dl.grab(url))
    (only_table,) = tableset.tables  # exactly one table expected
    table = xypath.Table.from_messy(only_table)

    country_anchor = table.filter("Country").assert_one()
    year_headers = country_anchor.fill(xypath.RIGHT)
    years = year_headers.filter(re.compile("\d\d\d\d"))
    countries = country_anchor.fill(xypath.DOWN)

    ind_id = table.filter("Series").shift(xypath.DOWN).value
    SEPARATOR = ', '
    if SEPARATOR in ind_id:
        i_name, _, i_unit = ind_id.rpartition(SEPARATOR)
    else:
        i_name = ind_id
        i_unit = ''
    value_template['indID'] = ind_id
    assert i_name
    orm.Indicator(indID=ind_id, name=i_name, units=i_unit).save()

    # countries also gets some rubbish, but junction will ignore it.
    for c_cell, y_cell, v_cell in countries.junction(years):
        entry = dict(value_template)
        entry['region'] = c_cell.value
        entry['period'] = y_cell.value
        entry['value'] = v_cell.value
        orm.Value(**entry).save()

Ejemplo n.º 9

0

Mostrar archivo

Archivo: esa.py Proyecto: OCHA-DAP/dap-scrapers

def main():
    """Scrape every URL in ``spreadsheets`` into DataSet/Indicator/Value rows.

    For each sheet: a dataset ID is derived from the file name, the
    'ESTIMATES' tab (or 'PROPORTION-URBAN' when absent) is loaded, the
    indicator name and units come from ``parse_file_string`` applied to the
    single "File...:" cell, and one ``orm.Value`` is saved per
    (region, year) junction.  Year headers of the form "A-B" must span five
    years and are stored as "A/P5Y".  The session is committed once after
    all sheets.
    """
    for sheet in spreadsheets:
        shortname = sheet.split('/')[-1].split('.')[0]
        dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0]
        # Exactly one four-digit run is expected in the dataset ID.
        (year_text,) = re.findall('\d{4}', dsID)
        orm.DataSet(dsID=dsID,
                    last_updated=year_text,
                    last_scraped=orm.now(),
                    name="esa-unpd").save()

        mtables = messytables.any.any_tableset(dl.grab(sheet))
        tab_names = [tab.name for tab in mtables.tables]
        if 'ESTIMATES' in tab_names:
            chosen_tab = 'ESTIMATES'
        else:
            chosen_tab = 'PROPORTION-URBAN'
        table = xypath.Table.from_messy(mtables[chosen_tab])

        filestring = table.filter(re.compile("File[^:]*:.*")).assert_one().value
        # The indicator is saved only now, with the name parsed from the sheet.
        ind_name, ind_units = parse_file_string(filestring)
        print(ind_name)
        orm.Indicator(indID=shortname, name=ind_name, units=ind_units).save()

        region_header = table.filter(
            re.compile("Major area, region, country or area.*")).assert_one()
        ccode_header = table.filter(re.compile("Country.code")).assert_one()
        regions = region_header.fill(xypath.DOWN)
        years = ccode_header.fill(xypath.RIGHT)
        for region_cell, year_cell, value_cell in regions.junction(years):
            period = year_cell.value
            if isinstance(period, basestring) and '-' in period:
                first_year, _, last_year = period.partition('-')
                span = int(last_year) - int(first_year)
                assert span == 5
                period = "%s/P%dY" % (first_year, span)
            orm.Value(dsID=dsID,
                      is_number=True,
                      source=sheet,
                      indID=shortname,
                      region=region_cell.value,
                      period=period,
                      value=value_cell.value).save()
    orm.session.commit()

Ejemplo n.º 10

0

Mostrar archivo

def main():
    """Scrape every URL in ``spreadsheets`` and store the results via ``orm``.

    Per sheet: prints the URL, saves a DataSet whose ID is built from the
    file name, loads the 'ESTIMATES' tab (falling back to
    'PROPORTION-URBAN'), saves an Indicator named via
    ``parse_file_string``, then saves one Value for each (region, year)
    junction.  "A-B" year headers are asserted to cover five years and are
    rewritten as "A/P5Y".  A single commit follows the loop.
    """
    file_pattern = re.compile("File[^:]*:.*")
    region_pattern = re.compile("Major area, region, country or area.*")
    ccode_pattern = re.compile("Country.code")

    for sheet in spreadsheets:
        print(sheet)
        shortname = sheet.split('/')[-1].split('.')[0]
        dsID = 'esa-unpd-' + shortname.replace('_', '-').split('-')[0]
        (year_text,) = re.findall('\d{4}', dsID)  # exactly one year expected
        orm.DataSet(dsID=dsID,
                    last_updated=year_text,
                    last_scraped=orm.now(),
                    name="esa-unpd").save()

        value_template = {"dsID": dsID, "is_number": True, "source": sheet}

        mtables = messytables.any.any_tableset(dl.grab(sheet))
        names = [t.name for t in mtables.tables]
        tab = 'ESTIMATES' if 'ESTIMATES' in names else 'PROPORTION-URBAN'
        table = xypath.Table.from_messy(mtables[tab])

        filestring = table.filter(file_pattern).assert_one().value
        # The indicator is saved only after its real name has been parsed.
        display_name, units = parse_file_string(filestring)
        print(display_name)
        orm.Indicator(indID=shortname, name=display_name, units=units).save()

        regions = table.filter(region_pattern).assert_one().fill(xypath.DOWN)
        years = table.filter(ccode_pattern).assert_one().fill(xypath.RIGHT)
        for region_cell, year_cell, value_cell in regions.junction(years):
            value = dict(value_template)
            value['indID'] = shortname
            value['region'] = region_cell.value
            year_value = year_cell.value
            if isinstance(year_value, basestring) and '-' in year_value:
                start, _, end = year_value.partition('-')
                length = int(end) - int(start)
                assert length == 5
                year_value = "%s/P%dY" % (start, length)
            value['period'] = year_value
            value['value'] = value_cell.value
            orm.Value(**value).save()
    orm.session.commit()

Ejemplo n.º 11

0

Mostrar archivo

def do_indicator(ind):
    """Download the CSV for indicator *ind* and save an Indicator/Value per row.

    The CSV is fetched from ``baseurl % ind``; its first row is the header.
    For each non-empty data row the indicator name and unit come from
    ``units(row['GHO (DISPLAY)'])`` and the ID from ``row['GHO (CODE)']``.
    For each of SEX, RESIDENCEAREATYPE, EDUCATIONLEVEL and WEALTHQUINTILE
    whose "(CODE)" column is present and non-empty, "(<dim>=<code>)" is
    appended to the ID and " - <display>" to the name.

    Raises:
        RuntimeError: if the download has no header row.
    """
    print("indicator: %s" % (ind,))
    fh = dl.grab(baseurl % ind)
    mt, = messytables.commas.CSVTableSet(fh).tables
    mt_list = list(mt)
    # An empty download has no first row at all; treat it like empty headers.
    headers = mt_list[0] if mt_list else []
    if not headers:
        print("Error getting headers from  %s" % (ind,))
        raise RuntimeError("No header in {}".format(ind))
    # logging.warn is a deprecated alias of logging.warning.
    logging.warning("headers {!r}".format(headers))

    for row in mt_list[1:]:
        if len(row) == 0:
            continue  # skip empty row

        rowdict = {head.value: cell.value for head, cell in zip(headers, row)}
        try:
            name, unit = units(rowdict['GHO (DISPLAY)'])
        except Exception:
            # Dump the raw download to help diagnose the bad row, then re-raise.
            fh.seek(0)
            print(fh.read())
            raise
        indID = rowdict['GHO (CODE)']
        for lookup in ("SEX", "RESIDENCEAREATYPE", "EDUCATIONLEVEL",
                       "WEALTHQUINTILE"):
            lookup_code = lookup + " (CODE)"
            lookup_name = lookup + " (DISPLAY)"
            # Only dimensions that are both present and non-empty qualify.
            if rowdict.get(lookup_code):
                indID = indID + "({}={})".format(lookup, rowdict[lookup_code])
                name = name + " - " + rowdict[lookup_name]

        value_dict = {
            "value": rowdict['Display Value'],
            "period": rowdict['YEAR (DISPLAY)'],
            "indID": indID,
            "region": rowdict["COUNTRY (CODE)"],
            "dsID": "athena-api",
            "source": baseurl % ind,
            "is_number": True
        }

        indicator_dict = {'indID': indID, 'name': name, 'units': unit}

        orm.Indicator(**indicator_dict).save()
        orm.Value(**value_dict).save()

Ejemplo n.º 12

0

Mostrar archivo

Archivo: faosec.py Proyecto: OCHA-DAP/dap-scrapers

def do_file(url="http://bit.ly/14FRxGV"):
    """Download the Excel file at *url* and save one Value per table cell.

    ``discover_table`` picks the xypath table out of the workbook.  The
    cell(s) equal to "(home)" anchor the layout: cells to the right are
    years, cells below are countries, and each junction is saved with the
    year passed through ``niceyear``.  Commits once at the end.
    """
    workbook = messytables.excel.XLSTableSet(dl.grab(url))
    xy = discover_table(workbook)
    print("...got")

    home = xy.filter("(home)")
    year_cells = home.fill(xypath.RIGHT)
    country_cells = home.fill(xypath.DOWN)
    for country, year, value in country_cells.junction(year_cells):
        record = dict(valuetemplate)
        record.update(
            source=url,
            region=country.value,
            period=niceyear(year.value),  # TODO
            value=value.value,
        )
        orm.Value(**record).save()
    orm.session.commit()

Ejemplo n.º 13

0

Mostrar archivo

Archivo: faosec.py Proyecto: timofonic-otherdevstuff/dap-scrapers

def do_file(url="http://bit.ly/14FRxGV"):
    """Fetch the workbook at *url* and store every (country, year) value.

    The table returned by ``discover_table`` is anchored on "(home)":
    years are filled to the right of it, countries below it.  Each
    junction becomes an ``orm.Value`` whose period is ``niceyear`` of the
    year header.  The session is committed after the loop.
    """
    fh = dl.grab(url)
    xy = discover_table(messytables.excel.XLSTableSet(fh))
    print("...got")

    anchor = xy.filter("(home)")
    junctions = anchor.fill(xypath.DOWN).junction(anchor.fill(xypath.RIGHT))
    for country_cell, year_cell, value_cell in junctions:
        values = dict(valuetemplate)
        values['source'] = url
        values['region'] = country_cell.value
        values['period'] = niceyear(year_cell.value)  # TODO
        values['value'] = value_cell.value
        orm.Value(**values).save()
    orm.session.commit()

Ejemplo n.º 14

0

Mostrar archivo

Archivo: worldbank.py Proyecto: timofonic-otherdevstuff/dap-scrapers

def getcountry(threeletter="PAK"):
    """Download the World Bank Excel export for *threeletter* and open it.

    Returns ``None`` early when ``dl.grab`` yields a falsy handle or when
    messytables raises ``ReadError`` (the error is printed).

    NOTE(review): the listing ends right after the download loop, so
    ``value`` and ``messy`` are unused in the visible code; the rest of
    the function is presumably missing from this excerpt.
    """
    print threeletter
    baseurl = "http://api.worldbank.org/v2/en/country/{}?downloadformat=excel"
    # NOTE(review): 'source' uses the lower-cased code while the download
    # below uses the code as given; confirm both URLs are equivalent.
    value = {'dsID': 'World Bank',
             'region': threeletter,
             'source': baseurl.format(threeletter.lower()),
             'is_number': True}

    # Every path through the body returns or breaks, so this loop runs once.
    while True:
        # Second argument is presumably a list of HTTP statuses to tolerate;
        # verify against dl.grab.
        fh = dl.grab(baseurl.format(threeletter), [404])
        if not fh:
            return
        try:
            messy = messytables.excel.XLSTableSet(fh)
            break  # success!
        except messytables.error.ReadError, e:
            print e
            return

Ejemplo n.º 15

0

Mostrar archivo

Archivo: enrol_undp.py Proyecto: OCHA-DAP/dap-scrapers

def getstats():
    """Scrape the UNDP gross-enrolment page and save one Value per cell.

    The first table on the page is used.  The single cell containing
    'Country' is the pivot: cells to its right and cells below it are
    junctioned, and each junction is saved with the stripped text of the
    below-pivot cell as region and the stripped junction text as value.
    The right-of-pivot (year) cell is not stored.  Asserts that at least
    one value was saved.
    """
    url = 'http://hdr.undp.org/en/content/combined-gross-enrolment-education-both-sexes'
    tableset = messytables.any.any_tableset(dl.grab(url))
    table = xypath.Table.from_messy(tableset.tables[0])

    (pivot,) = table.filter(lambda c: 'Country' in c.value)
    years = pivot.fill(xypath.RIGHT)
    countries = pivot.fill(xypath.DOWN)

    saves = 0
    for _year, country, value in years.junction(countries):
        output = dict(output_template)
        output.update(source=url,
                      region=country.value.strip(),
                      value=value.value.strip())
        orm.Value(**output).save()
        saves += 1
    assert saves

Ejemplo n.º 16

0

Mostrar archivo

def getstats():
    """Save the enrolment figures from the first table of the UNDP page.

    Exactly one cell containing 'Country' is required; it anchors a
    junction between the cells to its right and the cells beneath it.
    Each junction yields an ``orm.Value`` with stripped region and value
    text (the header cell to the right is unused).  Fails an assertion if
    nothing was saved.
    """
    url = 'http://hdr.undp.org/en/content/combined-gross-enrolment-education-both-sexes'
    handle = dl.grab(url)
    first_table = messytables.any.any_tableset(handle).tables[0]
    table = xypath.Table.from_messy(first_table)

    (anchor,) = table.filter(lambda cell: 'Country' in cell.value)
    across = anchor.fill(xypath.RIGHT)
    down = anchor.fill(xypath.DOWN)

    saved = 0
    for _, country_cell, value_cell in across.junction(down):
        output = dict(output_template)
        output['source'] = url
        output['region'] = country_cell.value.strip()
        output['value'] = value_cell.value.strip()
        orm.Value(**output).save()
        saved += 1
    assert saved

Ejemplo n.º 17

0

Mostrar archivo

Archivo: worldbank.py Proyecto: luiscape/dap-scrapers

def getcountry(threeletter="PAK"):
    """Download a country's World Bank metadata workbook and save its values.

    Returns ``None`` without saving when ``dl.grab`` gives a falsy handle.
    Otherwise the first sheet is searched for cells matching
    ``indicator_list``; the cells one column to their left are used as
    indicator IDs and are junctioned with the cells right of
    'Indicator Code'.  An ID of the form "name(units)" is split into name
    and units; otherwise the units are 'uno'.  Blank values are not saved.
    Finally the number of stored 'World Bank' values is printed and the
    session committed.
    """
    print(threeletter)
    baseurl = "http://api.worldbank.org/datafiles/%s_Country_MetaData_en_EXCEL.xls"
    source_url = baseurl % threeletter
    value = {'dsID': 'World Bank',
             'region': threeletter,
             'source': source_url,
             'is_number': True}

    fh = dl.grab(source_url, [404])
    if not fh:
        return
    messy = messytables.excel.XLSTableSet(fh)
    table = xypath.Table.from_messy(list(messy.tables)[0])

    indname = table.filter(is_in(indicator_list)).shift(x=-1)
    if len(indname) != len(indicator_list):
        print("missing indicators %s" % ([x.value for x in indname],))

    years = table.filter(equal_to('Indicator Code')).fill(xypath.RIGHT)
    for ind_cell, year_cell, value_cell in indname.junction(years):
        ind_id = ind_cell.value
        vdict = dict(value)
        vdict.update(indID=ind_id,
                     period=year_cell.value,
                     value=value_cell.value)

        nameunits = re.search('(.*)\((.*)\)', ind_id)
        if nameunits:
            ind_name, ind_units = nameunits.groups()
        else:
            ind_name, ind_units = ind_id, 'uno'
        Indicator(indID=ind_id, name=ind_name, units=ind_units).save()

        v = Value(**vdict)
        if not v.is_blank():
            v.save()
    print(len(session.query(Value).filter(Value.dsID == 'World Bank').all()))
    session.commit()

Ejemplo n.º 18

0

Mostrar archivo

Archivo: athena.py Proyecto: OCHA-DAP/dap-scrapers

def do_indicator(ind):
    """Download the CSV for indicator *ind* and save an Indicator/Value per row.

    The CSV is fetched from ``baseurl % ind``; its first row is the header.
    For each non-empty data row the indicator name and unit come from
    ``units(row['GHO (DISPLAY)'])`` and the ID from ``row['GHO (CODE)']``.
    For each of SEX, RESIDENCEAREATYPE, EDUCATIONLEVEL and WEALTHQUINTILE
    whose "(CODE)" column is present and non-empty, "(<dim>=<code>)" is
    appended to the ID and " - <display>" to the name.

    Raises:
        RuntimeError: if the download has no header row.  (Previously the
            interactive ``exit()`` helper was called, which ended the whole
            process with a success status, and an entirely empty download
            raised a bare ``IndexError``.)
    """
    print("indicator: %s" % (ind,))
    fh = dl.grab(baseurl % ind)
    mt, = messytables.commas.CSVTableSet(fh).tables
    mt_list = list(mt)
    # An empty download has no first row at all; treat it like empty headers.
    headers = mt_list[0] if mt_list else []
    if not headers:
        print(ind)
        raise RuntimeError("No header in {}".format(ind))
    # logging.warn is a deprecated alias of logging.warning.
    logging.warning("headers {!r}".format(headers))

    for row in mt_list[1:]:
        if len(row) == 0:
            continue  # skip empty row

        rowdict = {head.value: cell.value for head, cell in zip(headers, row)}
        name, unit = units(rowdict['GHO (DISPLAY)'])
        indID = rowdict['GHO (CODE)']
        for lookup in ("SEX", "RESIDENCEAREATYPE", "EDUCATIONLEVEL",
                       "WEALTHQUINTILE"):
            lookup_code = lookup + " (CODE)"
            lookup_name = lookup + " (DISPLAY)"
            # Only dimensions that are both present and non-empty qualify.
            if rowdict.get(lookup_code):
                indID = indID + "({}={})".format(lookup, rowdict[lookup_code])
                name = name + " - " + rowdict[lookup_name]

        value_dict = {"value": rowdict['Display Value'],
                      "period": rowdict['YEAR (DISPLAY)'],
                      "indID": indID,
                      "region": rowdict["COUNTRY (CODE)"],
                      "dsID": "athena-api",
                      "source": baseurl % ind,
                      "is_number": True}

        indicator_dict = {'indID': indID,
                          'name': name,
                          'units': unit}

        orm.Indicator(**indicator_dict).save()
        orm.Value(**value_dict).save()

Ejemplo n.º 19

0

Mostrar archivo

def doit():
    """Save the EM-DAT dataset, its indicators and per-country values.

    One Indicator is saved per entry of ``targets`` (named from the
    matching index of ``names``; units are 'uno' except for
    'total_damage').  Then, for each country from ``country_list()``, the
    single table at ``url.format(country)`` is loaded and every junction of
    the cells below 'year' with the cells right of 'year' is saved as a
    Value.  The session is committed once at the end.
    """
    # country_cells: we used to assert_one(), but sometimes there's two!

    orm.DataSet(dsID='emdat',
                last_updated=None,
                last_scraped=orm.now(),
                name='EM-DAT').save()

    for index, target in enumerate(targets):
        units = ",000$ USD" if target == 'total_damage' else 'uno'
        orm.Indicator(indID="emdat:%s" % target,
                      name=names[index],
                      units=units).save()

    for country in country_list():  # TODO country_list
        print(country)
        tableset = messytables.any.any_tableset(dl.grab(url.format(country)))
        (messy,) = tableset.tables  # exactly one table expected
        table = xypath.Table.from_messy(messy)
        year_header = table.filter('year').assert_one()
        year_cells = year_header.fill(xypath.DOWN)
        category_cells = year_header.fill(xypath.RIGHT)
        for year, cat, cell in year_cells.junction(category_cells):
            # 'source' records the URL template itself, as in the original.
            orm.Value(dsID='emdat',
                      region=country,
                      indID='emdat:{}'.format(cat.value),
                      period='{}/P1Y'.format(year.value),
                      value=cell.value,
                      source=url,
                      is_number=True).save()
    orm.session.commit()

Ejemplo n.º 20

0

Mostrar archivo

Archivo: athena.py Proyecto: luiscape/dap-scrapers

def do_indicator(ind):
    """Download the CSV for indicator *ind* and save an Indicator/Value per row.

    The CSV is fetched from ``baseurl % ind``; its first row is the header.
    For each non-empty data row the name and unit come from
    ``units(row['GHO (DISPLAY)'])``.  When the CSV has a 'SEX (CODE)'
    column, the ID is "<GHO code>-<sex code>" and " - <sex display>" is
    appended to the name; otherwise the ID is the GHO code alone.  Each
    saved value dict is printed.

    Raises:
        RuntimeError: if the download has no header row.  (Previously the
            interactive ``exit()`` helper was called, which ended the whole
            process with a success status, and an entirely empty download
            raised a bare ``IndexError``.)
    """
    fh = dl.grab(baseurl % ind)
    mt, = messytables.commas.CSVTableSet(fh).tables
    mt_list = list(mt)
    # An empty download has no first row at all; treat it like empty headers.
    headers = mt_list[0] if mt_list else []
    if not headers:
        print(ind)
        raise RuntimeError("No header in {}".format(ind))

    for row in mt_list[1:]:
        if len(row) == 0:
            continue  # skip empty row

        rowdict = {head.value: cell.value for head, cell in zip(headers, row)}
        name, unit = units(rowdict['GHO (DISPLAY)'])
        indID = rowdict['GHO (CODE)']
        if 'SEX (CODE)' in rowdict:
            indID = indID + "-" + rowdict['SEX (CODE)']
            name = name + " - " + rowdict['SEX (DISPLAY)']

        value_dict = {"value": rowdict['Display Value'],
                      "period": rowdict['YEAR (DISPLAY)'],
                      "indID": indID,
                      "region": rowdict["COUNTRY (CODE)"],
                      "dsID": "athena-api",
                      "source": baseurl % ind,
                      "is_number": True}

        indicator_dict = {'indID': indID,
                          'name': name,
                          'units': unit}

        orm.Indicator(**indicator_dict).save()
        orm.Value(**value_dict).save()
        print(value_dict)

Ejemplo n.º 21

0

Mostrar archivo

Archivo: emdat.py Proyecto: OCHA-DAP/dap-scrapers

def doit():
    """Store the EM-DAT dataset record, indicators and country values.

    Saves an Indicator for each item in ``targets`` (name taken from
    ``names`` at the same index, units 'uno' unless the target is
    'total_damage').  For every country returned by ``country_list()`` the
    one table downloaded from ``url.format(country)`` is read: cells below
    'year' are junctioned with cells to its right, and each junction is
    saved as a Value.  Commits after all countries are processed.
    """
    # country_cells: we used to assert_one(), but sometimes there's two!

    orm.DataSet(dsID='emdat',
                last_updated=None,
                last_scraped=orm.now(),
                name='EM-DAT').save()

    for position, target in enumerate(targets):
        indicator = dict(indID="emdat:%s" % target,
                         name=names[position],
                         units='uno')
        if target == 'total_damage':
            indicator['units'] = ",000$ USD"
        orm.Indicator(**indicator).save()

    for country in country_list():  # TODO country_list
        print(country)
        raw = dl.grab(url.format(country))
        (only_table,) = messytables.any.any_tableset(raw).tables
        table = xypath.Table.from_messy(only_table)
        yr = table.filter('year').assert_one()
        junctions = yr.fill(xypath.DOWN).junction(yr.fill(xypath.RIGHT))
        for year_cell, cat_cell, value_cell in junctions:
            # 'source' records the URL template itself, as in the original.
            record = dict(dsID='emdat',
                          region=country,
                          indID='emdat:{}'.format(cat_cell.value),
                          period='{}/P1Y'.format(year_cell.value),
                          value=value_cell.value,
                          source=url,
                          is_number=True)
            orm.Value(**record).save()
    orm.session.commit()

Ejemplo n.º 22

0

Mostrar archivo

Archivo: unodc.py Proyecto: timofonic-otherdevstuff/dap-scrapers

def main():
    """Save the dataset, indicator and all (region, year) values of the sheet.

    After ``save_dataset()`` and ``save_indicator()``, the first table at
    ``SHEET_URL`` is loaded and checked to contain exactly one
    ``IND_NAME`` cell.  Regions are the cells below the region header;
    years are the cells to its right up to (not including) the first empty
    cell, and fewer than 15 of them are expected.  Each junction is saved
    as an ``orm.Value``; the session is committed at the end.
    """
    save_dataset()
    save_indicator()

    tableset = messytables.any.any_tableset(dl.grab(SHEET_URL))
    table = xypath.Table.from_messy(tableset.tables[0])
    table.filter(IND_NAME).assert_one()  # we have the right table

    region_header = table.filter(REGION_HEADER_VALUE).assert_one()
    regions = region_header.fill(xypath.DOWN)
    years = region_header.fill(
        xypath.RIGHT, stop_before=lambda cell: cell.value == '')
    assert len(years) < 15  # left side.

    for region_cell, year_cell, value_cell in regions.junction(years):
        orm.Value(dsID=DSID,
                  region=region_cell.value,
                  indID=INDID,
                  source=SHEET_URL,
                  is_number=True,
                  period=year_cell.value,
                  value=value_cell.value).save()
    orm.session.commit()

Ejemplo n.º 23

0

Mostrar archivo

Archivo: athena.py Proyecto: luiscape/dap-scrapers

def do_indicator(ind):
    """Download the CSV for indicator *ind* and save an Indicator/Value per row.

    The CSV is fetched from ``baseurl % ind``; its first row is the header.
    For each non-empty data row the name and unit come from
    ``units(row['GHO (DISPLAY)'])``.  When the CSV has a 'SEX (CODE)'
    column, the ID is "<GHO code>-<sex code>" and " - <sex display>" is
    appended to the name; otherwise the ID is the GHO code alone.  Each
    saved value dict is printed.

    Raises:
        RuntimeError: if the download has no header row.  (Previously the
            interactive ``exit()`` helper was called, which ended the whole
            process with a success status, and an entirely empty download
            raised a bare ``IndexError``.)
    """
    fh = dl.grab(baseurl % ind)
    mt, = messytables.commas.CSVTableSet(fh).tables
    mt_list = list(mt)
    # An empty download has no first row at all; treat it like empty headers.
    headers = mt_list[0] if mt_list else []
    if not headers:
        print(ind)
        raise RuntimeError("No header in {}".format(ind))

    for row in mt_list[1:]:
        if len(row) == 0:
            continue  # skip empty row

        rowdict = {head.value: cell.value for head, cell in zip(headers, row)}
        name, unit = units(rowdict['GHO (DISPLAY)'])
        indID = rowdict['GHO (CODE)']
        if 'SEX (CODE)' in rowdict:
            indID = indID + "-" + rowdict['SEX (CODE)']
            name = name + " - " + rowdict['SEX (DISPLAY)']

        value_dict = {
            "value": rowdict['Display Value'],
            "period": rowdict['YEAR (DISPLAY)'],
            "indID": indID,
            "region": rowdict["COUNTRY (CODE)"],
            "dsID": "athena-api",
            "source": baseurl % ind,
            "is_number": True
        }

        indicator_dict = {'indID': indID, 'name': name, 'units': unit}

        orm.Indicator(**indicator_dict).save()
        orm.Value(**value_dict).save()
        print(value_dict)

Ejemplo n.º 24

0

Mostrar archivo

Archivo: echo.py Proyecto: OCHA-DAP/dap-scrapers

# Module-level script: registers the two GNA indicators, downloads the
# workbook at ``baseurl`` and prints the rows produced by xyzzy.
indicators = [{"indID": "gna-vi",
              "name": "GNA Vulnerability Index",
              "units": "index"},
             {"indID": "gna-ci",
              "name": "GNA Crisis Index",
              "units": "index"}]

for indicator in indicators:
    orm.Indicator(**indicator).save()

# Fields shared by every value; not referenced again in the visible code.
value_template = {"dsID": "echo",
                  "period": 2012,
                  "source": baseurl,
                  "is_number": True}

xls_raw = dl.grab(baseurl)
# Only the first sheet is used, and it must be the expected one.
mt = messytables.excel.XLSTableSet(xls_raw).tables[0]
assert mt.name == "GNA Final Index (rank)"
xy = xypath.Table.from_messy(mt)
# NOTE(review): ``countries`` is not used in the visible code.
countries = xy.filter("ISO3").assert_one().fill(xypath.DOWN)
vuln_h = xy.filter("GNA Vulnerability Index (VI)").assert_one()
crisis_h = xy.filter("GNA Crisis Index (CI)").assert_one()

# NOTE(review): the result of this call is discarded and the same call is
# repeated inside ``big`` below; confirm whether it is needed.
headerheader(xy.filter("ISO3").assert_one(), xypath.DOWN, xypath.RIGHT)

# Dimension spec passed to xyzzy: a region lookup plus one column of cells
# per indicator ID.
big = {'region': headerheader(xy.filter("ISO3").assert_one(), xypath.DOWN, xypath.RIGHT),
       'indID': {'gna-vi': vuln_h.fill(xypath.DOWN),
                     'gna-ci': crisis_h.fill(xypath.DOWN)}}

# Rows are only printed here; nothing is saved in the visible code.
for olap_row in xypath.xyzzy.xyzzy(xy, big, valuename="value"):
    print olap_row

Ejemplo n.º 25

0

Mostrar archivo

Archivo: emdat.py Proyecto: luiscape/dap-scrapers

import datetime
import itertools
import dl
import xypath
import messytables
import orm

"""Value: dsID, region, indID, period, value, source, is_number
   DataSet: dsID, last_updated, last_scraped, name
   Indicator: indID, name, units
   """

# Module-level download: this runs when the module is imported/executed.
url = 'http://cred01.epid.ucl.ac.be:5317/?after=&before=&agg1=iso&agg2=year&dl=true'
raw = dl.grab(url)
m_tables = messytables.any.any_tableset(raw)
# Unpacking requires the download to contain exactly one table.
mt, = m_tables.tables
# Shared xypath table used by doit() below.
table = xypath.Table.from_messy(mt)

def doit(targets, names, year):
    # country_cells: we used to assert_one(), but sometimes there's two!
    country_cells = table.filter('iso').fill(xypath.DOWN)
    country_cells = country_cells - country_cells.filter('iso')  # remove other
    if not country_cells: print "no countries"
    country_year_filter = country_cells.filter(lambda b: b.shift(xypath.RIGHT).value == year)
    if not country_year_filter: print "no countries for ", year
    target_cells = table.filter(lambda b: b.value in targets)
    if not target_cells: print "didn't find ", targets

    value = {'dsID': 'emdat',
             'period': "%s/P1Y" % (year),
             'source': url,

Ejemplo n.º 26

0

Mostrar archivo

Archivo: emdat.py Proyecto: luiscape/dap-scrapers

import datetime
import itertools
import dl
import xypath
import messytables
import orm
"""Value: dsID, region, indID, period, value, source, is_number
   DataSet: dsID, last_updated, last_scraped, name
   Indicator: indID, name, units
   """

# Module-level download: this runs when the module is imported/executed.
url = 'http://cred01.epid.ucl.ac.be:5317/?after=&before=&agg1=iso&agg2=year&dl=true'
raw = dl.grab(url)
m_tables = messytables.any.any_tableset(raw)
# Unpacking requires the download to contain exactly one table.
mt, = m_tables.tables
# Shared xypath table used by doit() below.
table = xypath.Table.from_messy(mt)


def doit(targets, names, year):
    # country_cells: we used to assert_one(), but sometimes there's two!
    country_cells = table.filter('iso').fill(xypath.DOWN)
    country_cells = country_cells - country_cells.filter('iso')  # remove other
    if not country_cells: print "no countries"
    country_year_filter = country_cells.filter(
        lambda b: b.shift(xypath.RIGHT).value == year)
    if not country_year_filter: print "no countries for ", year
    target_cells = table.filter(lambda b: b.value in targets)
    if not target_cells: print "didn't find ", targets

    value = {
        'dsID': 'emdat',

Ejemplo n.º 27

0

Mostrar archivo

Archivo: echo.py Proyecto: timofonic-otherdevstuff/dap-scrapers

    "indID": "gna-ci",
    "name": "GNA Crisis Index",
    "units": "index"
}]

for indicator in indicators:
    orm.Indicator(**indicator).save()

value_template = {
    "dsID": "echo",
    "period": 2012,
    "source": baseurl,
    "is_number": True
}

xls_raw = dl.grab(baseurl)
mt = messytables.excel.XLSTableSet(xls_raw).tables[0]
assert mt.name == "GNA Final Index (rank)"
xy = xypath.Table.from_messy(mt)
countries = xy.filter("ISO3").assert_one().fill(xypath.DOWN)
vuln_h = xy.filter("GNA Vulnerability Index (VI)").assert_one()
crisis_h = xy.filter("GNA Crisis Index (CI)").assert_one()

headerheader(xy.filter("ISO3").assert_one(), xypath.DOWN, xypath.RIGHT)

big = {
    'region':
    headerheader(xy.filter("ISO3").assert_one(), xypath.DOWN, xypath.RIGHT),
    'indID': {
        'gna-vi': vuln_h.fill(xypath.DOWN),
        'gna-ci': crisis_h.fill(xypath.DOWN)