Ejemplo n.º 1
0
    def test_pandas(self):
        """Open a local package, count the rows of one resource and show its dataframe.

        Reads the 'country_gdp' resource, prints how many rows iterating it
        yields, then prints the head of the dataframe built from the same
        resource.
        """

        package_dir = '/Volumes/Storage/proj/virt-proj/metatab3/metatab-packages/civicknowledge.com/immigration-vs-gdp'

        resource = open_package(package_dir).first_resource(name='country_gdp')

        # Iterate the resource fully before building the dataframe
        row_count = len(list(resource))
        print(row_count)

        frame = resource.dataframe()
        print(frame.head())
Ejemplo n.º 2
0
def make_zip_map():
    """Create a function that maps a zip code and an id number to an ACS tract.

    The HUD zip-tract crosswalk is used as a probability map: within each zip
    the tracts are ordered by residential ratio, and the last two digits of
    the id select a point in the cumulative ratio. Using the id (e.g. a
    facility id) rather than a random number makes the mapping stable.

    Returns:
        A function ``lookup(zip, n)`` that returns the ACS tract string for
        the zip code ``zip`` and id number ``n``, or None when a KeyError is
        raised during the lookup (for instance, a zip that is not in the
        crosswalk).
    """
    import numpy as np

    zip_xwalk_doc = mt.open_package('http://library.metatab.org/huduser.gov-zip_tract-2016-2.csv')
    zip_xwalk_df = zip_xwalk_doc.resource('zip-tract').dataframe()
    zx_groups = zip_xwalk_df.sort_values('res_ratio').groupby('zip')

    def make_single_zip_map_f(groups, zip):
        """Create a closure that maps an id value to a tract, for a single zip."""

        group = groups.get_group(zip)

        # Cumulative residential ratios: the portion of the homes in the zip
        # that are in each tract, accumulated in ascending order. Built once
        # per zip rather than on every lookup.
        res_ratios = np.asarray(list(group.res_ratio.cumsum()))
        tracts = list(group.tract)

        assert len(res_ratios) == len(tracts)

        def _f(id):
            # Use the end of the ID value to ensure repeatability
            n = float(id % 100) / 100.0
            # First tract whose cumulative ratio exceeds n. np.argmax returns
            # 0 when no entry exceeds n, so the first tract is the fallback.
            index = int(np.argmax(res_ratios > n))

            return tracts[index]

        return _f

    # For each zip, the function that returns a tract for an id number.
    f_map = {zp: make_single_zip_map_f(zx_groups, zp) for zp in zx_groups.groups}

    # Finally, put it all together in a single closure.
    def lookup(zip, n):

        try:
            # The per-zip function reduces the id to its last two digits
            # itself, so the integer id is passed through unscaled.
            #
            # The map will return a Census geoid, which has 11 characters, but
            # it is often missing the leading 0, so we have to put it back.
            # Then it must be converted to an ACS Tract.
            census_tract_str = str(f_map[int(zip)](int(n))).zfill(11)
            return str(AcsTract.parse(census_tract_str))
        except KeyError:
            return None

    return lookup
Ejemplo n.º 3
0
def get_resource_urls(doc):
    """Collect the URLs of the distributions of a document, keyed by file name.

    ZIP distributions are skipped. XLSX and CSV distributions are added under
    the base name of their package URL. For CSV distributions the package is
    also opened and each of its resources is added under ``<name>.csv``.

    Args:
        doc: Document whose 'Root.Distribution' terms are examined.

    Returns:
        dict mapping a file name to a URL.
    """

    resources = {}

    for dist in doc.find("Root.Distribution"):

        package_url, _ = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            prt("Skipping ZIP package ", package_url)

        elif u.resource_format == 'xlsx':
            resources[basename(package_url)] = package_url
            prt("Adding XLS package ", package_url)

        elif u.resource_format == 'csv':

            resources[basename(package_url)] = package_url

            prt("Adding CSV package {}".format(basename(package_url)))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))
                # Only reached if err() returns; never fall through to a
                # package left over from an earlier distribution.
                continue

            for r in p.resources():
                # '.csv': Data>world currently get the format from the name, not the URL
                resources[r.name + '.csv'] = r.resolved_url
                prt("Adding CSV resource {}".format(r.name))
        else:
            prt('Skipping {}'.format(package_url))

    return resources
Ejemplo n.º 4
0
    def test_metapack(self):
        """Check that metatab+ URLs resolve to the expected package and resource URLs.

        Resolves the package and metadata URLs for a 'metatab+http' reference,
        checks the resolved URL of the 'random-names' resource, and then
        verifies that the directory, ZIP and XLSX forms of the package each
        yield 101 rows.
        """

        from metatab import open_package, resolve_package_metadata_url

        cache = cache_fs()

        url = 'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names'

        rg = RowGenerator(cache=cache, url=url)

        package_url, metadata_url = resolve_package_metadata_url(
            rg.generator.spec.resource_url)

        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/',
            package_url)
        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/metadata.csv',
            metadata_url)

        doc = open_package(rg.generator.spec.resource_url, cache=cache)

        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/data/random-names.csv',
            doc.resource('random-names').resolved_url)

        urls = [
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names',
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.zip#random-names',
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.xlsx#random-names'
        ]

        for url in urls:
            gen = None
            try:
                gen = RowGenerator(cache=cache, url=url)

                rows = list(gen)

                self.assertEqual(101, len(rows))
            except Exception:
                # Report which URL failed before letting the failure propagate
                print("ERROR URL", url)
                print("Row Generator ", gen)
                raise
Ejemplo n.º 5
0
def get_ave_weight(state):
    """Return the average weight parameter for a state.

    Args:
        state: State FIPS code, in any form that ``int()`` accepts.

    Returns:
        The ``average_weight`` value, as an int, of the first row of the
        'ave_weights' resource whose ``fips_state_code`` equals ``state``,
        or None if no row matches.
    """
    import metatab as mt

    doc = mt.open_package(
        'http://s3.amazonaws.com/library.metatab.org/census.gov-varrep_tables_support-2011e2015-1.csv'
    )

    r = doc.resource('ave_weights')

    for row in r.iterdict:

        try:
            if int(row['fips_state_code']) == int(state):
                return int(row['average_weight'])
        except TypeError:
            # A value that int() cannot accept (such as None); skip the row
            continue

    return None
Ejemplo n.º 6
0
def get_k_val_f():
    """Return a function that maps from population to k_values.

    The rows of the 'k_values' resource are loaded once, when this function
    is called, and are shared by every call of the returned function.

    Returns:
        A function ``f(population)`` that returns the ``k_value`` of the first
        row whose range contains ``population``. A row with a ``range_end`` of
        None has no upper bound. When no row matches, the ``k_value`` of the
        last row is returned. ``f`` raises ValueError if the resource had no
        rows at all.
    """
    import metatab as mt

    doc = mt.open_package(
        'http://s3.amazonaws.com/library.metatab.org/census.gov-varrep_tables_support-2011e2015-1.csv'
    )

    r = doc.resource('k_values')

    rows = [dict(e.items()) for e in r.iterdict]

    def f(population):
        for row in rows:
            above_start = row['range_start'] <= population
            below_end = row['range_end'] is None or population <= row['range_end']

            if above_start and below_end:
                return row['k_value']

        if not rows:
            # Fail with a clear message instead of an unbound-variable error
            raise ValueError("The 'k_values' resource has no rows")

        # No range matched: fall back to the last row
        return rows[-1]['k_value']

    return f
Ejemplo n.º 7
0
# Map each facility number to the zip code recorded for that facility.
fac_zip = {}

facilities = doc.resource('facilities')

for row in facilities.iterdict:
    fac_zip[row['facility_number']] = row['facility_zip']

##
## Load old geocodes, to save time and remote resources
##

old_name = doc.as_version('-1').find_first_value('Root.Name')
old_url = 'http://s3.amazonaws.com/library.metatab.org/{}.csv'.format(old_name)

# Old geocode rows, keyed by their integer unique id.
old_geo = {}

try:
    old_geocodes = mt.open_package(old_url).resource('geocodes')

    for row in old_geocodes.iterdict:

        try:
            ui = int(row['unique_id'])
        except ValueError:
            # Erroneous rows have 'unique_id' == 'Unique'
            continue

        row['unique_id'] = ui
        old_geo[ui] = row
except (SourceError, MetatabError) as e:
    print("Failed to load old geocodes", e, file=sys.stderr)


# Column names of the geocoder output; latitude and longitude share one column.
geocoder_header = [
    'unique_id', 'input_address', 'match', 'quality', 'match_address',
    'latlon', 'tiger_id', 'side_of_street', 'state_fips', 'county_fips',
    'tract_fips', 'block_fips',
]

# Column names of the output; 'lat' and 'lon' are separate and 'tract_geoid' is added.
out_header = [
    'unique_id', 'input_address', 'match', 'quality', 'match_address',
    'lat', 'lon', 'tiger_id', 'side_of_street', 'state_fips', 'county_fips',
    'tract_fips', 'block_fips', 'tract_geoid',
]
Ejemplo n.º 8
0
#
#
#

import metatab

# Open the package in the parent directory and take its 'sra' resource.
doc = metatab.open_package('..')
r = doc.resource('sra')

# Print the name of every row along with the bounds of its geometry's shape.
for row in r.iterdict:
    name, bounds = row['name'], row['geometry'].shape.bounds
    print(name, bounds)
Ejemplo n.º 9
0
def send_to_ckan(m):
    """Create or update the CKAN dataset for a Metatab document.

    The document is read from ``m.mt_file``. The CKAN dataset is looked up by
    the document's 'Root.Ckanid' value or, failing that, by the document name
    with dots replaced by dashes; it is created when CKAN reports it as not
    found. Title, notes, version, groups, tags, owner organization, extras and
    resources are then set from the document and the dataset is updated.
    Finally the CKAN id, organization and groups are written back into the
    document, which is saved to ``m.mt_file``.

    Args:
        m: Object providing ``mt_file``, ``cache``, ``ckan_url`` and
            ``api_key``.
    """

    from ckanapi import RemoteCKAN, NotFound

    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))
        # Only reached if err() returns; without a document there is
        # nothing to send.
        return

    c = RemoteCKAN(m.ckan_url, apikey=m.api_key)

    ckanid = doc.find_first_value('Root.Ckanid')
    identifier = doc.find_first_value('Root.Identifier')
    name = doc.find_first('Root.Name')

    ckan_name = name.value.replace('.', '-')

    id_name = ckanid or ckan_name

    try:
        pkg = c.action.package_show(name_or_id=id_name)
        prt("Updating CKAN dataset for '{}'".format(ckan_name))
    except NotFound:
        pkg = c.action.package_create(name=ckan_name, package_id=identifier)
        prt("Adding CKAN dataset for '{}'".format(ckan_name))

    # Fall back to the description when the document has no title
    pkg['title'] = doc.find_first_value('Root.Title')

    if not pkg['title']:
        pkg['title'] = doc.find_first_value('Root.Description')

    try:
        pkg['notes'] = doc.markdown  #doc.find_first_value('Root.Description')
    except OSError as e:
        warn(e)

    pkg['version'] = name.properties.get('version')

    pkg['groups'] = [{'name': g.value} for g in doc['Root'].find('Root.Group')]

    pkg['tags'] = [{'name': g.value} for g in doc['Root'].find('Root.Tag')]

    org_name = name.get('Origin', doc['Root'].find_first_value('Root.CkanOrg'))

    org_name_slug = None

    if org_name:
        # The 'Origin' lookup is expected to give an object with a .value,
        # while the find_first_value() fallback is used as a plain value
        # elsewhere in this function; accept either.
        org_name_slug = getattr(org_name, 'value', org_name).replace('.', '-')
        try:
            owner_org = c.action.organization_show(id=org_name_slug).get('id')
            pkg['owner_org'] = owner_org
        except NotFound:
            warn("Didn't find org for '{}'; not setting organization ".format(
                org_name_slug))
            org_name_slug = None

    # Every Root term except the distributions, plus the children of the
    # name term, is sent to CKAN as an extra.
    extras = {}

    for t in doc.find('*.*', section='Root'):
        if not t.term_is('Root.Distribution'):
            extras[t.qualified_term] = t.value

    for t in name.children:
        extras[t.qualified_term] = t.value

    pkg['extras'] = [{'key': k, 'value': v} for k, v in extras.items()]

    resources = []

    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            d = dict(url=package_url,
                     name=basename(package_url),
                     format='ZIP',
                     mimetype=mimetypes.guess_type(package_url)[0],
                     description='ZIP version of package')
            resources.append(d)
            prt("Adding ZIP package ", d['name'])

        elif u.resource_format == 'xlsx':
            d = dict(url=package_url,
                     name=basename(package_url),
                     format='XLSX',
                     mimetype=mimetypes.guess_type(package_url)[0],
                     description='Excel version of package')
            resources.append(d)
            prt("Adding XLS package ", d['name'])

        elif u.resource_format == 'csv':

            d = dict(url=package_url,
                     name=basename(package_url),
                     format='csv',
                     mimetype=mimetypes.guess_type(metadata_url)[0],
                     description='CSV Package Metadata in Metatab format')

            resources.append(d)
            prt("Adding {} package {}".format(d['format'], d['name']))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))
                # Only reached if err() returns; never fall through to a
                # package left over from an earlier distribution.
                continue

            for r in p.resources():

                mimetype = mimetypes.guess_type(r.resolved_url)[0]

                # Extension without the leading dot, or None when the type
                # or its extension is unknown.
                ext = mimetypes.guess_extension(mimetype) if mimetype else None
                ext = ext[1:] if ext else None

                d = dict(name=r.name,
                         format=ext,
                         url=r.resolved_url,
                         mimetype=mimetype,
                         description=r.markdown)

                resources.append(d)
                prt("Adding {} resource {}".format(d['format'], d['name']))

    pkg['resources'] = resources

    c.action.package_update(**pkg)

    # Re-read the dataset so the values written back below come from CKAN
    pkg = c.action.package_show(name_or_id=ckan_name)

    update_dist(doc, [], join(m.ckan_url, 'dataset', ckan_name))

    ##
    ## Add a term with CKAN info.

    doc['Root'].get_or_new_term('CkanId', pkg['id'])

    if org_name_slug is None and pkg.get('organization'):
        doc['Root'].get_or_new_term('CkanOrg', (pkg.get('organization')
                                                or {}).get('name'))

    # Replace the document's groups with the ones CKAN reports. Copy the
    # result first so that removal does not disturb the iteration.
    for g in list(doc['Root'].find('Group')):
        doc.remove_term(g)

    for group in pkg.get('groups', []):
        doc['Root'].new_term('Group', group['name'])

    write_doc(doc, m.mt_file)