def test_pandas(self):
    """Smoke-test building a pandas dataframe from a local Metatab package."""
    pkg_path = '/Volumes/Storage/proj/virt-proj/metatab3/metatab-packages/civicknowledge.com/immigration-vs-gdp'

    pkg = open_package(pkg_path)
    resource = pkg.first_resource(name='country_gdp')

    # Materialize the rows first, then build the dataframe from the same resource.
    all_rows = list(resource)
    print(len(all_rows))

    frame = resource.dataframe()
    print(frame.head())
def make_zip_map():
    """Create a map from zip to tract that uses the HUD zip-tract cross walk as a
    probability map, with the facility id used as the probability. Using the
    facility ID makes the mapping stable.

    Returns a ``lookup(zip, n)`` closure mapping a zip code plus an id number to
    an ACS tract string, or None when the zip is not in the cross walk.
    """

    zip_xwalk_doc = mt.open_package('http://library.metatab.org/huduser.gov-zip_tract-2016-2.csv')
    zip_xwalk = zip_xwalk_doc.resource('zip-tract')
    zip_xwalk_df = zip_xwalk.dataframe()

    # Sorting by res_ratio makes the per-zip cumulative sum below a monotone CDF.
    zx_groups = zip_xwalk_df.sort_values('res_ratio').groupby('zip')

    def make_single_zip_map_f(groups, zip_code):
        """Create a closure for mapping, for a single zip, an id value to a tract.

        Fixed: use the ``groups`` parameter — the original ignored it and closed
        over the outer ``zx_groups`` instead (same object here, so behavior is
        unchanged, but the parameter now actually matters).
        """
        import numpy as np
        import pandas as pd

        # Use the residential ratios, the portion of the homes in the zip that
        # are in each tract, accumulated into a CDF.
        res_ratios = list(groups.get_group(zip_code).cumsum().res_ratio)
        tracts = list(groups.get_group(zip_code).tract)

        assert len(res_ratios) == len(tracts)

        def _f(id):
            # Use the end of the ID value to ensure repeatability.
            n = float(id % 100) / 100.0
            # argmax of the boolean series returns the first index whose
            # cumulative ratio exceeds n.
            index = np.argmax(pd.Series(res_ratios) > n)
            return tracts[index]

        return _f

    # Dict that returns, for each zip, the function to get a tract for the id number.
    f_map = {}
    for zp in zx_groups.groups.keys():
        f_map[zp] = make_single_zip_map_f(zx_groups, zp)

    # Finally, put it all together in a single closure.
    def lookup(zip, n):
        try:
            # The map will return a Census geoid, which has 11 characters, but it
            # is often missing the leading 0, so we have to put it back. Then it
            # must be converted to an ACS Tract.
            # NOTE(review): the argument is scaled here (int(n) % 100 / 100.0)
            # AND again inside _f (id % 100 / 100.0) — confirm the double
            # scaling is intended; left unchanged to preserve behavior.
            census_tract_str = str(f_map[int(zip)](int(n) % 100 / 100.0)).zfill(11)
            return str(AcsTract.parse(census_tract_str))
        except KeyError:
            return None

    return lookup
def get_resource_urls(doc):
    """Collect a map of resource names to URLs for every Root.Distribution term
    in a Metatab document.

    ZIP distributions are skipped. XLSX and CSV package URLs are keyed by their
    basename; for CSV packages, each resource inside the package is also added,
    keyed as ``<name>.csv`` (Data.world currently gets the format from the name,
    not the URL).
    """
    resources = {}

    for dist in doc.find("Root.Distribution"):
        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            prt("Skipping ZIP package ", package_url)

        elif u.resource_format == 'xlsx':
            resources[basename(package_url)] = package_url
            prt("Adding XLS package ", package_url)

        elif u.resource_format == 'csv':
            resources[basename(package_url)] = package_url
            prt("Adding CSV package {}".format(basename(package_url)))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))
                # Guard: if err() returns instead of exiting, `p` would be
                # unbound below — skip this distribution.
                continue

            # NOTE: the original computed a mimetype/extension here behind a
            # bare `except:` but never used the result; that dead code (and
            # the bare except) has been removed.
            for r in p.resources():
                # '.csv': Data.world currently gets the format from the name,
                # not the URL.
                resources[r.name + '.csv'] = r.resolved_url
                prt("Adding CSV resource {}".format(r.name))

        else:
            prt('Skipping {}'.format(package_url))

    return resources
def test_metapack(self):
    """Round-trip metatab+ URLs through RowGenerator and open_package."""
    from metatab import open_package, resolve_package_metadata_url

    cache = cache_fs()

    url = 'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names'

    rg = RowGenerator(cache=cache, url=url)

    package_url, metadata_url = resolve_package_metadata_url(
        rg.generator.spec.resource_url)

    # assertEquals is a deprecated alias of assertEqual; use the modern name.
    self.assertEqual(
        'http://library.metatab.org/example.com-simple_example-2017-us-1/',
        package_url)
    self.assertEqual(
        'http://library.metatab.org/example.com-simple_example-2017-us-1/metadata.csv',
        metadata_url)

    doc = open_package(rg.generator.spec.resource_url, cache=cache)

    self.assertEqual(
        'http://library.metatab.org/example.com-simple_example-2017-us-1/data/random-names.csv',
        doc.resource('random-names').resolved_url)

    # The same package should be readable as a directory, zip and xlsx package.
    urls = [
        'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names',
        'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.zip#random-names',
        'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.xlsx#random-names'
    ]

    for url in urls:
        gen = None
        try:
            gen = RowGenerator(cache=cache, url=url)
            rows = list(gen)
            self.assertEqual(101, len(rows))
        except Exception:
            # Narrowed from a bare except; the failing URL is logged and the
            # exception is still re-raised so the test fails.
            print("ERROR URL", url)
            print("Row Generator ", gen)
            raise
def get_ave_weight(state):
    """Return the average weight parameter for a state.

    Returns None (implicitly) when the state's FIPS code is not found in the
    ``ave_weights`` resource.
    """
    import metatab as mt

    doc = mt.open_package(
        'http://s3.amazonaws.com/library.metatab.org/census.gov-varrep_tables_support-2011e2015-1.csv'
    )

    r = doc.resource('ave_weights')

    for row in r.iterdict:
        try:
            if int(row['fips_state_code']) == int(state):
                return int(row['average_weight'])
        except (TypeError, ValueError):
            # int() raises TypeError for None and ValueError for non-numeric
            # strings; the original only caught TypeError, so a garbage row
            # would crash the lookup. (Also removed an unused `d = {}` local.)
            continue
def get_k_val_f():
    """Return a function that maps from population to k_values"""
    import metatab as mt

    # Remote Metatab package holding the k_values lookup table.
    doc = mt.open_package(
        'http://s3.amazonaws.com/library.metatab.org/census.gov-varrep_tables_support-2011e2015-1.csv'
    )

    r = doc.resource('k_values')

    # Materialize the rows once so the returned closure never re-reads the
    # remote resource.
    rows = list(dict(e.items()) for e in r.iterdict)

    def f(population):
        # Find the first row whose [range_start, range_end] interval contains
        # the population; a None range_end means the range is open-ended.
        for row in rows:
            if row['range_start'] <= population and (
                    row['range_end'] is None or population <= row['range_end']):
                return row['k_value']
        else:
            # for/else: no range matched, so fall back to the last row's
            # k_value. NOTE(review): reconstructed from collapsed source as a
            # loop-else; confirm the fallback-to-last-row behavior is intended.
            return row['k_value']

    return f
# Build a facility-number -> facility-zip lookup from the 'facilities'
# resource of the (externally defined) `doc` package.
fac_zip = {}
for row in doc.resource('facilities').iterdict:
    fac_zip[row['facility_number']] = row['facility_zip']

##
## Load old geocodes, to save time and remote resources
##

# Previous package version ('-1') supplies already-computed geocodes so we can
# avoid re-geocoding rows we have seen before.
old_url='http://s3.amazonaws.com/library.metatab.org/{}.csv'.format(doc.as_version('-1').find_first_value('Root.Name'))

old_geo = {}

try:
    for row in mt.open_package(old_url).resource('geocodes').iterdict:
        try:
            # Normalize unique_id to int and key the cache on it.
            ui = int(row['unique_id'])
            row['unique_id'] = ui
            old_geo[ui] = row
        except ValueError:
            # Erroneous rows have 'unique_id' == 'Unique' (a repeated header).
            pass
except (SourceError, MetatabError) as e:
    # Best effort: missing/unreadable old package just means no cache.
    print("Failed to load old geocodes", e, file=sys.stderr)

# Column names for raw Census geocoder output and for the processed output
# (latlon split into lat/lon, plus a derived tract_geoid).
geocoder_header = 'unique_id input_address match quality match_address latlon tiger_id side_of_street state_fips county_fips tract_fips block_fips'.split()
out_header = 'unique_id input_address match quality match_address lat lon tiger_id side_of_street state_fips county_fips tract_fips block_fips tract_geoid'.split()
# Print the bounding box of each SRA geometry from the package one level up.
import metatab

pkg = metatab.open_package('..')
sra = pkg.resource('sra')

for record in sra.iterdict:
    print(record['name'], record['geometry'].shape.bounds)
def send_to_ckan(m):
    """Create or update a CKAN dataset from a Metatab document, then write CKAN
    identifiers and groups back into the document.

    `m` carries the CLI context: mt_file, cache, ckan_url, api_key.
    """
    from ckanapi import RemoteCKAN, NotFound

    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        # err() presumably aborts the program — TODO confirm, since `doc` is
        # used unconditionally below.
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))

    c = RemoteCKAN(m.ckan_url, apikey=m.api_key)

    ckanid = doc.find_first_value('Root.Ckanid')
    # NOTE(review): 'Identitfier' looks like a typo for 'Identifier', but it is
    # a runtime term name — left untouched; verify against the term definitions.
    identifier = doc.find_first_value('Root.Identitfier')
    name = doc.find_first('Root.Name')
    # CKAN names may not contain dots.
    ckan_name = name.value.replace('.', '-')

    # Prefer an existing CKAN id; fall back to the slugged name.
    id_name = ckanid or ckan_name

    try:
        pkg = c.action.package_show(name_or_id=id_name)
        prt("Updating CKAN dataset for '{}'".format(ckan_name))
    except NotFound:
        pkg = c.action.package_create(name=ckan_name, package_id=identifier)
        prt("Adding CKAN dataset for '{}'".format(ckan_name))

    pkg['title'] = doc.find_first_value('Root.Title')

    if not pkg['title']:
        pkg['title'] = doc.find_first_value('Root.Description')

    try:
        pkg['notes'] = doc.markdown  # doc.find_first_value('Root.Description')
    except OSError as e:
        warn(e)

    pkg['version'] = name.properties.get('version')

    pkg['groups'] = [{'name': g.value} for g in doc['Root'].find('Root.Group')]
    pkg['tags'] = [{'name': g.value} for g in doc['Root'].find('Root.Tag')]

    # NOTE(review): this helper is never called and its try-body is a bare
    # `return` — it looks unfinished; left byte-identical.
    def get_org(name):
        if not name:
            return None
        try:
            return
        except NotFound:
            return None

    # Organization: Origin property of the Name term, falling back to CkanOrg.
    org_name = name.get('Origin', doc['Root'].find_first_value('Root.CkanOrg'))

    if org_name:
        org_name_slug = org_name.value.replace('.', '-')
        try:
            owner_org = c.action.organization_show(id=org_name_slug).get('id')
            pkg['owner_org'] = owner_org
        except NotFound:
            warn("Didn't find org for '{}'; not setting organization ".format(
                org_name_slug))
            org_name_slug = None
    else:
        org_name_slug = None

    # Every Root-section term except distributions, plus the Name term's
    # children, becomes a CKAN extra.
    extras = {}

    for t in doc.find('*.*', section='Root'):
        if not t.term_is('Root.Distribution'):
            extras[t.qualified_term] = t.value

    for t in name.children:
        extras[t.qualified_term] = t.value

    pkg['extras'] = [{'key': k, 'value': v} for k, v in extras.items()]

    resources = []

    # One CKAN resource per distribution; CSV packages also contribute each of
    # their internal resources.
    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            d = dict(
                url=package_url,
                name=basename(package_url),
                format='ZIP',
                mimetype=mimetypes.guess_type(package_url)[0],
                description='ZIP version of package'
            )
            resources.append(d)
            prt("Adding ZIP package ", d['name'])

        elif u.resource_format == 'xlsx':
            d = dict(
                url=package_url,
                name=basename(package_url),
                format='XLSX',
                mimetype=mimetypes.guess_type(package_url)[0],
                description='Excel version of package'
            )
            resources.append(d)
            prt("Adding XLS package ", d['name'])

        elif u.resource_format == 'csv':
            # Note: format is lowercase 'csv' here (vs 'ZIP'/'XLSX' above) and
            # the mimetype is guessed from metadata_url, not package_url.
            d = dict(
                url=package_url,
                name=basename(package_url),
                format='csv',
                mimetype=mimetypes.guess_type(metadata_url)[0],
                description='CSV Package Metadata in Metatab format'
            )
            resources.append(d)
            prt("Adding {} package {}".format(d['format'], d['name']))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                # NOTE(review): if err() returns instead of exiting, `p` below
                # is unbound — confirm err() aborts.
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))

            for r in p.resources():

                mimetype = mimetypes.guess_type(r.resolved_url)[0]

                try:
                    ext = mimetypes.guess_extension(mimetype)[1:]
                except:
                    # NOTE(review): bare except — guess_extension raises when
                    # mimetype is None; a narrower handler would be safer.
                    ext = None

                d = dict(
                    name=r.name,
                    format=ext,
                    url=r.resolved_url,
                    mimetype=mimetype,
                    description=r.markdown
                )

                resources.append(d)
                prt("Adding {} resource {}".format(d['format'], d['name']))

    pkg['resources'] = resources

    c.action.package_update(**pkg)

    # Re-fetch so we pick up server-assigned fields (id, organization, groups).
    pkg = c.action.package_show(name_or_id=ckan_name)

    update_dist(doc, [], join(m.ckan_url, 'dataset', ckan_name))

    ##
    ## Add a term with CKAN info.

    doc['Root'].get_or_new_term('CkanId', pkg['id'])

    if org_name_slug is None and pkg.get('organization'):
        doc['Root'].get_or_new_term('CkanOrg', (pkg.get('organization') or {}).get('name'))

    # Replace the document's Group terms with the groups CKAN reports.
    groups = doc['Root'].find('Group')

    for g in groups:
        doc.remove_term(g)

    for group in pkg.get('groups', []):
        doc['Root'].new_term('Group', group['name'])

    write_doc(doc, m.mt_file)