def write_hashes(m):
    pm = last_build_marker_path(m)

    hashes = {}

    if pm.exists():
        hashes['last_package'] = pm.read_text()

        p = open_package(hashes['last_package'])

        hashes['last_hashes'] = {r.name: r.raw_row_generator.hash for r in p.resources()}

    tm = trial_build_marker_path(m)

    if tm.exists():
        hashes['trial_package'] = tm.read_text()

        p = open_package(hashes['trial_package'])

        hashes['trial_hashes'] = {r.name: r.raw_row_generator.hash for r in p.resources()}

    hp = Path(m.package_root.fspath, '.hashes.yaml')

    hp.write_text(yaml.safe_dump(hashes))
def find_csv_packages(m, downloader):
    """Locate the built CSV package, which will have distributions if it
    was generated as an S3 package"""
    from metapack_build.package import CsvPackageBuilder

    pkg_dir = m.package_root
    name = m.doc.get_value('Root.Name')

    package_path, cache_path = CsvPackageBuilder.make_package_path(pkg_dir, name)

    if package_path.exists():
        return open_package(package_path, downloader=downloader)

    # Fall back to the most recently created CSV package in the build directory
    pkgs = list(reversed(sorted(
        (f.stat().st_ctime, f) for f in sorted(pkg_dir.fspath.glob('*.csv')))))

    if pkgs:
        return open_package(pkgs[0][1], downloader=downloader)

    return None
def compare_hashes(m):
    from metapack import open_package

    hp = Path(m.package_root.fspath, '.hashes.yaml')

    if not hp.exists():
        return None

    hashes = yaml.safe_load(hp.read_text())

    pm = last_build_marker_path(m)

    diffs = 0

    if pm.exists():
        p = open_package(pm.read_text())

        for r in p.resources():
            h1 = r.raw_row_generator.hash
            h2 = hashes['last_hashes'].get(r.name)

            if h1 != h2:
                diffs += 1

    return bool(diffs)
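Together, write_hashes and compare_hashes form a change-detection round trip: record each resource's row-generator hash after a build, then compare against the recorded values on the next run. A minimal driver sketch, assuming `m` is the same build-memo object the functions above already receive; the helper name is hypothetical:

def needs_rebuild(m):
    # compare_hashes() returns None when no .hashes.yaml has been written
    # yet; treat that the same as "changed".
    changed = compare_hashes(m)
    return True if changed is None else changed

# After a successful build, record the new hashes for the next comparison:
# write_hashes(m)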
def expand_refs(r):
    from metapack import open_package, Resource
    from pathlib import Path

    if isinstance(r, Resource):
        yield r.doc, r
        return

    if isinstance(r, (list, tuple)):  # Ought to be any iterable type
        for e in r:
            yield from expand_refs(e)
        return

    pkg = open_package(r)

    if not pkg.resources():
        # Metatab can open a lot of normal files, and try to interpret them,
        # without errors, but the files won't have any resources. So, just
        # assume it is a file with a list of packages.
        for line in Path(r).open().readlines():
            yield from expand_refs(line.strip())
        return

    if pkg.default_resource:
        yield from expand_refs(pkg.resource(pkg.default_resource))
    else:
        for r in pkg.resources():
            if r.resolved_url.proto == 'metapack':
                try:
                    yield pkg, r.resolved_url.resource
                except AttributeError:
                    yield from r.resolved_url.doc.resources()
            else:
                yield pkg, r
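Whatever the input form, a Resource, a list, a package reference, or a plain text file listing packages, expand_refs generally normalizes everything to (document, resource) pairs. A hedged usage sketch; the package URL is hypothetical:

# Each yielded pair is (package document, resource), regardless of input form.
for pkg, resource in expand_refs('http://example.com/example-package.csv'):
    print(pkg.name, resource.name)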
def build_clusters(c):
    pkg_dir = str(Path(__file__).parent.resolve())
    pkg = mp.open_package(pkg_dir)
    points_logger.info(f"Pkg dir: {pkg_dir}")
    pylib.build_clusters(pkg)
def set_descriptions(c):
    """Set the descriptions of the columns from the upstream data source"""
    pkg = mp.open_package('.')

    r = pkg.resource('distressed_counties')
    ur = pkg.reference('ffeic_distressed')  # Upstream resource

    upkg = ur.resolved_url.package_url.doc  # Upstream package

    ucols = upkg.resource(ur.resource.name).columns()

    dmap = {c['header']: c['description'] for c in ucols}

    def get_desc(h):
        # Match upstream headers as prefixes, then dispatch on the column
        # suffix; '_pop_pct' is checked before '_pop' so the more specific
        # suffix wins.
        for k, v in dmap.items():
            if h.startswith(k):
                if '_pop_pct' in h:
                    return "Percentage of population in county in tracts with flag: " + v
                elif '_pop' in h:
                    return "Population in county in tracts with flag: " + v
                elif '_pct' in h:
                    return "Percentage of tracts in county with flag: " + v
                else:
                    return "Count of tracts in county with flag: " + v

    for c in r.schema_term.find('Table.Column'):
        desc = get_desc(c.name)

        if desc and not c['description']:
            c['description'] = desc

    pkg.write()
def build(c, force=None):
    """Build a filesystem package."""
    sys.path.append(str(Path(__file__).parent.resolve()))

    import pylib
    import logging
    from pylib import logger

    try:
        logging.basicConfig()
        logger.setLevel(logging.INFO)

        pkg_dir = str(get_pkg_dir())

        print(f"Pkg dir: {pkg_dir}")

        pkg = mp.open_package(pkg_dir)

        ex = pylib.ExtractManager(pkg)
        ex.build(force)

        mp_build(c, force)
    finally:
        sys.path.pop()  # So other packages won't get this pylib
def publish(c, s3_bucket=None, wp_site=None, groups=[], tags=[]):
    """Publish to s3 and wordpress, if the proper bucket and site variables
    are defined.

    If the package should not be published, add a 'Redistribution' Term to
    the root level of the metadata. It can take two values:

        "hidden": publish to S3, but not wordpress
        "private": Don't publish to either S3 or Wordpress
    """
    wp_site = c.metapack.wp_site or wp_site
    groups = c.metapack.groups or groups
    tags = c.metapack.tags or tags

    group_flags = ' '.join([f"-g{g}" for g in groups])
    tag_flags = ' '.join([f"-t{t}" for t in tags])

    pkg = open_package('./metadata.csv')

    redist = pkg.find_first_value('Root.Redistribution')
    name = pkg.name

    s3(c, s3_bucket=s3_bucket)

    if redist in ('private', 'hidden'):
        print(f"⚠️ Package {name} is {redist}; won't publish to wordpress")
    elif wp_site:
        c.run(f"mp wp -s {wp_site} {group_flags} {tag_flags} -p", pty=True)

    if redist not in ('private', 'hidden') and not s3_bucket and not wp_site:
        print("⚠️ Neither s3 bucket nor wp site config specified; nothing to do")
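Both publish() and the s3() task below gate on the same Root.Redistribution value, so the policy check can be read in isolation. A minimal sketch of that gate, using only the metapack calls already shown above; the helper name is hypothetical:

# Hypothetical helper naming the gate shared by publish() and s3():
# returns 'private', 'hidden', or None.
def redistribution_policy(pkg):
    return pkg.find_first_value('Root.Redistribution')

# policy = redistribution_policy(open_package('./metadata.csv'))
# 'private' -> skip S3 and Wordpress; 'hidden' -> S3 only; None -> both.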
def compare_hashes(m):
    hp = Path(m.package_root.fspath, '.hashes.yaml')

    if not hp.exists():
        print("!!! NO HASHES: ", hp)
        return

    hashes = yaml.safe_load(hp.read_text())

    pm = last_build_marker_path(m)

    diffs = 0

    if pm.exists():
        p = open_package(pm.read_text())

        for r in p.resources():
            h1 = r.raw_row_generator.hash
            h2 = hashes['last_hashes'].get(r.name)

            if h1 != h2:
                diffs += 1

    prt(f"{diffs} diffs")

    # Exit nonzero when the source data has changed
    if diffs:
        sys.exit(1)
    else:
        sys.exit(0)
def build_osm_blocks(c):
    """Build blocks geo file and assign OSM points to blocks"""
    pkg_dir = str(Path(__file__).parent.resolve())
    pkg = mp.open_package(pkg_dir)
    points_logger.info(f"Pkg dir: {pkg_dir}")
    pylib.build_osm_points(pkg)
def x_test_categorical(self):
    import metapack as mp

    fn = '/Users/eric/proj/virt-proj/data-project/chis/healthpolicy.ucla.edu-chis_food/'

    pkg = mp.open_package(fn)

    print(pkg.resource)
def update_schema(self):
    # Re-open in case it has changed since loaded in this notebook
    pkg = mp.open_package(self.pkg.ref)

    for c in pkg.resource('combined').schema_term.find('Table.Column'):
        if not c.description:
            c.description = self.column_map.get(c.name.upper())

    pkg.write()
def create_roads_files(c):
    """Build the residential_roads.csv and nonres_roads.csv files"""
    cache_dir = str(Path(__file__).parent.resolve())
    lines_logger.info(f"Cache: {cache_dir}")
    pkg = mp.open_package(cache_dir)
    convert_pbf(c)
    pylib.build_lines(pkg)
def create_points_files(c):
    """Build the geohash_tags.csv file"""
    pkg_dir = str(Path(__file__).parent.resolve())
    pkg = mp.open_package(pkg_dir)
    points_logger.info(f"Pkg dir: {pkg_dir}")
    convert_pbf(c)
    pylib.build_points(pkg)
def test_html(self):
    p = open_package(
        test_data('packages/example.com/example.com-full-2017-us/metadata.csv'))

    self.assertTrue(len(p._repr_html_()) > 4500, len(p._repr_html_()))

    print(list(e.name for e in p.find('Root.Resource')))

    r = p.find_first('Root.Resource', name='random-names')

    self.assertTrue(len(r._repr_html_()) > 400, len(r._repr_html_()))
def x_test_pandas(self):
    package_dir = '/Volumes/Storage/proj/virt-proj/metatab3/metatab-packages/civicknowledge.com/immigration-vs-gdp'

    doc = open_package(package_dir)

    r = doc.first_resource(name='country_gdp')

    rows = list(r)
    print(len(rows))

    df = r.dataframe()
    print(df.head())
def test_basic(self):
    import metapack as mp
    import pandas as pd

    pd.set_option('display.width', 120)
    pd.set_option('display.max_columns', 12)

    p = mp.open_package(
        '/Volumes/Storage/proj/virt/data-projects/workshift.us/packages/nlsinfo.org-nlsy-shiftwork/metadata.csv')

    nlsy = NLSY97(p.reference('shiftwork_97'))

    print(nlsy.var_labels)

    print(nlsy.question_frame('YEMP_81300').head())
def test_build_dataframe(self):
    p = open_package(
        test_data('packages/example.com/example.com-python/metadata.csv'))

    df = p.resource('simple').dataframe()
    self.assertEqual(270, df.sum().sum())

    df = p.resource('explicit_dataframe_source').dataframe()
    self.assertEqual(435, df.sum().sum())

    df = p.resource('implicit_dataframe_source').dataframe()
    self.assertEqual(435, df.sum().sum())
def test_table(self):
    import json
    from metapack import open_package
    from itertools import islice

    u = '/Volumes/Storage/proj/virt/data-projects/client-boston-college/bc.edu-dataconv_poc/_packages/bc.edu-dataconv_poc-1/'

    pkg = open_package(u)

    r = pkg.resource('comments')

    json_headers = [(c['pos'], c.get('json')) for c in r.columns()]

    for row in islice(r, None, 10):
        d = {}

        for pos, jh in json_headers:
            add_to_struct(d, jh, row[pos])

        print(json.dumps(d, indent=4, cls=VTEncoder))
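add_to_struct and VTEncoder come from outside this excerpt. A minimal sketch of what add_to_struct plausibly does, assuming the 'json' column property holds a dotted path like 'author.name'; the real helper may also handle list indexes or other path syntax:

def add_to_struct(d, path, value):
    # Hypothetical reconstruction: set a value in a nested dict from a
    # dotted path, creating intermediate dicts as needed.
    if not path:
        return
    keys = path.split('.')
    for k in keys[:-1]:
        d = d.setdefault(k, {})
    d[keys[-1]] = value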
def test_open_package(self):
    from metapack import open_package
    from metapack.terms import Resource

    p = open_package(
        test_data('packages/example.com/example.com-full-2017-us/metadata.csv'))

    self.assertEqual(Resource, type(p.find_first('root.datafile')))

    self.assertEqual('example.com-full-2017-us-1', p.find_first('Root.Name').value)

    self.assertEqual(16, len(list(p['Resources'].find('Root.Resource'))))

    all_names = [r.name for r in p.find('Datafile')]

    for name in ['renter_cost', 'simple-example-altnames', 'simple-example',
                 'unicode-latin1', 'unicode-utf8', 'renter_cost_excel07',
                 'renter_cost_excel97', 'renter_cost-2', 'random-names',
                 'random-names-fs', 'random-names-csv', 'random-names-xlsx',
                 'random-names-zip', 'sra']:
        self.assertIn(name, all_names)

    self.assertIsInstance(p.resource('random-names'), Resource)

    self.assertEqual('random-names', p.resource('random-names').name)

    r = p.find_first('Root.DataFile')
    print(r.resolved_url)

    self.assertEqual(
        'http://public.source.civicknowledge.com/example.com/sources/test_data.zip#renter_cost.csv',
        str(r.resolved_url))

    for r in p.find('Root.DataFile'):
        if r.name != 'unicode-latin1':
            continue

        self.assertEqual(int(r.nrows), len(list(r)))

    self.assertEqual(['ipums', 'bordley', 'mcdonald', 'majumder'],
                     [c.name for c in p['Bibliography']])
def reclaim_trial(m):
    tm = trial_build_marker_path(m).read_text()

    print(tm)

    p = open_package(tm)

    print(p.package_url)
    print(m.doc.package_url)
    print(p['Root'].get_value('Version.Build'))

    # Replace the current Version term with the one from the trial build
    vt = p['Root'].find_first('Version')
    evt = m.doc['Root'].find_first('Version')

    m.doc['Root'].remove_term(evt)
    m.doc['Root'].add_term(vt)

    m.doc.write()
def s3(c, s3_bucket=None):
    """Publish to s3, if the proper bucket and site variables are defined.

    If the package should not be published, add a 'Redistribution' Term to
    the root level of the metadata. It can take two values:

        "hidden": publish to S3, but not wordpress
        "private": Don't publish to either S3 or Wordpress
    """
    s3_bucket = c.metapack.s3_bucket or s3_bucket

    pkg = open_package('./metadata.csv')

    redist = pkg.find_first_value('Root.Redistribution')
    name = pkg.name

    if redist == 'private':
        print(f"⚠️ Package {name} is private; won't upload to s3")
    elif s3_bucket:
        c.run(f"mp s3 -s {s3_bucket}", pty=True)
def test_dataframe(self):
    try:
        p = open_package(
            test_data('packages/example.com/example.com-full-2017-us/metadata.csv'))

        r = p.resource('random-names')

        df = r.dataframe()

        self.assertTrue(df.describe().loc['count', 'Size'] == 100)
        self.assertTrue(df.describe().loc['mean', 'Size'].round(4) == 49.8032)

        df = r.read_csv()

        self.assertTrue(df.describe().loc['count', 'Size'] == 100)
        self.assertTrue(df.describe().loc['mean', 'Size'].round(4) == 49.8032)

    except ImportError:
        # unittest.skip() only builds a decorator; to skip from inside a
        # running test, use self.skipTest(), which raises SkipTest.
        self.skipTest("Pandas not installed")
def get_columns(pkg):
    """Get the columns from the existing schema"""
    # Re-open in case it has changed since loaded in this notebook
    pkg = mp.open_package(pkg.ref)

    return [e['name'] for e in pkg.resource('census_set').columns()]
import metapack as mp
import subprocess as sp

from os.path import exists, join
from os import remove, mkdir

pkg = mp.open_package('_packages/sandiegodata.org-planning-1')

print("Package: ", pkg.package_url)

package_dir = 'package'

if not exists(package_dir):
    mkdir(package_dir)

tracts = pkg.resource('tract_boundaries').geoframe().set_index('geoid')

for r in pkg.resources():
    if r.headers and 'geometry' not in r.headers and 'geoid' in r.headers:
        print("Writing GeoJSON: ", r.name)

        df = r.read_csv()
        gdf = tracts.join(df.set_index('geoid'))

        gjpath = join(package_dir, r.name + '.geojson')

        if exists(gjpath):
            remove(gjpath)

        gdf.to_file(gjpath, 'GeoJSON')
def run_s3(args):
    m = MetapackCliMemo(args)

    if m.args.credentials:
        show_credentials(m.args.profile)
        exit(0)

    # upload_packages uploads the FS (individual files) and XLSX packages,
    # but does not create the CSV package file
    dist_urls, fs_p = upload_packages(m)

    writes = 0
    csv_url = None

    if dist_urls:
        # Create the CSV package, with links into the filesystem package
        if fs_p:
            access_url, dist_urls, csv_url = create_s3_csv_package(m, dist_urls, fs_p)
        else:
            # If this happens, then no packages were created, because an FS
            # package is always built first
            prt("Not creating CSV package; no FS package was uploaded")
            access_url = None

        if access_url:  # Guard: access_url is unset when there was no FS package
            add_to_index(open_package(access_url))
    else:
        access_url = None

    if dist_urls:
        rows = [[path, url, reason] for what, reason, url, path
                in fs_p.files_processed if what != 'skip']

        if rows:
            prt("\nWrote these files:")
            writes = len(rows)
            prt(tabulate(rows, headers='path url reason'.split()))

        rows = [[path, url, reason] for what, reason, url, path
                in fs_p.files_processed if what == 'skip']

        if rows:
            prt("\nSkipped these files:")
            prt(tabulate(rows, headers='path url reason'.split()))

        prt("\nSynchronized these Package Urls")
        prt("-------------------------------")
        for au in dist_urls:
            prt(au)
        prt("-------------------------------")
    else:
        prt("⚠️ Did not find any packages to upload to S3")

    m.doc['Root'].get_or_new_term('Root.Issued').value = datetime_now()

    if fs_p:
        clear_cache(m, fs_p.files_processed)

    if csv_url:  # Guard: no CSV package was created without an FS package
        csv_pkg = open_package(csv_url)

        # Write the last distribution marker
        dist_info = {
            'name': m.doc.name,
            'version': m.doc.version,
            'access_url': access_url,
            'path': csv_pkg.path,
            'issued': datetime_now(),
            'distributions': {}
        }

        for d in csv_pkg['Distributions'].find('Root.Distribution'):
            dist_info['distributions'][d.type] = str(d.metadata_url)

        Path(last_dist_marker_path(m)).write_text(yaml.safe_dump(dist_info))

    if m.args.result:
        if writes > 0:
            print(f"✅ Wrote {writes} files to {args.s3}")
        else:
            print(f"🚫 Did not write anything to {args.s3}")
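The "last distribution" marker written at the end is plain YAML, so its shape can be read directly off the dist_info literal above. A hypothetical example of the file's contents; every value here is made up, including the distribution type keys:

name: example.com-example-package
version: 1
access_url: s3://example-bucket/example.com-example-package-1
path: /path/to/example.com-example-package-1.csv
issued: '2020-01-01T00:00:00'
distributions:
  fs: s3://example-bucket/example.com-example-package-1/metadata.csv
  csv: s3://example-bucket/example.com-example-package-1.csv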
# Copy metadata that can't be copied with mp update -P
import metapack as mp
import subprocess as sp

from os.path import exists, join
from os import remove, mkdir

pkg = mp.open_package('.')

print("Package: ", pkg.package_url)

pdb = pkg.reference('planning_db_sd')

col_map = {
    c['header'].replace('_acs_12_16', '').replace('_acsmoe_12_16', '_m90'): c
    for c in pdb.columns()
}

acs_pdb = pkg.resource('acs_pdb')

for c in acs_pdb.schema_term.children:
    ch = c.get_or_new_child('Description')
    ch.value = col_map.get(c.name)['description']

pkg.write_csv()