Example #1
def write_hashes(m):
    pm = last_build_marker_path(m)

    hashes = {}

    if pm.exists():
        hashes['last_package'] = pm.read_text()

        p = open_package(hashes['last_package'])

        hashes['last_hashes'] = {
            r.name: r.raw_row_generator.hash
            for r in p.resources()
        }

    tm = trial_build_marker_path(m)

    if tm.exists():
        hashes['trial_package'] = tm.read_text()

        p = open_package(hashes['trial_package'])

        hashes['trial_hashes'] = {
            r.name: r.raw_row_generator.hash
            for r in p.resources()
        }

    hp = Path(m.package_root.fspath, '.hashes.yaml')

    hp.write_text(yaml.safe_dump(hashes))
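
The marker-path helpers used here aren't part of this listing. A minimal sketch, assuming they return Path objects for small text files under the package root that record the last and trial build package references:

from pathlib import Path

# Hypothetical sketches; the real helpers live elsewhere (e.g. metapack_build)
# and may use different file names.
def last_build_marker_path(m):
    # Text file recording the reference of the last completed build
    return Path(m.package_root.fspath, '.last_build')

def trial_build_marker_path(m):
    # Text file recording the reference of the most recent trial build
    return Path(m.package_root.fspath, '.trial_build')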
Example #2
def find_csv_packages(m, downloader):
    """Locate the build CSV package, which will have distributions if it was generated  as
    an S3 package"""
    from metapack_build.package import CsvPackageBuilder

    pkg_dir = m.package_root
    name = m.doc.get_value('Root.Name')

    package_path, cache_path = CsvPackageBuilder.make_package_path(
        pkg_dir, name)

    if package_path.exists():
        r = open_package(package_path, downloader=downloader)
        return r

    # Newest package first, sorted by creation time
    pkgs = sorted(
        ((f.stat().st_ctime, f) for f in pkg_dir.fspath.glob('*.csv')),
        reverse=True)

    if pkgs:
        return open_package(pkgs[0][1], downloader=downloader)

    return None
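
A hedged usage sketch; `m` stands in for the CLI memo object the other examples use, and `downloader` for whatever downloader the caller already holds:

# Returns the newest built CSV package, or None if none has been built yet.
pkg = find_csv_packages(m, downloader)
if pkg is not None:
    print(pkg.name)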
Example #3
def compare_hashes(m):
    from metapack import open_package

    hp = Path(m.package_root.fspath, '.hashes.yaml')

    if not hp.exists():
        return None

    hashes = yaml.safe_load(hp.read_text())

    pm = last_build_marker_path(m)

    diffs = 0
    if pm.exists():

        p = open_package(pm.read_text())

        for r in p.resources():
            h1 = r.raw_row_generator.hash
            h2 = hashes['last_hashes'].get(r.name)

            if h1 != h2:
                diffs += 1

    return diffs > 0
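
This pairs with write_hashes() from Example #1; a minimal sketch of the round trip:

# After a build, record the per-resource row hashes...
write_hashes(m)
# ...then on a later run, detect whether upstream data changed.
if compare_hashes(m):
    print('Source data changed since the hashes were written')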
Example #4
def expand_refs(r):
    from metapack import open_package, Resource
    from pathlib import Path


    if isinstance(r, Resource):
        yield r.doc, r
        return

    if isinstance(r, (list, tuple)):  # Ought to accept any iterable type
        for e in r:
            yield from expand_refs(e)
        return

    pkg = open_package(r)

    if not pkg.resources():
        # Metatab can open a lot of normal files and try to interpret them, without errors,
        # but the files won't have any resources. So, just assume it is a
        # file with a list of packages.
        for l in Path(r).open().readlines():
            yield from expand_refs(l.strip())
        return

    if pkg.default_resource:
        yield from expand_refs(pkg.resource(pkg.default_resource))
    else:
        for r in pkg.resources():
            if r.resolved_url.proto == 'metapack':
                try:
                    yield pkg, r.resolved_url.resource
                except AttributeError:
                    yield from r.resolved_url.doc.resources()
            else:
                yield pkg, r
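
A hedged usage sketch: expand_refs() accepts a Resource, an iterable of references, or a single package reference (including a plain text file listing packages), and yields (package, resource) pairs in every case. The paths below are hypothetical:

for pkg, r in expand_refs(['pkg-a/metadata.csv', 'pkg-b/metadata.csv']):
    print(pkg.name, r.name)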
Example #5
def build_clusters(c):

    pkg_dir = str(Path(__file__).parent.resolve())
    pkg = mp.open_package(pkg_dir)
    points_logger.info(f"Pkg dir: {pkg_dir}")

    pylib.build_clusters(pkg)
Example #6
def set_descriptions(c):
    """Set the descriptions of the columns from the upstream data source"""

    pkg = mp.open_package('.')
    r = pkg.resource('distressed_counties')

    ur = pkg.reference('ffeic_distressed')  # Upstream resource
    upkg = ur.resolved_url.package_url.doc  # Upstream package

    ucols = upkg.resource(ur.resource.name).columns()

    dmap = {c['header']: c['description'] for c in ucols}

    def get_desc(h):
        for k, v in dmap.items():
            if h.startswith(k):

                if '_pop_pct' in h:
                    return "Percentage of population in county in tracts with flag: " + v
                elif '_pop' in h:
                    return "Population in county in tracts with flag: " + v
                elif '_pct' in h:
                    return "Percentage of tracts in county with flag: " + v
                else:
                    return "Count of tracts in county with flag: " + v

    for c in r.schema_term.find('Table.Column'):
        desc = get_desc(c.name)

        if desc and not c['description']:
            c['description'] = desc

    pkg.write()
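
The suffix rules in get_desc() encode a naming convention; for a hypothetical upstream flag column 'dist' described as 'Distressed', they produce:

# dist_pop_pct -> "Percentage of population in county in tracts with flag: Distressed"
# dist_pop     -> "Population in county in tracts with flag: Distressed"
# dist_pct     -> "Percentage of tracts in county with flag: Distressed"
# dist         -> "Count of tracts in county with flag: Distressed"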
Example #7
def build(c, force=None):
    """Build a filesystem package."""

    sys.path.append(str(Path(__file__).parent.resolve()))
    import pylib

    import logging
    from pylib import logger

    try:
        logging.basicConfig()
        logger.setLevel(logging.INFO)

        pkg_dir = str(get_pkg_dir())

        print(f"Pkg dir: {pkg_dir}")

        pkg = mp.open_package(pkg_dir)

        ex = pylib.ExtractManager(pkg)
        ex.build(force)

        mp_build(c, force)
    finally:
        sys.path.pop()  # So other packages won't get this pylib
Example #8
def publish(c, s3_bucket=None, wp_site=None, groups=(), tags=()):
    """ Publish to s3 and wordpress, if the proper bucket and site variables are defined

    If the package should not be published, add a 'Redistribution' Term to the root level
    of the metadata. It can take two values:

        "hidden": publish to S3, but not wordpress
        "private": Don't publish to either S3 or Wordpress

    """
    wp_site = c.metapack.wp_site or wp_site

    groups = c.metapack.groups or groups
    tags = c.metapack.tags or tags

    group_flags = ' '.join([f"-g{g}" for g in groups])
    tag_flags = ' '.join([f"-t{t}" for t in tags])

    pkg = open_package('./metadata.csv')

    redist = pkg.find_first_value('Root.Redistribution')
    name = pkg.name

    s3(c, s3_bucket=s3_bucket)

    if redist in ('private', 'hidden'):
        print(f"⚠️  Package {name} is {redist}; won't publish to wordpress")
    elif wp_site:
        c.run(f"mp wp -s {wp_site} {group_flags} {tag_flags} -p", pty=True)

    if redist not in ('private', 'hidden') and not s3_bucket and not wp_site:
        print("⚠️  Neither s3 bucket nor wp site config specified; nothing to do")
Example #9
def compare_hashes(m):
    hp = Path(m.package_root.fspath, '.hashes.yaml')

    if not hp.exists():
        print("!!! NO HASHES: ", hp)
        return

    hashes = yaml.safe_load(hp.read_text())

    pm = last_build_marker_path(m)

    diffs = 0
    if pm.exists():

        p = open_package(pm.read_text())

        for r in p.resources():
            h1 = r.raw_row_generator.hash
            h2 = hashes['last_hashes'].get(r.name)

            if h1 != h2:
                diffs += 1

    prt(f"{diffs} diffs")

    if diffs:
        sys.exit(1)
    else:
        sys.exit(0)
Example #10
def build_osm_blocks(c):
    """Build blocks geo file and assign OSM points to blocks"""
    pkg_dir = str(Path(__file__).parent.resolve())
    pkg = mp.open_package(pkg_dir)
    points_logger.info(f"Pkg dir: {pkg_dir}")

    pylib.build_osm_points(pkg)
Example #11
    def x_test_categorical(self):

        import metapack as mp

        fn = '/Users/eric/proj/virt-proj/data-project/chis/healthpolicy.ucla.edu-chis_food/'

        pkg = mp.open_package(fn)

        print(pkg.resource)
Example #12
    def update_schema(self):
        pkg = mp.open_package(
            self.pkg.ref
        )  # Re-open in case it has changed since loaded in this notebook

        for c in pkg.resource('combined').schema_term.find('Table.Column'):
            if not c.description:
                c.description = self.column_map.get(c.name.upper())

        pkg.write()
Example #13
def create_roads_files(c):
    """Build the residential_roads.csv and nonres_roads.csv files"""
    cache_dir = str(Path(__file__).parent.resolve())
    lines_logger.info(f"Cache: {cache_dir}")
    
    pkg = mp.open_package(cache_dir)

    convert_pbf(c)

    pylib.build_lines(pkg)
Example #14
def create_points_files(c):
    """Build the geohash_tags.csv file"""
    
    pkg_dir = str(Path(__file__).parent.resolve())
    pkg = mp.open_package(pkg_dir)
    points_logger.info(f"Pkg dir: {pkg_dir}")

    convert_pbf(c)
    
    pylib.build_points(pkg)
Example #15
    def test_html(self):

        p = open_package(
            test_data(
                'packages/example.com/example.com-full-2017-us/metadata.csv'))

        self.assertTrue(len(p._repr_html_()) > 4500, len(p._repr_html_()))

        print(list(e.name for e in p.find('Root.Resource')))

        r = p.find_first('Root.Resource', name='random-names')

        self.assertTrue(len(r._repr_html_()) > 400, len(r._repr_html_()))
Example #16
    def x_test_pandas(self):

        package_dir = '/Volumes/Storage/proj/virt-proj/metatab3/metatab-packages/civicknowledge.com/immigration-vs-gdp'

        doc = open_package(package_dir)

        r = doc.first_resource(name='country_gdp')

        rows = list(r)

        print(len(rows))

        df = r.dataframe()

        print(df.head())
Example #17
    def test_basic(self):
        import metapack as mp
        import pandas as pd

        pd.set_option('display.width', 120)
        pd.set_option('display.max_columns', 12)

        p = mp.open_package(
            '/Volumes/Storage/proj/virt/data-projects/workshift.us/packages/nlsinfo.org-nlsy-shiftwork/metadata.csv'
        )

        nlsy = NLSY97(p.reference('shiftwork_97'))

        print(nlsy.var_labels)

        print(nlsy.question_frame('YEMP_81300').head())
Example #18
    def test_build_dataframe(self):

        p = open_package(
            test_data('packages/example.com/example.com-python/metadata.csv'))

        df = p.resource('simple').dataframe()

        self.assertEqual(270, df.sum().sum())

        df = p.resource('explicit_dataframe_source').dataframe()

        self.assertEqual(435, df.sum().sum())

        df = p.resource('implicit_dataframe_source').dataframe()

        self.assertEqual(435, df.sum().sum())
Example #19
    def test_table(self):
        import json
        from metapack import open_package
        from itertools import islice

        u = '/Volumes/Storage/proj/virt/data-projects/client-boston-college/bc.edu-dataconv_poc/_packages/bc.edu-dataconv_poc-1/'
        pkg = open_package(u)
        r = pkg.resource('comments')

        json_headers = [(c['pos'], c.get('json')) for c in r.columns()]

        for row in islice(r, None, 10):
            d = {}
            for pos, jh in json_headers:
                add_to_struct(d, jh, row[pos])

            print(json.dumps(d, indent=4, cls=VTEncoder))
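
add_to_struct() isn't shown in this listing. A minimal sketch, assuming each column's json property is a dotted path like 'user.name':

def add_to_struct(d, path, value):
    # Hypothetical helper: set value at a dotted path, creating
    # intermediate dicts as needed; ignore columns with no json path.
    if not path:
        return
    *parents, leaf = path.split('.')
    for k in parents:
        d = d.setdefault(k, {})
    d[leaf] = value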
Example #20
    def test_open_package(self):

        from metapack import open_package
        from metapack.terms import Resource

        p = open_package(
            test_data(
                'packages/example.com/example.com-full-2017-us/metadata.csv'))

        self.assertEqual(Resource, type(p.find_first('root.datafile')))

        self.assertEqual('example.com-full-2017-us-1',
                         p.find_first('Root.Name').value)

        self.assertEqual(16, len(list(p['Resources'].find('Root.Resource'))))

        all_names = [r.name for r in p.find('Datafile')]

        for name in [
                'renter_cost', 'simple-example-altnames', 'simple-example',
                'unicode-latin1', 'unicode-utf8', 'renter_cost_excel07',
                'renter_cost_excel97', 'renter_cost-2', 'random-names',
                'random-names-fs', 'random-names-csv', 'random-names-xlsx',
                'random-names-zip', 'sra'
        ]:
            self.assertIn(name, all_names)

        self.assertIsInstance(p.resource('random-names'), Resource)
        self.assertEqual('random-names', p.resource('random-names').name)

        r = p.find_first('Root.DataFile')
        print(r.resolved_url)
        self.assertEqual(
            'http://public.source.civicknowledge.com/example.com/sources/test_data.zip#renter_cost.csv',
            str(r.resolved_url))

        for r in p.find('Root.DataFile'):

            if r.name != 'unicode-latin1':
                continue

            self.assertEqual(int(r.nrows), len(list(r)))

        self.assertEqual(['ipums', 'bordley', 'mcdonald', 'majumder'],
                         [c.name for c in p['Bibliography']])
Example #21
def reclaim_trial(m):
    tm = trial_build_marker_path(m).read_text()
    print(tm)

    p = open_package(tm)

    print(p.package_url)
    print(m.doc.package_url)

    print(p['Root'].get_value('Version.Build'))

    vt = p['Root'].find_first('Version')

    evt = m.doc['Root'].find_first('Version')

    m.doc['Root'].remove_term(evt)
    m.doc['Root'].add_term(vt)

    m.doc.write()
Example #22
def s3(c, s3_bucket=None):
    """ Publish to s3, if the proper bucket and site variables are defined

    If the package should not be published, add a 'Redistribution' Term to the root level
    of the metadata. It can take two values:

        "hidden": publish to S3, but not wordpress
        "private": Don't publish to either S3 or Wordpress

    """

    s3_bucket = c.metapack.s3_bucket or s3_bucket

    pkg = open_package('./metadata.csv')

    redist = pkg.find_first_value('Root.Redistribution')
    name = pkg.name

    if redist == 'private':
        print(f"⚠️  Package {name} is private; won't upload to s3")
    elif s3_bucket:
        c.run(f"mp s3 -s {s3_bucket}", pty=True)
Example #23
    def test_dataframe(self):

        try:
            p = open_package(
                test_data(
                    'packages/example.com/example.com-full-2017-us/metadata.csv'
                ))

            r = p.resource('random-names')

            df = r.dataframe()

            self.assertTrue(df.describe().loc['count', 'Size'] == 100)
            self.assertTrue(df.describe().loc['mean',
                                              'Size'].round(4) == 49.8032)

            df = r.read_csv()

            self.assertTrue(df.describe().loc['count', 'Size'] == 100)
            self.assertTrue(df.describe().loc['mean',
                                              'Size'].round(4) == 49.8032)
        except ImportError:
            # unittest.skip() is a decorator factory; use skipTest() to
            # actually skip at runtime
            self.skipTest("Pandas not installed")
Example #24
def get_columns(pkg):
    """Get the columns from the existing schema"""
    pkg = mp.open_package(
        pkg.ref
    )  # Re-open in case it has changed since loaded in this notebook
    return [e['name'] for e in pkg.resource('census_set').columns()]
Example #25
import metapack as mp
import subprocess as sp
from os.path import exists, join
from os import remove, mkdir

pkg = mp.open_package('_packages/sandiegodata.org-planning-1')

print("Package: ", pkg.package_url)

package_dir = 'package'

if not exists(package_dir):
    mkdir(package_dir)

tracts = pkg.resource('tract_boundaries').geoframe().set_index('geoid')

for r in pkg.resources():

    if r.headers and 'geometry' not in r.headers and 'geoid' in r.headers:
        print("Writing GeoJSON: ", r.name)
        df = r.read_csv()

        gdf = tracts.join(df.set_index('geoid'))

        gjpath = join(package_dir, r.name + '.geojson')

        if exists(gjpath):
            remove(gjpath)
        gdf.to_file(gjpath, 'GeoJSON')
Example #26
def run_s3(args):
    m = MetapackCliMemo(args)

    if m.args.credentials:
        show_credentials(m.args.profile)
        exit(0)

    # upload_packages uploads the FS (individual files) and XLSX packages,
    # but does not create the CSV package file
    dist_urls, fs_p = upload_packages(m)

    writes = 0
    csv_url = None
    access_url = None

    if dist_urls:

        # Create the CSV package, with links into the filesystem package
        if fs_p:
            access_url, dist_urls, csv_url = create_s3_csv_package(
                m, dist_urls, fs_p)
            # Index only when a CSV package was actually created
            add_to_index(open_package(access_url))
        else:
            # If this happens, then no packages were created, because an FS package
            # is always built first
            prt("Not creating CSV package; no FS package was uploaded")

    if dist_urls:

        rows = [[path, url, reason]
                for what, reason, url, path in fs_p.files_processed
                if what != 'skip']
        if rows:
            prt("\nWrote these files:")
            writes = len(rows)
            prt(tabulate(rows, headers='path url reason'.split()))

        rows = [[path, url, reason]
                for what, reason, url, path in fs_p.files_processed
                if what == 'skip']
        if rows:
            prt("\nSkipped these files:")
            prt(tabulate(rows, headers='path url reason'.split()))

        prt("\nSynchronized these Package Urls")
        prt("-------------------------------")
        for au in dist_urls:
            prt(au)
        prt("-------------------------------")

    else:
        prt("тЪая╕П Did not find any packages to upload to S3")

    m.doc['Root'].get_or_new_term('Root.Issued').value = datetime_now()

    if fs_p:
        clear_cache(m, fs_p.files_processed)

    if csv_url:
        csv_pkg = open_package(csv_url)

        # Write the last distribution marker
        dist_info = {
            'name': m.doc.name,
            'version': m.doc.version,
            'access_url': access_url,
            'path': csv_pkg.path,
            'issued': datetime_now(),
            'distributions': {}
        }

        for d in csv_pkg['Distributions'].find('Root.Distribution'):
            dist_info['distributions'][d.type] = str(d.metadata_url)

        Path(last_dist_marker_path(m)).write_text(yaml.safe_dump(dist_info))

    if m.args.result:
        if writes > 0:
            print(f"тЬЕ Wrote {writes} files to {args.s3}")
        else:
            print(f"ЁЯЪл Did not write anything to {args.s3}")
Example #27
# Copy metadata that can't be copied with mp update -P

import metapack as mp
import subprocess as sp
from os.path import exists, join
from os import remove, mkdir

pkg = mp.open_package('.')

print("Package: ", pkg.package_url)

pdb = pkg.reference('planning_db_sd')

col_map = {
    c['header'].replace('_acs_12_16', '').replace('_acsmoe_12_16', '_m90'): c
    for c in pdb.columns()
}

acs_pdb = pkg.resource('acs_pdb')

for c in acs_pdb.schema_term.children:
    mapped = col_map.get(c.name)
    if mapped:
        # Skip columns with no upstream counterpart instead of raising TypeError
        c.get_or_new_child('Description').value = mapped['description']

pkg.write_csv()