Example 1
def write_eda_notebook(m):
    # Get the EDA notebook file from Github

    url = "https://raw.githubusercontent.com/Metatab/exploratory-data-analysis/master/eda.ipynb"

    resource = m.get_resource()

    if not resource:
        warn('Must specify a resource. Select one of:')
        list_rr(m.doc)
        sys.exit(0)

    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()

    nb_path = Path('notebooks/{}-{}.ipynb'.format(
        splitext(basename(url))[0], resource.name))

    ensure_dir(nb_path.parent)

    if nb_path.exists():
        err("Notebook {} already exists".format(nb_path))

    with nb_path.open('wb') as f:
        f.write(r.content)

    prt('Wrote {}'.format(nb_path))

    with edit_notebook(nb_path) as nb:
        set_cell_source(nb, 'resource_name',
                        "resource_name='{}'".format(resource.name))
Example 2
def convert_documentation(nb_path):
    """Run only the document conversion portion of the notebook conversion

      The final document will not be completel
    """

    with open(nb_path) as f:
        nb = nbformat.reads(f.read(), as_version=4)

    doc = ExtractInlineMetatabDoc(package_url="metapack+file:" +
                                  dirname(nb_path)).run(nb)

    package_name = doc.as_version(None)

    output_dir = join(getcwd(), package_name)

    de = DocumentationExporter(config=Config(),
                               log=logger,
                               metadata=doc_metadata(doc))
    prt('Converting documentation')
    output, resources = de.from_filename(nb_path)

    fw = FilesWriter()

    fw.build_directory = join(output_dir, 'docs')
    fw.write(output, resources, notebook_name='notebook')
    prt("Wrote documentation to {}".format(fw.build_directory))
Example 3
def maybe_trial_build(m):
    '''Update the metadata for a trial build, then restore it'''
    from shutil import copyfile

    if not m.args.trial:
        yield False, m.mt_file
        return

    if not m.doc._has_semver():
        raise MetapackError(
            "To use trial builds, package must have a semantic version ")

    prt('Building a trial')

    mt_file = Path(m.mt_file.fspath).parent.joinpath('trial.csv')

    copyfile(m.mt_file.fspath, mt_file)

    doc = MetapackDoc(mt_file)
    version = doc['Root'].find_first('Root.Version')
    vb = version.get_or_new_child('Version.Build')
    vb.value = 'trial'

    try:
        doc.update_name()
        doc.write()

        yield True, parse_app_url(str(mt_file), downloader)
    finally:
        mt_file.unlink()
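
Since maybe_trial_build is written as a generator that yields once and cleans up in a finally block, it is presumably exposed as a context manager (an assumption; the decorator is not shown in this excerpt). A minimal usage sketch under that assumption:

    from contextlib import contextmanager

    # Assumption: in the real module the generator is usable in a with-statement,
    # e.g. via maybe_trial_build = contextmanager(maybe_trial_build).
    with contextmanager(maybe_trial_build)(m) as (is_trial, mt_file):
        # When --trial was not given, is_trial is False and mt_file is m.mt_file.
        # Otherwise mt_file points at the temporary trial.csv, which is deleted
        # when the block exits.
        print(is_trial, mt_file)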
Example 4
def compare_hashes(m):
    hp = Path(m.package_root.fspath, '.hashes.yaml')

    if not hp.exists():
        print("!!! NO HASHES: ", hp)
        return

    hashes = yaml.safe_load(hp.read_text())

    pm = last_build_marker_path(m)

    diffs = 0
    if pm.exists():

        p = open_package(pm.read_text())

        for r in p.resources():
            h1 = r.raw_row_generator.hash
            h2 = hashes['last_hashes'].get(r.name)

            if h1 != h2:
                diffs += 1

    prt(f"{diffs} diffs")

    if diffs:
        sys.exit(1)
    else:
        sys.exit(0)
Example 5
def _process_metapack_resource(doc, r, force):
    remote_resource = r.resolved_url.resource

    if not remote_resource:
        warn('Metatab resource could not be resolved from {}'.format(r.resolved_url))
        return

    remote_st = remote_resource.schema_term

    schema_term = r.schema_term

    if schema_term:

        prt("Updating table '{}' ".format(r.schema_name))

        # Remove existing columns, so add them back later, possibly in a new order
        for child in list(schema_term.children):
            schema_term.remove_child(child)

    else:
        prt("Adding table '{}' ".format(r.schema_name))
        schema_term = doc['Schema'].new_term('Table', r.schema_name)

    for c in remote_st.children:
        schema_term.add_child(c)
Example 6
def row_generator(resource, doc, env, *args, **kwargs):
    """ An example row generator function.

    Reference this function in a Metatab file as the value of a Datafile:

            Datafile: python:pylib#row_generator

    The function must yield rows, with the first being headers, and subsequent rows being data.

    :param resource: The Datafile term being processed
    :param doc: The Metatab document that contains the term being processed
    :param env: A dict of environment values and resource properties; see the keys listed below
    :param args: Positional arguments passed to the generator
    :param kwargs: Keyword arguments passed to the generator
    :return:


    The env argument is a dict with these environmental keys:

    * CACHE_DIR
    * RESOURCE_NAME
    * RESOLVED_URL
    * WORKING_DIR
    * METATAB_DOC
    * METATAB_WORKING_DIR
    * METATAB_PACKAGE

    It also contains key/value pairs for all of the properties of the resource.

    """

    import requests
    from io import StringIO
    import csv
    from metapack.cli.core import prt

    ref = doc.reference('url_template')

    for i, year in enumerate(range(2000, 2019)):
        url = ref.resolved_url.interpolate({'year': year})

        prt(url)

        r = requests.get(url)
        r.raise_for_status()

        f = StringIO(r.text)
        reader = csv.reader(f, delimiter=',')
        for j, row in enumerate(reader):

            if j == 0:  # first row
                if i == 0:  # first file
                    yield row  # yield the header
                    continue
                else:
                    continue

            yield row
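
The docstring above fixes the contract: the first yielded row is the header and every later row is data. A minimal sketch of a generator that satisfies that contract (column names and values are made up for illustration):

    def minimal_row_generator(resource, doc, env, *args, **kwargs):
        # First yielded row is the header, per the contract described above.
        yield ['year', 'value']
        for year in range(2000, 2003):
            yield [year, year * 10]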
Example 7
def process_schemas(mt_file, resource=None, cache=None, clean=False, report_found=True, force=False, min_rows=5000,
                    allow_codes=True):
    from metapack import MetapackDoc, MetapackResourceUrl, MetapackDocumentUrl

    if isinstance(mt_file, MetapackDoc):
        doc = mt_file
        write_doc_to_file = False
    else:
        doc = MetapackDoc(mt_file)
        write_doc_to_file = True

    try:
        if clean:
            doc['Schema'].clean()
        else:
            doc['Schema']  # raises KeyError if the Schema section is missing

    except KeyError:
        doc.new_section('Schema', ['DataType', 'AltName', 'Description'])

    schemas_processed = 0

    for r in doc['Resources'].find('Root.Resource'):

        if resource and r.name != resource:
            continue

        schema_term = r.schema_term

        col_count = len(list(r.columns()))
        datatype_count = sum(1 for c in r.columns() if c['datatype'])

        if schema_term and col_count == datatype_count and force is False:
            if report_found:
                prt("Found table for '{}'; skipping".format(r.schema_name))
            continue

        if col_count != datatype_count:
            prt("Found table for '{}'; but {} columns don't have datatypes"
                .format(r.schema_name, col_count - datatype_count))

        schemas_processed += 1

        rr = r.resolved_url

        rmtree(get_materialized_data_cache(doc), ignore_errors=True)

        if isinstance(rr, MetapackDocumentUrl):
            warn('{} is a MetapackDocumentUrl; skipping'.format(r.name))
        elif isinstance(rr, MetapackResourceUrl):
            _process_metapack_resource(doc, r, force)
        else:
            _process_normal_resource(doc, r, force, skip_start=min_rows, allow_codes=allow_codes)

    if write_doc_to_file and schemas_processed:
        write_doc(doc, mt_file)
Example 8
def touch_metadata(m):
    import os

    p = m.filesystem_package

    t = p.package_build_time()

    os.utime(m.doc.ref.fspath, (t, t))

    prt(f"Set times for '{m.doc.ref.fspath}'' to {t}")
Example 9
def run_colmap_test(args):
    m = MetapackCliMemo(args, downloader)

    r = m.get_resource()

    if not r:
        prt('Select a resource to run:')
        list_rr(m.doc)
        sys.exit(0)

    cm = get_col_map(r)

    print(cm)
Example 10
def index_packages(m):
    from metapack.cli.index import walk_packages
    from metapack.index import SearchIndex, search_index_file

    idx = SearchIndex(search_index_file())

    entries = []
    for p in walk_packages(None, parse_app_url(str(m.package_root.fspath))):
        prt("Indexing:", p.ref)
        idx.add_package(p)
        entries.append(p.name)

    idx.write()
    prt("Indexed ", len(entries), 'entries')
Example 11
def run_colmap_new(args):
    m = MetapackCliMemo(args, downloader)

    resources = get_resources(m)

    if not resources:
        err(f"No resources found with colmap name '{m.args.colmap_name}'")

    # Collect all of the headers, into a list of headers,
    # and the union of all of them in col_index
    col_index = []
    headers = []

    for r in resources:
        h = r.headers

        col_index += [
            alt_col_name(c) for c in h if alt_col_name(c) not in col_index
        ]
        headers.append(h)

    # Create lists, of the same length as the index, of the source
    # column names, at the same position as the alt_col_name is in the col_index
    data = [col_index]

    for header in headers:
        new_row = [None] * len(col_index)
        for c in header:
            new_row[col_index.index(alt_col_name(c))] = c

        data.append(new_row)

    t = [['index'] + [r.name for r in resources]] + list(
        zip(*data))  # zip transposes rows into columns.

    path = Path(f"colmap-{m.args.colmap_name}.csv")

    if m.args.print:
        from tabulate import tabulate
        prt(tabulate(t[1:], headers=t[0]))
    else:
        if path.exists() and not m.args.force:
            err(f"Col map file '{str(path)}' already exists. Use -f to overwrite"
                )

        else:
            with path.open('w') as f:
                csv.writer(f).writerows(t)
            prt(f"Wrote {str(path)}")
Example 12
def add_single_resource(doc, ref, cache, seen_names):
    from metatab.util import slugify

    t = doc.find_first('Root.Datafile', value=ref)

    if t:
        prt("Datafile exists for '{}', deleting".format(ref))
        doc.remove_term(t)
    else:
        prt("Adding {}".format(ref))

    term_name = classify_url(ref)

    path, name = extract_path_name(ref)

    # If the name already exists, try to create a new one.
    # 20 attempts ought to be enough.
    if name in seen_names:
        base_name = re.sub(r'-?\d+$', '', name)

        for i in range(1, 20):
            name = "{}-{}".format(base_name, i)
            if name not in seen_names:
                break

    seen_names.add(name)

    encoding = start_line = None
    header_lines = []

    if not name:
        from hashlib import sha1
        name = sha1(slugify(path).encode('ascii')).hexdigest()[:12]

        # xlrd gets grouchy if the name doesn't start with a char
        try:
            int(name[0])
            name = 'a' + name[1:]
        except Exception:
            pass

    return doc['Resources'].new_term(term_name,
                                     ref,
                                     name=name,
                                     startline=start_line,
                                     headerlines=','.join(
                                         str(e) for e in header_lines),
                                     encoding=encoding)
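
The name de-duplication above strips any trailing numeric suffix and then probes numbered candidates; the same logic in isolation:

    import re

    seen_names = {'table', 'table-1'}
    name = 'table-1'

    base_name = re.sub(r'-?\d+$', '', name)   # 'table'
    for i in range(1, 20):
        candidate = '{}-{}'.format(base_name, i)
        if candidate not in seen_names:
            name = candidate
            break
    # name == 'table-2'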
Example 13
def add_missing_files(package_dir):
    import metapack_build.support as support_dir

    src_dir = Path(support_dir.__file__).parent

    for src, dest in [('gitignore', '.gitignore'), ('tox.ini', 'tox.ini'),
                      ('requirements.txt', 'requirements.txt'),
                      ('tasks.py', 'tasks.py')]:

        dest_p = Path(package_dir).joinpath(dest)

        if not dest_p.exists():
            copyfile(src_dir.joinpath(src), dest_p)
            prt('Creating file: ', dest)
        else:
            prt('File exists:', dest)
Example 14
def run_url_add(args):
    """Add a resources entry, downloading the intuiting the file, replacing entries with
        the same reference"""

    m = MetapackCliMemo(args, downloader)

    update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    if isinstance(m.mt_file, MetapackDoc):
        doc = m.mt_file
    else:
        doc = MetapackDoc(m.mt_file)

    if 'Resources' not in doc:
        doc.new_section('Resources')

    doc['Resources'].args = [
        e for e in set(doc['Resources'].args +
                       ['Name', 'StartLine', 'HeaderLines', 'Encoding']) if e
    ]

    seen_names = set()

    u = parse_app_url(args.url)

    # Web URLs and file URLs list their contents differently.

    if u.proto == 'file':
        entries = u.list()
    else:
        entries = [ssu for su in u.list() for ssu in su.list()]

    errors = []

    for e in entries:
        if not add_single_resource(
                doc, e, cache=m.cache, seen_names=seen_names):
            errors.append(e)

    if errors:
        prt()
        warn("Found, but failed to add these urls:")
        for e in errors:
            print('    ', e)

    write_doc(doc)
Example 15
def update_promote(m):
    sections = m.args.promote

    if not sections or sections == 'RR':
        sections = ['resources', 'references']
    elif sections == '*':
        sections = [e.lower() for e in m.doc.sections.keys()]
    else:
        sections = [sections.lower()]

    for section_name, section in m.doc.sections.items():
        if section_name.lower() in sections:
            for arg in promote_terms(section):
                prt("Move {} to header in section {} ".format(
                    arg, section_name))

    write_doc(m.doc)
Example 16
def move_alt_names(m):
    doc = m.doc

    for t in doc['Schema'].find('Root.Table'):
        moved = 0
        for c in t.find('Table.Column'):

            altname = c.get('AltName')
            if altname:

                if not c.get('Description'):
                    c.description = (c.name or '').replace('\n', ' ')

                c.name = altname.value
                c.remove_child(altname)

                moved += 1

        prt("Moved {} names in '{}'".format(moved, t.name))

    write_doc(doc)
Example 17
def write_notebook(m):
    # Get the EDA notebook file from Github

    url = "https://raw.githubusercontent.com/Metatab/notebook-templates/master/package-notebook.ipynb"

    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()

    p = Path(m.args.new_notebook)

    nb_path = f'notebooks/{p.stem}.ipynb'

    ensure_dir(dirname(nb_path))

    if exists(nb_path):
        err("Notebook {} already exists".format(nb_path))

    with open(nb_path, 'wb') as f:
        f.write(r.content)

    prt('Wrote {}'.format(nb_path))
Example 18
def custom_update(doc, args):
    from metapack.cli.core import prt

    source = doc.reference('source')

    r = iter(source)

    descriptions = next(r)
    header = next(r)

    desc_map = {k: v for k, v in zip(header, descriptions)}

    changes = 0
    for c in doc['Schema'].find('Table.Column'):
        desc = desc_map.get(c.name)

        if desc and c.get_value('Description') != desc:
            prt(f"Setting {c.name} description to: {desc}")
            c['Description'] = desc
            changes += 1

        if c.get_value('Datatype') == 'unknown':
            c['Datatype'] = 'number'
            changes += 1

    if changes:
        prt(f'Writing {changes} changes')
        doc.write()
    else:
        prt('No changes')
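
The desc_map construction above assumes the source reference yields a descriptions row first and a header row second, then zips them together; a small standalone illustration with made-up rows:

    descriptions = ['Year of record', 'Measured value']
    header = ['year', 'value']

    desc_map = dict(zip(header, descriptions))
    # {'year': 'Year of record', 'value': 'Measured value'}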
Example 19
def _exec_build(p, package_root, force, nv_name, extant_url_f, post_f):
    from metapack import MetapackUrl

    if force:
        reason = 'Forcing build'
        should_build = True
    elif p.is_older_than_metadata():
        reason = 'Metadata is younger than package'
        should_build = True
    elif not p.exists():
        reason = "Package doesn't exist"
        should_build = True
    else:
        reason = 'Metadata is older than package'
        should_build = False

    if should_build:
        prt("Building {} package ({})".format(p.type_code, reason))
        url = p.save()
        prt("Package ( type: {} ) saved to: {}".format(p.type_code, url))
        created = True
    else:
        prt("Not building {} package ({})".format(p.type_code, reason))

    if not should_build and p.exists():
        created = False
        url = extant_url_f(p)

    post_f()

    if nv_name:
        p.move_to_nv_name()

    return p, MetapackUrl(url, downloader=package_root.downloader), created
Example 20
def convert_notebook(nb_path):
    prt('Convert notebook to Metatab source package')

    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    c = Config()

    pe = NotebookExecutor(config=c, log=logger)

    prt('Running the notebook')
    output, resources = pe.from_filename(nb_path)

    fw = FilesWriter()
    fw.build_directory = pe.output_dir

    fw.write(output, resources, notebook_name=DEFAULT_METATAB_FILE)

    de = DocumentationExporter(config=c,
                               log=logger,
                               metadata=doc_metadata(pe.doc))

    prt('Exporting documentation')
    output, resources = de.from_filename(nb_path)

    fw.build_directory = join(pe.output_dir, 'docs')
    fw.write(output, resources, notebook_name='notebook')

    new_mt_file = join(pe.output_dir, DEFAULT_METATAB_FILE)

    doc = MetapackDoc(new_mt_file)

    de.update_metatab(doc, resources)

    for lib_dir in pe.lib_dirs:
        lib_dir = normpath(lib_dir).lstrip('./')

        doc['Resources'].new_term("Root.PythonLib", lib_dir)

        path = abspath(lib_dir)
        dest = join(pe.output_dir, lib_dir)

        ensure_dir(dest)
        copytree(path, join(pe.output_dir, lib_dir))

    doc.write_csv()

    # Reset the input to use the new data

    prt('Running with new package file: {}'.format(new_mt_file))
Example 21
def convert_hugo(nb_path, hugo_path):
    from os import environ
    from os.path import abspath

    # Total hack. Would like the -H to be allowed to have no arg, and then use the env var,
    # but I don't know how to do that. This is the case where the user types
    # -H nb_path, so just go with it.
    if hugo_path and not nb_path:
        nb_path = hugo_path
        hugo_path = environ.get('METAPACK_HUGO_DIR')

    if not hugo_path:
        err("Must specify value for -H or the METAPACK_HUGO_DIR environment var"
            )

    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    c = Config()
    c.HugoExporter.hugo_dir = abspath(
        hugo_path)  # Exports assume rel path is rel to notebook
    he = HugoExporter(config=c, log=logger)

    output, resources = he.from_filename(nb_path)

    prt('Writing Notebook to Hugo Markdown')

    prt('    Writing ',
        resources['unique_key'] + resources['output_extension'])
    for k, v in resources['outputs'].items():
        prt('    Writing ', k)

    fw = FilesWriter()
    fw.write(output, resources, notebook_name=resources['unique_key'])
Example 22
def run_init_cmd(m):

    from git.exc import GitCommandError

    g = Github(get_token())

    remote_r = get_or_new_github_repo(g, m)
    local_r = get_or_init_local_repo(m)

    try:
        origin = local_r.remote('origin')
    except (ValueError, GitCommandError) as e:
        print(e)
        origin = local_r.create_remote('origin', remote_r.clone_url)
        #local_r.create_head('master', origin.refs.master)
        local_r.git.push('--set-upstream', 'origin', 'master')

    add_giturl(m.doc, force=True)
    write_doc(m.doc)

    prt(f'Initialized local and remote {origin.refs.master} at {remote_r.clone_url}'
        )
Example 23
def build(m):

    raise NotImplementedError()

    def mp(*args):
        pass

    name = m.doc.name

    lb_file = m.package_root.fspath.joinpath('.last_build')

    if m.args.result:
        prt = print
    else:
        from metapack.cli.core import prt

    if lb_file.exists():
        # Run a test build
        ft_args = ['build', '-FT']
        if m.args.no_cache:
            ft_args = ['-n'] + ft_args
        mp(ft_args, do_cli_init=False)

        tb_path = m.package_root.fspath.joinpath('.trial_build').read_text()
        lb_path = lb_file.read_text()

        tdoc = MetapackDoc(tb_path)
        ldoc = MetapackDoc(lb_path)

        diff_hashes = 0

        for t_r in tdoc.resources():
            l_r = ldoc.resource(t_r.name)

            h1 = t_r.raw_row_generator.hash
            h2 = l_r.raw_row_generator.hash

            if h1 != h2:
                diff_hashes += 1

        if diff_hashes == 0:
            prt(f'👍 {name}: Hashes Unchanged: will not rebuild')
            return

        prt(f'🛠 {name}: Hashes changed. Marked for rebuilding')
        Path(m.mt_file.fspath).touch()

        if m.args.increment:
            m.doc.update_name(mod_version='+')
            m.doc.write()
    else:
        prt(f'🛠 {name}: No previous build')
Example 24
def show_credentials(profile):
    import boto3

    session = boto3.Session(profile_name=profile)

    if profile:
        cred_line = " 'eval $(metasync -C -p {} )'".format(profile)
    else:
        cred_line = " 'eval $(metasync -C)'"

    prt("export AWS_ACCESS_KEY_ID={} ".format(
        session.get_credentials().access_key))
    prt("export AWS_SECRET_ACCESS_KEY={}".format(
        session.get_credentials().secret_key))
    prt("# Run {} to configure credentials in a shell".format(cred_line))
Example 25
def run_url_scrape(args):
    m = MetapackCliMemo(args, downloader)

    from metapack.util import scrape_urls_from_web_page

    doc = m.doc
    url = m.args.url

    doc['resources'].new_term('DownloadPage', url)

    d = scrape_urls_from_web_page(url)

    if d.get('error'):
        err(d.get('error'))

    new_resources = 0
    new_documentation = 0

    if not args.no_resources:
        for k, v in d['sources'].items():
            u = parse_app_url(v['url'])
            t = doc['Resources'].new_term('DataFile',
                                          v['url'],
                                          name=u.fspath.stem,
                                          description=v.get('description'))
            new_resources += 1
            if args.verbose:
                prt(t, t.props)

    if not args.no_docs:
        for k, v in d['external_documentation'].items():
            term_name = classify_url(v['url'])
            u = parse_app_url(v['url'])
            t = doc['Documentation'].new_term(term_name,
                                              v['url'],
                                              name=u.fspath.stem,
                                              description=v.get('description'))
            new_documentation += 1
            if args.verbose:
                prt(t, t.props)

    prt("Added {} resource and {} documentation terms".format(
        new_resources, new_documentation))

    if not args.dry_run:
        write_doc(doc)
Example 26
def _build_cmd(args):
    from rowgenerators.rowpipe.exceptions import TooManyCastingErrors

    downloader.set_callback(build_downloader_callback)

    m = MetapackCliMemo(args, downloader)

    if m.args.profile:
        from metatab.s3 import set_s3_profile
        set_s3_profile(m.args.profile)

    if m.args.clean_cache:
        clean_cache('metapack')

    try:
        changes = metatab_derived_handler(m)
        prt(f"{changes} changes")

    except TooManyCastingErrors as e:
        prt('Casting Errors:')
        for error in e.errors:
            prt(error)
        if m.args.exceptions:
            raise e
        else:
            err(e)
    except Exception as e:
        if m.args.exceptions:
            raise e
        else:
            err(e)

    clean_cache(m.cache)

    return changes
Example 27
def _process_normal_resource(doc, r, force, skip_start=5000, allow_codes=True):
    """Process a resource that requires reading the file; not a metatab resource"""

    from rowgenerators.exceptions import SourceError, SchemaError
    from requests.exceptions import ConnectionError
    from itertools import islice

    from rowgenerators.source import SelectiveRowGenerator
    from tableintuit import TypeIntuiter

    schema_term = r.schema_term

    try:
        if force:
            rg = r.raw_row_generator
        else:
            rg = r.row_generator

    except SchemaError:
        rg = r.raw_row_generator
        warn("Failed to build row processor table, using raw row generator")

    # Take only the first 250K rows, and then skip through them.
    # For 250,000 rows, the routine will analyze about 10K
    slice = skip_iterator(islice(rg, 250000), skip_start=skip_start)

    headers, start, end = r._get_start_end_header()

    si = SelectiveRowGenerator(slice, header_lines=headers, start=start, end=end)

    try:
        ti = TypeIntuiter().run(si)

    except SourceError as e:
        warn("Failed to process resource '{}'; {}".format(r.name, e))
        return
    except ConnectionError as e:
        warn("Failed to download resource '{}'; {}".format(r.name, e))
        return
    except UnicodeDecodeError as e:
        warn("Text encoding error for resource '{}'; {}".format(r.name, e))
        return

    if schema_term:

        prt("Updating table '{}' ".format(r.schema_name))

        # Existing columns
        orig_columns = {e['name'].lower() if e['name'] else '': e for e in r.schema_columns or {}}

        # Remove existing columns, so add them back later, possibly in a new order
        for child in list(schema_term.children):
            schema_term.remove_child(child)

    else:
        prt("Adding table '{}' ".format(r.schema_name))
        schema_term = doc['Schema'].new_term('Table', r.schema_name)
        orig_columns = {}

    for i, c in enumerate(ti.to_rows()):

        raw_alt_name = alt_col_name(c['header'], i)
        alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

        kwargs = {}

        if alt_name:
            kwargs['AltName'] = alt_name

        datatype = type_map.get(c['resolved_type'], c['resolved_type'])

        # If the field has codes, it is probably an integer, with a few
        # strings
        if c['has_codes'] and not allow_codes:
            datatype = 'text' if c['unicode'] else 'string'

        schema_term.new_child('Column', c['header'],
                              datatype=datatype,
                              # description = get_col_value(c['header'].lower(),'description'),
                              has_codes='T' if c['has_codes'] else '',
                              **kwargs)

    update_resource_properties(r, orig_columns=orig_columns, force=force)

    return ti
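
skip_iterator is a metapack helper whose implementation is not shown in this excerpt; a hedged sketch of the sampling idea the comment describes (keep the early rows, then thin out the rest) might look like:

    from itertools import islice

    def sample_rows(rows, skip_start=5000, step=25):
        # Illustrative only, not the actual skip_iterator: pass through the first
        # skip_start rows, then keep every step-th row so the type intuiter
        # sees values spread across the whole slice.
        it = iter(rows)
        yield from islice(it, skip_start)
        for i, row in enumerate(it):
            if i % step == 0:
                yield row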
Example 28
def update_resource_properties(r, orig_columns={}, force=False):
    """Get descriptions and other properties from this, or upstream, packages, and add them to the schema. """

    added = []

    schema_term = r.schema_term

    if not schema_term:
        warn("No schema term for ", r.name)
        return

    prt("Processing schema {}".format(r.name))

    rg = r.row_generator

    # Get columns information from the schema, or, if it is a package reference,
    # from the upstream schema

    upstream_columns = {e['name'].lower() if e['name'] else '': e for e in r.columns() or {}}

    # Just from the local schema
    schema_columns = {e['name'].lower() if e['name'] else '': e for e in r.schema_columns or {}}

    # Ask the generator if it can provide column descriptions and types
    generator_columns = {e['name'].lower() if e['name'] else '': e for e in rg.columns or {}}

    def get_col_value(col_name, value_name):

        v = None

        if not col_name:
            return None

        for d in [generator_columns, upstream_columns, orig_columns, schema_columns]:
            v_ = d.get(col_name.lower(), {}).get(value_name)
            if v_:
                v = v_

        return v

    # Look for new properties
    extra_properties = set()
    for d in [generator_columns, upstream_columns, orig_columns, schema_columns]:
        for k, v in d.items():
            for kk, vv in v.items():
                extra_properties.add(kk)

    # Remove the properties that are already accounted for
    extra_properties = extra_properties - {'pos', 'header', 'name', ''}

    # Add any extra properties, such as from upstream packages, to the schema.

    for ep in extra_properties:
        r.doc['Schema'].add_arg(ep)

    for c in schema_term.find('Table.Column'):

        for ep in extra_properties:
            t = c.get_or_new_child(ep)
            v = get_col_value(c.name, ep)
            if v:
                t.value = v
                added.append((c.name, ep, v))

    prt('Updated schema for {}. Set {} properties'.format(r.name, len(added)))
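
In get_col_value above, the dictionaries are consulted in order and each non-empty hit overwrites the previous one, so the last dictionary in the list (the local schema) wins; a tiny illustration of that precedence:

    generator_columns = {'year': {'description': 'from generator'}}
    schema_columns = {'year': {'description': 'from local schema'}}

    v = None
    for d in [generator_columns, {}, {}, schema_columns]:
        v_ = d.get('year', {}).get('description')
        if v_:
            v = v_
    # v == 'from local schema'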
Example 29
def add_single_resource(doc, ref, cache, seen_names):
    from metatab.util import slugify

    t = doc.find_first('Root.Datafile', value=ref)

    if t:
        prt("Datafile exists for '{}', deleting".format(ref))
        doc.remove_term(t)

    term_name = classify_url(ref)

    path, name = extract_path_name(ref)

    # If the name already exists, try to create a new one.
    # 20 attempts ought to be enough.
    if name in seen_names:
        base_name = re.sub(r'-?\d+$', '', name)

        for i in range(1, 20):
            name = "{}-{}".format(base_name, i)
            if name not in seen_names:
                break

    seen_names.add(name)

    encoding = start_line = None
    header_lines = []

    try:
        encoding, ri = run_row_intuit(path, cache)

        start_line = ri.start_line or None
        header_lines = ri.header_lines
    except RowIntuitError as e:
        warn("Failed to intuit '{}'; {}".format(ref, e))
    except RowGeneratorError as e:
        warn("Can't generate rows for: '{}'; {}".format(ref, e))
        return None
    except SourceError as e:
        warn("Source Error: '{}'; {}".format(ref, e))
        return None

    except Exception as e:
        warn("Error: '{}'; {}".format(ref, e))
        raise

    if not name:
        from hashlib import sha1
        name = sha1(slugify(path).encode('ascii')).hexdigest()[:12]

        # xlrd gets grouchy if the name doesn't start with a char
        try:
            int(name[0])
            name = 'a' + name[1:]
        except Exception:
            pass

    prt("Added {}, url: {} ".format(name, ref))

    return doc['Resources'].new_term(term_name,
                                     ref,
                                     name=name,
                                     startline=start_line,
                                     headerlines=','.join(
                                         str(e) for e in header_lines))
Example 30
def new_cmd(args):
    from metapack import MetapackDoc
    from metapack.util import make_metatab_file, datetime_now, ensure_dir
    from metapack.cli.core import write_doc, prt, err
    from os.path import exists, join, expanduser
    from metatab import DEFAULT_METATAB_FILE
    from os import getenv

    if args.config:
        config_file = args.config
    elif getenv("METAPACK_CONFIG"):
        config_file = getenv("METAPACK_CONFIG")
    elif exists(expanduser('~/.metapack-defaults.csv')):
        config_file = expanduser('~/.metapack-defaults.csv')
    else:
        config_file = None

    if config_file and exists(config_file):
        prt(f"Using defaults file '{config_file}'")
        config = MetapackDoc(config_file)
    else:
        config = MetapackDoc()

    if args.jupyter:
        import tempfile

        with tempfile.NamedTemporaryFile(suffix='.ipynb', delete=False) as fp:

            r = requests.get(TEMPLATE_NOTEBOOK, allow_redirects=True)
            r.raise_for_status()

            fp.write(r.content)
            nb_path = Path(fp.name)

        doc = MetapackDoc(str(nb_path))

    else:

        doc = make_metatab_file(args.template)

    doc['Root']['Created'] = datetime_now()

    origin = args.origin or config.get_value('Root.Origin')

    if not origin:
        err("Must specify a value for origin, either on command line or in defaults file"
            )

    (doc['Root'].find_first('Root.Origin') or et).value = origin
    (doc['Root'].find_first('Root.Dataset') or et).value = args.dataset
    (doc['Root'].find_first('Root.Space')
     or et).value = args.space or config.get_value('Root.Space')
    (doc['Root'].find_first('Root.Time')
     or et).value = args.time or config.get_value('Root.Time')
    (doc['Root'].find_first('Root.Grain')
     or et).value = args.grain or config.get_value('Root.Grain')
    (doc['Root'].find_first('Root.Variant')
     or et).value = args.variant or config.get_value('Root.Variant')

    v = doc['Root'].get_or_new_term('Root.Version')
    v.get_or_new_child(
        'Version.Major'
    ).value = args.revision or config.get_value('Root.Version')
    v.get_or_new_child('Version.Minor').value = 1
    v.get_or_new_child('Version.Patch').value = 1

    # Copy contacts in
    if 'Contacts' in config:
        for c in config['Contacts']:
            doc['Contacts'].add_term(c)

    if args.title:
        doc['Root'].find_first('Root.Title').value = args.title.strip()

    nv_name = doc.as_version(None)

    if args.example:
        doc['Resources'].new_term(
            'Root.Datafile',
            'http://public.source.civicknowledge.com/example.com/sources/random-names.csv',
            name='random_names')

        doc['Documentation'].new_term('Root.Homepage',
                                      'http://metatab.org',
                                      title='Metatab Home Page')

    doc.ensure_identifier()
    doc.update_name(create_term=True)

    if getattr(args, 'jupyter'):  # b/c maybe metatab_jupyter is not installed

        from metapack_jupyter.convert import write_metatab_notebook
        from metapack_jupyter.core import edit_notebook, set_cell_source, get_cell_source

        new_nb_path = Path(f'{nv_name}.ipynb')

        doc['Resources'].new_term(
            'Root.Datafile',
            './' + str(new_nb_path) + "#df",
            name='local_dataframe',
            description='Example of using a local Dataframe')

        if new_nb_path.exists():
            err(f"Directory {nb_path} already exists")

        copyfile(nb_path, new_nb_path)

        write_metatab_notebook(doc, new_nb_path)

        with edit_notebook(new_nb_path) as nb:
            init = get_cell_source(nb, 'init')
            init += f"\nthis_package_name = '{str(new_nb_path.name)}'"
            set_cell_source(nb, 'init', init)

        nb_path.unlink()
    else:

        doc['Documentation'].new_term('Root.Documentation',
                                      'file:README.md',
                                      title='README')

        if exists(nv_name):
            err(f"Directory {nv_name} already exists")

        if args.csv:
            fn = doc.nonver_name + '.csv'
            write_doc(doc, fn)
            prt(f"Writing to {fn}")

        else:
            ensure_dir(nv_name)

            pylib_dir = join(nv_name, 'pylib')
            ensure_dir(pylib_dir)
            with open(join(pylib_dir, '__init__.py'),
                      'w') as f_out, open(pylib.__file__) as f_in:
                f_out.write(f_in.read())

            if args.example:
                doc['Resources'].new_term('Root.Datafile',
                                          'python:pylib#row_generator',
                                          name='row_generator')

            prt(f"Writing to '{nv_name}'")

            write_doc(doc, join(nv_name, DEFAULT_METATAB_FILE))

            add_missing_files(nv_name)

            if args.title:
                readme = '# {}\n'.format(args.title)
            else:
                readme = '# {}\n'.format(doc.get_value('Root.Name'))

            with open(join(nv_name, 'README.md'), 'w') as f:
                f.write(readme)
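
The (doc['Root'].find_first(...) or et).value = ... lines above rely on et being a throwaway placeholder term defined elsewhere in the module (an assumption here; it is not shown in this excerpt), so assignments to missing terms are silently discarded. A minimal illustration of the same trick:

    class _Discard:
        # Stand-in for the module-level et object: accepts and ignores assignment.
        value = None

    et = _Discard()

    found = None                    # pretend find_first() returned nothing
    (found or et).value = 'origin'  # the assignment lands on the throwaway object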