Example #1
def run_colmap_new(args):
    import csv
    from pathlib import Path

    # These imports assume the metapack CLI layout used in Examples #2-#3;
    # downloader and get_resources() are helpers from the surrounding module.
    from metapack.cli.core import MetapackCliMemo, err, prt
    from metapack_build.core import alt_col_name

    m = MetapackCliMemo(args, downloader)

    resources = get_resources(m)

    if not resources:
        err(f"No resources found with colmap name '{m.args.colmap_name}'")

    # Collect the headers from every resource, accumulating the union of
    # their normalized names in col_index.
    col_index = []
    headers = []

    for r in resources:
        h = r.headers

        col_index += [
            alt_col_name(c) for c in h if alt_col_name(c) not in col_index
        ]
        headers.append(h)

    # For each resource, build a row the same length as col_index, placing
    # each source column name at the position of its normalized name.
    data = [col_index]

    for header in headers:
        new_row = [None] * len(col_index)
        for c in header:
            new_row[col_index.index(alt_col_name(c))] = c

        data.append(new_row)

    t = [['index'] + [r.name for r in resources]] + list(
        zip(*data))  # zip transposes rows into columns.

    path = Path(f"colmap-{m.args.colmap_name}.csv")

    if m.args.print:
        from tabulate import tabulate
        prt(tabulate(t[1:], headers=t[0]))
    else:
        if path.exists() and not m.args.force:
            err(f"Col map file '{str(path)}' already exists. Use -f to overwrite"
                )

        else:
            with path.open('w') as f:
                csv.writer(f).writerows(t)
            prt(f"Wrote {str(path)}")
Example #2
def process_schema(doc, resource, df):
    """Add schema entiries to a metatab doc from a dataframe"""
    from rowgenerators import SourceError
    from requests.exceptions import ConnectionError

    from metapack.cli.core import extract_path_name, type_map
    from metapack_build.core import alt_col_name
    from tableintuit import TypeIntuiter
    from rowgenerators.generator.python import PandasDataframeSource
    from appurl import parse_app_url

    try:
        doc['Schema']
    except KeyError:
        doc.new_section('Schema', ['DataType', 'Altname', 'Description'])

    schema_name = resource.get_value('schema', resource.get_value('name'))

    schema_term = doc.find_first(term='Table',
                                 value=schema_name,
                                 section='Schema')

    if schema_term:
        logger.info("Found table for '{}'; skipping".format(schema_name))
        return

    path, name = extract_path_name(resource.url)

    logger.info("Processing {}".format(resource.url))

    si = PandasDataframeSource(
        parse_app_url(resource.url),
        df,
        cache=doc._cache,
    )

    try:
        ti = TypeIntuiter().run(si)
    except SourceError as e:
        logger.warn("Failed to process '{}'; {}".format(path, e))
        return
    except ConnectionError as e:
        logger.warn("Failed to download '{}'; {}".format(path, e))
        return

    table = doc['Schema'].new_term('Table', schema_name)

    logger.info("Adding table '{}' to metatab schema".format(schema_name))

    for i, c in enumerate(ti.to_rows()):
        raw_alt_name = alt_col_name(c['header'], i)
        alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

        t = table.new_child(
            'Column', c['header'],
            datatype=type_map.get(c['resolved_type'], c['resolved_type']),
            altname=alt_name,
            description=(df[c['header']].description
                         if hasattr(df, 'description') and df[c['header']].description
                         else '')
        )

    return table
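The altname convention used above (and again in Example #3) stores an alternate name only when normalization actually changes the header. A self-contained sketch, reusing the normalizer defined inline in Example #4:

import re

def alt_col_name(name, i):
    # Same normalization rule as the inline definition in Example #4.
    if not name:
        return 'col{}'.format(i)
    return re.sub('_+', '_', re.sub(r'[^\w_]', '_', str(name)).lower()).rstrip('_')

for i, header in enumerate(['Median Income ($)', 'county', '']):
    raw = alt_col_name(header, i)
    alt = raw if raw != header else ''
    print(repr(header), '->', repr(alt))
# 'Median Income ($)' -> 'median_income'
# 'county'            -> ''       (already normalized, no altname needed)
# ''                  -> 'col2'   (empty headers get a positional name)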
Example #3
def add_dataframe(df, name, pkg=None, description=''):
    """Add a dataframe to a source package. Pass in either the name of the dataframe, or the dataframe.
    If the dataframeis passed it, the name will be the dataframe's variable name. The function
    will re-write the source package with the new resource.

    """
    from warnings import warn
    from metapack.cli.core import type_map
    from metapack_build.core import alt_col_name
    import numpy as np

    if name is None or df is None:
        warn("Did not find dataframe for reference '{}'".format(name))
        return

    pkg = pkg or open_source_package()

    resource_ref = 'file:' + get_notebook_rel_path(pkg) + '#' + name

    t = pkg.find_first('Root.Datafile', value=resource_ref)
    col_props = {}

    if t:
        print("Datafile exists for url '{}', deleting".format(resource_ref))

        if t.schema_term:
            col_props = {c['name']: c for c in t.columns()}
            pkg.remove_term(t.schema_term)

        pkg.remove_term(t)

    t = pkg['Resources'].new_term('Root.Datafile', resource_ref, name=name, description=description)

    st = pkg['Schema'].new_term('Table', t.schema_name, description=description)

    for i, name in enumerate(df.columns):

        props = col_props.get(name,{})

        try:
            # np.asscalar() was removed in NumPy 1.23; .item() is the replacement.
            native_type = type(df[name].dtype.type(0).item()).__name__
        except ValueError:
            native_type = df[name].dtype.name
        except AttributeError:
            native_type = type(df[name][0]).__name__

        for pn in 'datatype name pos header'.split():
            if pn in props:
                del props[pn]

        if 'altname' in props:
            altname = props['altname']
            del props['altname']
        else:
            raw_alt_name = alt_col_name(name, i)
            altname = raw_alt_name if raw_alt_name != name else ''

        col = df[name]

        if hasattr(col, 'description'):  # custom property
            props['description'] = col.description

        t = st.new_child('Column', name,
                         datatype=type_map.get(native_type, native_type),
                         altname=altname,
                         **props)


    pkg.write_csv()
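The nested try/except above probes a column's dtype down to a plain Python type name, which is then looked up in type_map. A standalone sketch of what it yields (the DataFrame contents are made up):

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [1.5, 2.5], 'c': ['x', 'y']})

for col_name in df.columns:
    try:
        # Build a zero of the column's NumPy type and unwrap it to a
        # native Python scalar; .item() replaces the removed np.asscalar.
        native_type = type(df[col_name].dtype.type(0).item()).__name__
    except ValueError:
        native_type = df[col_name].dtype.name
    except AttributeError:
        # Object columns: fall back to the type of the first value.
        native_type = type(df[col_name][0]).__name__
    print(col_name, native_type)
# a int, b float, c str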
Example #4
def rebuild_schema(doc, r, df):
    """Rebuild the schema for a resource based on a dataframe"""
    import numpy as np

    # Re-get the resource in the doc, since it may be different.
    try:
        r = doc.resource(r.name)
    except AttributeError:
        # Maybe r is actually a resource name
        r = doc.resource(r)

    def alt_col_name(name, i):
        import re

        if not name:
            return 'col{}'.format(i)

        return re.sub('_+', '_', re.sub(r'[^\w_]', '_', str(name)).lower()).rstrip('_')

    df_types = {
        np.dtype('O'): 'text',
        np.dtype('int64'): 'integer',
        np.dtype('float64'): 'number'
    }

    try:
        df_index_frame = df.index.to_frame()
    except AttributeError:
        df_index_frame = None

    def get_col_dtype(c):

        c = str(c)

        try:
            return df_types[df[c].dtype]
        except KeyError:
            # Maybe it is in the index?
            pass

        try:
            return df_types[df_index_frame[c].dtype]
        except TypeError:
            # Maybe not a multi-index
            pass

        if c == 'id' or c == df.index.name:
            return df_types[df.index.dtype]

        return 'unknown'

    columns = []
    schema_term = r.schema_term[0]

    if schema_term:

        old_cols = {c['name'].value: c.properties for c in schema_term.children}

        # Iterate over a copy: removing children from the live list while
        # iterating it would skip every other child.
        for c in list(schema_term.children):
            schema_term.remove_child(c)

        schema_term.children = []

    else:
        old_cols = {}
        schema_term = doc['Schema'].new_term('Table', r.schema_name)

    index_names = [n if n else "id" for n in df.index.names]

    for i, col in enumerate(index_names + list(df.columns)):
        raw_acn = alt_col_name(col, i)
        acn = raw_acn if raw_acn != str(col) else ''

        d = {'name': col, 'datatype': get_col_dtype(col), 'altname': acn}

        if col in old_cols:
            lookup_name = col
        elif acn in old_cols:
            lookup_name = acn
        else:
            lookup_name = None

        if lookup_name:
            # Carry over the old column's properties, except its name,
            # skipping empty values.
            old_col = old_cols[lookup_name]

            for k, v in old_col.items():
                if k != 'name' and v:
                    d[k] = v

        columns.append(d)

    for c in columns:
        name = c.pop('name')
        datatype = c.pop('datatype')
        altname = c.pop('altname')

        schema_term.new_child('Column', name, datatype=datatype, altname=altname, **c)
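The df_types table above covers the three dtypes pandas infers most often; anything else falls through to 'unknown'. A quick check (contents are made up; integer columns default to int64 on most platforms):

import numpy as np
import pandas as pd

df_types = {
    np.dtype('O'): 'text',
    np.dtype('int64'): 'integer',
    np.dtype('float64'): 'number',
}

df = pd.DataFrame({'name': ['a'], 'count': [1], 'rate': [0.5]})
for c in df.columns:
    print(c, df_types.get(df[c].dtype, 'unknown'))
# name text, count integer, rate number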
Example #5
def convert_col(v):
    # The replacement of '_' may be necessary for some datasets
    # to ensure that similar columns from different datasets are aligned.
    return alt_col_name(v, 0)  # .replace('_', '')
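A quick check of the alignment the comment describes: header variants from different datasets collapse to the same key. The normalizer is copied from Example #4 so the sketch runs on its own:

import re

def alt_col_name(name, i):
    # Normalizer as defined inline in Example #4.
    if not name:
        return 'col{}'.format(i)
    return re.sub('_+', '_', re.sub(r'[^\w_]', '_', str(name)).lower()).rstrip('_')

def convert_col(v):
    return alt_col_name(v, 0)

for v in ['Zip Code', 'zip_code', 'ZIP CODE ']:
    print(convert_col(v))   # each variant prints 'zip_code'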