Beispiel #1
0
    def test_line_doc_parts(self):
        """Assemble one document from several line-oriented part files and
        spot-check identifiers, term count and References contents."""

        doc = MetatabDoc(TextRowGenerator("Declare: metatab-latest"))

        part_files = [
            'line/line-oriented-doc-root.txt',
            'line/line-oriented-doc-contacts.txt',
            'line/line-oriented-doc-references-1.txt',
            'line/line-oriented-doc-references-2.txt',
            'line/line-oriented-doc-bib.txt',
        ]

        for part in part_files:
            # Read each part, then parse its terms into the shared document.
            with open(test_data(part)) as fh:
                contents = fh.read()

            parser = TermParser(TextRowGenerator(contents),
                                resolver=doc.resolver,
                                doc=doc)
            doc.load_terms(parser)

        self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                         doc.get_value('Root.Identifier'))
        self.assertEqual(152, len(doc.terms))

        self.assertEqual(5, len(list(doc['References'])))

        self.assertEqual(5, len(list(doc['References'].find('Root.Resource'))))
Beispiel #2
0
    def test_write_line_doc(self):
        """Convert CSV files to text lines and back to text lines.

        Round-trips each document through its line serialization and checks
        that lines, dict form and CSV form are stable.
        """

        # Renamed from ``all`` to avoid shadowing the builtin all().
        source_files = [
            'example1.csv', 'example2.csv', 'example1-web.csv', 'children.csv',
            'children2.csv', 'issue1.csv'
        ]

        self.maxDiff = None

        for f in source_files:

            path = test_data(f)

            doc1 = MetatabDoc(path)

            doc1_lines = doc1.as_lines()

            print(doc1_lines)

            # Re-parse the serialized lines; serializing again must be stable.
            doc2 = MetatabDoc(TextRowGenerator(doc1_lines))

            doc2_lines = doc2.as_lines()

            # The second, duplicated assertEqual(doc1_lines, doc2_lines) that
            # followed compare_dict was removed as redundant.
            self.assertEqual(doc1_lines, doc2_lines)

            self.compare_dict(doc1.as_dict(), doc2.as_dict())

            self.assertEqual(doc1.as_csv(), doc2.as_csv())
Beispiel #3
0
def add_resource(mt_file, ref, cache):
    """Add a Resources entry, downloading and intuiting the file, replacing
    entries with the same reference.

    :param mt_file: Path/URL of a Metatab file, or an already-open MetatabDoc.
    :param ref: Directory to scan for data files, or a single resource URL.
    :param cache: Download cache passed to the content enumerator.
    """
    from metatab.util import enumerate_contents

    # Accept either an already-open document or a reference to one.
    if isinstance(mt_file, MetatabDoc):
        doc = mt_file
    else:
        doc = MetatabDoc(mt_file)

    # Idiomatic membership test (was: `not 'Resources' in doc`).
    if 'Resources' not in doc:
        doc.new_section('Resources')

    # Ensure the section header has the columns we will write; drop any
    # empty arg names.
    doc['Resources'].args = [
        e for e in set(doc['Resources'].args +
                       ['Name', 'StartLine', 'HeaderLines', 'Encoding'])
        if e
    ]

    seen_names = set()

    if isdir(ref):
        # Add every recognized data file under the directory, skipping the
        # metatab file itself and files that already have a Datafile term.
        for f in find_files(ref, DATA_FORMATS):

            if f.endswith(DEFAULT_METATAB_FILE):
                continue

            if doc.find_first('Root.Datafile', value=f):
                prt("Datafile exists for '{}', ignoring".format(f))
            else:
                add_single_resource(doc, f, cache=cache, seen_names=seen_names)
    else:
        # Single reference; it may expand to multiple contents (e.g. a zip).
        for c in enumerate_contents(ref, cache=cache, callback=prt):
            add_single_resource(doc, c.rebuild_url(), cache=cache, seen_names=seen_names)

    write_doc(doc, mt_file)
Beispiel #4
0
    def test_datapackage_declare(self):
        """Round-trip a datapackage declaration through JSON and validate it."""
        import datapackage

        doc = MetatabDoc(test_data('datapackage_ex2.csv'))

        d = doc.as_dict()

        # Write the dict form out so datapackage can read it from a path.
        # (Was an explicit open/close pair that leaked the handle on error.)
        out_path = '/tmp/package.json'
        with open(out_path, 'w') as f:
            f.write(json.dumps(d, indent=4))

        try:
            dp = datapackage.DataPackage(out_path)
            dp.validate()
        except Exception:
            # Was a bare `except:`; still dump the file for debugging,
            # then re-raise so the test fails.
            with open(out_path) as f2:
                print(f2.read())
            raise

        print(out_path)
        # unlink(out_path)

        doc = MetatabDoc(test_data('example1.csv'))

        from metatab.datapackage import convert_to_datapackage

        print(json.dumps(convert_to_datapackage(doc), indent=4))
Beispiel #5
0
    def test_line_doc(self):
        """Parse a single line-oriented document and spot-check its contents."""

        doc = MetatabDoc(TextRowGenerator("Declare: metatab-latest"))

        with open(test_data('line/line-oriented-doc.txt')) as fh:
            contents = fh.read()

        parser = TermParser(TextRowGenerator(contents),
                            resolver=doc.resolver,
                            doc=doc)
        doc.load_terms(parser)

        self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                         doc.get_value('Root.Identifier'))
        self.assertEqual(152, len(doc.terms))

        references = doc['References']

        self.assertEqual(5, len(list(references)))
        self.assertEqual(5, len(list(references.find('Root.Reference'))))
        # References are Resources, so the derived-term search matches too.
        self.assertEqual(5, len(list(references.find('Root.Resource'))))

        first_resource = list(references.find('Root.Resource'))[0]
        print(type(first_resource))
Beispiel #6
0
def update_distributions(m):
    """Add a distribution term for each of the distributions the sync is
    creating, and update the 'Issued' time.

    :param m: CLI memo object carrying parsed args, cache and mt_file.
    :return: (second_stage_file, updated) — the file to use for the next
        stage and whether any distribution was added.
    """

    doc = MetatabDoc(m.mt_file)

    access_value = doc.find_first_value('Root.Access')

    # Only an explicit 'private' access value makes the bucket ACL private.
    acl = 'private' if access_value == 'private' else 'public'

    b = S3Bucket(m.args.s3, acl=acl)

    updated = False

    old_dists = list(doc.find('Root.Distribution'))

    # For each enabled package format, register its S3 access URL as a
    # distribution. `is not False` distinguishes an explicitly disabled
    # argument from other argparse values.
    if m.args.excel is not False:
        p = ExcelPackage(m.mt_file)

        if update_dist(doc, old_dists, b.access_url(p.save_path())):
            prt("Added Excel distribution to metadata")
            updated = True

    if m.args.zip is not False:
        p = ZipPackage(m.mt_file)
        if update_dist(doc, old_dists, b.access_url(p.save_path())):
            prt("Added ZIP distribution to metadata")
            updated = True

    if m.args.fs is not False:
        p = FileSystemPackage(m.mt_file)
        if update_dist(doc, old_dists,
                       b.access_url(p.save_path(), DEFAULT_METATAB_FILE)):
            prt("Added FS distribution to metadata")
            updated = True

    if m.args.csv is not False:
        p = CsvPackage(m.mt_file)
        url = b.access_url(basename(p.save_path()))
        if update_dist(doc, old_dists, url):
            prt("Added CSV distribution to metadata", url)
            updated = True

    # Stamp the issue time regardless of whether anything was added.
    doc['Root']['Issued'] = datetime_now()

    if not write_doc(doc, m.mt_file):
        # The mt_file is probably a URL, so we can't write back to it,
        # but we need the updated distributions, so write it elsewhere, then
        # reload it in the next stage.
        second_stage_file = join(PACKAGE_PREFIX, DEFAULT_METATAB_FILE)

        if not exists(dirname(second_stage_file)):
            makedirs(dirname(second_stage_file))

        assert write_doc(doc, second_stage_file)

    else:
        second_stage_file = m.mt_file

    return second_stage_file, updated
Beispiel #7
0
    def load_doc(self, ref):
        """Set the document from *ref*: a path/URL string is parsed into a
        new MetatabDoc; anything else is stored as-is. Returns self."""

        is_path = isinstance(ref, string_types)
        self._doc = MetatabDoc(ref, cache=self._cache) if is_path else ref

        return self
Beispiel #8
0
    def test_open(self):
        """The identifier is reachable from both the document and its Root section."""

        doc = MetatabDoc(test_data('almost-everything.csv'))

        # assertEqual: assertEquals is a deprecated unittest alias.
        self.assertEqual('9FC11204-B291-4E0E-A841-5372090ADEC0',
                         doc.find_first_value('Root.Identifier'))

        self.assertEqual('9FC11204-B291-4E0E-A841-5372090ADEC0',
                         doc['Root'].find_first_value('Root.Identifier'))
Beispiel #9
0
    def test_new(self):
        """Create a document from the packaged metatab.csv template and cleanse it."""

        import metatab.templates as tmpl

        doc = MetatabDoc(join(dirname(tmpl.__file__), 'metatab.csv'))
        doc.cleanse()

        print(doc.as_csv()[:200])
Beispiel #10
0
    def test_acessors(self):
        """Property dictionaries and attribute/item accessors on a Citation term."""

        doc = MetatabDoc(test_data('properties.csv'))

        c = doc.find_first('Root.Citation', name='ipums')

        # Arg_props does not include Author, Title or Year, which are
        # children, but not arg props.
        # assertEqual: assertEquals is a deprecated unittest alias.
        self.assertEqual(
            sorted([
                'type', 'month', 'publisher', 'journal', 'version', 'volume',
                'number', 'pages', 'accessdate', 'location', 'url', 'doi',
                'issn', 'name'
            ]), sorted(list(c.arg_props.keys())))

        # Props includes just the children that actually have values
        self.assertEqual(
            sorted([
                'type', 'publisher', 'version', 'accessdate', 'url', 'doi',
                'author', 'title', 'year'
            ]), sorted(list(c.props.keys())))

        # All props includes values for all of the children and all of the property args
        self.assertEqual(
            sorted([
                'type', 'month', 'publisher', 'journal', 'version', 'volume',
                'number', 'pages', 'accessdate', 'location', 'url', 'doi',
                'issn', 'name', 'author', 'title', 'year'
            ]), sorted(list(c.all_props.keys())))

        # Attribute accessors
        self.assertEqual('dataset', c.type)
        self.assertEqual('2017', c.year)
        self.assertEqual('Integrated Public Use Microdata Series', c.title)
        self.assertEqual('University of Minnesota', c.publisher)

        # These are properties of Term
        self.assertEqual(c.join, 'root.citation')
        self.assertTrue(c.term_is('Root.Citation'))

        # Item style accessors
        self.assertEqual('dataset', c['type'].value)
        self.assertTrue(c['type'].term_is('Citation.Type'))
        self.assertEqual('2017', c['year'].value)
        self.assertEqual('Integrated Public Use Microdata Series',
                         c['title'].value)
        self.assertEqual('University of Minnesota', c['publisher'].value)
        self.assertTrue(c['publisher'].term_is('Citation.Publisher'))

        # Setting attributes creates/updates the corresponding child term.
        c.foo = 'bar'

        c.type = 'foobar'
        self.assertEqual('foobar', c.type)
        self.assertEqual('foobar', c['type'].value)
Beispiel #11
0
    def init_doc(self):
        """Create or load the Metatab document for this object.

        If a reference was supplied at construction time, load from it;
        otherwise start an empty document declared as 'metatab-latest'.

        :return: the document, via the ``doc`` property.
        """

        if self._ref:
            self.load_doc(self._ref)
        else:
            self._doc = MetatabDoc()

            if not self._doc.find("Root.Declare"):
                # FIXME. Should really have a way to insert this term as the first term.
                # NOTE(review): the check reads self._doc but the new term is
                # added via self.sections.root — confirm both refer to the
                # same underlying document.
                self.sections.root.new_term('Declare', 'metatab-latest')
                self._doc.load_declarations(['metatab-latest'])

        return self.doc
Beispiel #12
0
    def __iter__(self):
        """Iterate over all of the Metatab rows generated from the YAML source.

        Reads the YAML file, builds a MetatabDoc using the declaration it
        names (defaulting to 'metatab-latest'), maps every term into its
        declared section, then yields the assembled document's rows.
        """

        import yaml
        from metatab import MetatabDoc

        with open(self.url.fspath) as f:
            # safe_load: plain yaml.load() requires an explicit Loader in
            # modern PyYAML and would construct arbitrary objects from
            # tagged input.
            d = yaml.safe_load(f)

        decl = d.get('declare', 'metatab-latest')

        doc = MetatabDoc(decl=decl)

        section_names = [
            'root', 'contacts', 'documentation', 'resources', 'references',
            'schema'
        ]

        # Build the term-name -> section map and create each section.
        for section_name in section_names:
            section = doc.decl_sections[section_name]

            for tn in section.get('terms', []):
                self.section_map[tn.lower()] = section_name

            self.sections[section_name] = doc.get_or_new_section(
                section_name, section['args'])

        # Track the most recent term created for each name so children can
        # attach to their parent. (Removed: unused `last_section` variable
        # and a leftover debug print of every term.)
        last_term = {}
        for term_name, value, parent in self.yield_dict(doc, d):

            section = self.sections.get(
                self.section_map.get(term_name) or 'root')

            if parent is None:
                term = section.new_term(term_name, value)
            else:
                parent_term = last_term[parent]
                term = parent_term.new_child(term_name, value)

            last_term[term_name] = term

        yield from doc.rows
Beispiel #13
0
    def test_includes(self):
        """Included files contribute their notes and appear in d['include']."""

        doc = MetatabDoc(test_data('include1.csv'))
        d = doc.as_dict()

        for t in doc['root'].terms:
            print(t)

        print(d)

        # assertEqual: assertEquals is a deprecated unittest alias.
        self.assertEqual(
            ['Include File 1', 'Include File 2', 'Include File 3'], d['note'])

        self.assertTrue(any('include2.csv' in e for e in d['include']))
        self.assertTrue(any('include3.csv' in e for e in d['include']))
Beispiel #14
0
    def x_test_metatab_line(self):
        """Disabled test: build a doc from a text file, process schemas, and
        list the columns of the 'resource' resource."""
        from metatab.generate import TextRowGenerator
        from metatab.cli.core import process_schemas
        from metatab import MetatabDoc

        cli_init()

        generator = TextRowGenerator(test_data('simple-text.txt'),
                                     'simple-text.txt')
        doc = MetatabDoc(generator)

        process_schemas(doc)

        resource = doc.resource('resource')

        for column in resource.columns():
            print(column)
Beispiel #15
0
        def errs(fn):
            """Parse *fn*, expecting an IncludeError, and return the parser's
            accumulated errors as a dict."""
            with self.assertRaises(IncludeError):
                parser = TermParser(fn, resolver=WebResolver, doc=MetatabDoc())
                list(parser)

            return parser.errors_as_dict()
Beispiel #16
0
def make_metatab_file(template='metatab'):
    """Build a MetatabDoc from one of the packaged CSV templates.

    :param template: Base name of the template file, without the '.csv' suffix.
    :return: A MetatabDoc loaded from the template.
    """
    import metatab.templates as tmpl

    template_path = join(dirname(tmpl.__file__), '{}.csv'.format(template))

    return MetatabDoc(template_path)
Beispiel #17
0
    def test_children(self):
        """Each 'parent' entry in the dict form carries its child properties."""

        doc = MetatabDoc(test_data('children.csv'))

        for t in doc.terms:
            print(t)

        import json
        print(json.dumps(doc.as_dict(), indent=4))

        # Hoisted the invariant expected dict out of the loop.
        expected = {
            'prop1': 'prop1',
            'prop2': 'prop2',
            '@value': 'parent'
        }

        for t in doc.as_dict()['parent']:
            # assertEqual: assertEquals is a deprecated unittest alias.
            self.assertEqual(expected, t)
Beispiel #18
0
def metatab_admin_handler(m):
    """Handle administrative CLI subcommands: enumerate, html, markdown,
    clean-cache and name. *m* is the parsed CLI memo object.
    """
    if m.args.enumerate:

        from metatab.util import enumerate_contents

        # List everything the enumerator finds at the given URL/path.
        specs = list(enumerate_contents(m.args.enumerate, m.cache, callback=prt))

        for s in specs:
            prt(classify_url(s.url), s.target_format, s.url, s.target_segment)

    if m.args.html:
        from metatab.html import html
        doc = MetatabDoc(m.mt_file)

        # print(doc.html)
        prt(html(doc))

    if m.args.markdown:
        from metatab.html import markdown

        doc = MetatabDoc(m.mt_file)
        prt(markdown(doc))

    if m.args.clean_cache:
        clean_cache('metapack')

    if m.args.name:
        doc = MetatabDoc(m.mt_file)
        prt(doc.find_first_value("Root.Name"))
        # --name short-circuits: print the name and terminate the process.
        exit(0)
Beispiel #19
0
    def test_new_parser(self):
        """Parse a short document and dump its terms and declared terms."""

        parsed = MetatabDoc(test_data('short.csv'))

        for term in parsed.terms:
            print(term)

        import json
        print(json.dumps(parsed.decl_terms, indent=4))
Beispiel #20
0
    def test_datapackage_convert(self):
        """Convert a Metatab document to a datapackage dict and validate it."""
        import datapackage
        from metatab.datapackage import convert_to_datapackage

        converted = convert_to_datapackage(
            MetatabDoc(test_data('example1.csv')))

        print(json.dumps(converted, indent=4))

        datapackage.DataPackage(converted).validate()
Beispiel #21
0
def metatab_derived_handler(m, skip_if_exists=None):
    """Create local Zip, Excel and Filesystem packages

    :param m: CLI memo object with parsed args, cache and mt_file.
    :param skip_if_exists: Skip building packages that already exist;
        forced to False when --force is given.
    :return: List of (format, url, created) tuples for each package built.
    """
    from metatab.package import PackageError

    create_list = []
    url = None

    doc = MetatabDoc(m.mt_file)

    env = get_lib_module_dict(doc)

    # `is not False` distinguishes an explicitly disabled argument from
    # other argparse values.
    if (m.args.excel is not False or m.args.zip is not False or
            (hasattr(m.args, 'filesystem') and m.args.filesystem is not False) ):
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    if m.args.force:
        skip_if_exists = False

    try:

        # Always create a filesystem package before ZIP or Excel, so we can use it as a source for
        # data for the other packages. This means that Transform processes and programs only need
        # to be run once.
        # NOTE(review): this condition uses truthiness while the guard above
        # uses `is not False` — confirm the difference is intentional.
        if any([m.args.filesystem, m.args.excel, m.args.zip]):

            _, url, created = make_filesystem_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('fs', url, created))

            # Subsequent packages build from the filesystem package's URL.
            m.mt_file = url

            env = {}  # Don't need it anymore, since no more programs will be run.

        if m.args.excel is not False:
            _, url, created = make_excel_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('xlsx', url, created))

        if m.args.zip is not False:
            _, url, created = make_zip_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('zip', url, created))

        if m.args.csv is not False:
            _, url, created = make_csv_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('csv', url, created))

    except PackageError as e:
        err("Failed to generate package: {}".format(e))

    return create_list
Beispiel #22
0
def get_metatab_doc(nb_path):
    """Read a notebook and extract the metatab document. Only returns the
    first document; returns None when no %%metatab cell exists.

    :param nb_path: Path to a Jupyter notebook file.
    """

    from metatab.generate import CsvDataRowGenerator
    from metatab.rowgenerators import TextRowGenerator
    from metatab import MetatabDoc

    with open(nb_path) as f:
        nb = nbformat.reads(f.read(), as_version=4)

    # Scan cells in order; the first %%metatab cell wins.
    for cell in nb.cells:
        body = ''.join(cell['source']).strip()
        if not body.startswith('%%metatab'):
            continue
        return MetatabDoc(TextRowGenerator(body))
Beispiel #23
0
    def test_parse_everything(self):
        """Round-trip every example document against its stored JSON fixture,
        writing the fixture on first run."""

        import json

        # Renamed from ``all`` to avoid shadowing the builtin all().
        doc_files = [
            'example1.csv', 'example2.csv', 'example1-web.csv', 'include1.csv',
            'include2.csv', 'include3.csv', 'children.csv', 'children2.csv',
            'issue1.csv'
        ]

        # These are currently broken -- as_dict doesn't work properly with the
        # datapackage-latest decl -- so they are excluded from the loop below.
        datapackages = [
            'datapackage_ex1.csv', 'datapackage_ex1_web.csv',
            'datapackage_ex2.csv'
        ]

        for fn in doc_files:

            print('Testing ', fn)

            path = test_data(fn)

            json_path = test_data('json', fn.replace('.csv', '.json'))

            doc = MetatabDoc(path)
            d = doc.as_dict()

            # First run: create the fixture instead of comparing against it.
            if not exists(json_path):
                with open(json_path, 'w') as f:
                    print("Writing", json_path)
                    json.dump(d, f, indent=4)

            with open(json_path) as f:
                d2 = json.load(f)

            self.compare_dict(d, d2)
Beispiel #24
0
    def test_versions(self):
        """Exercise as_version() with increments, decrements, literals and None."""

        doc = MetatabDoc(test_data('example1.csv'))

        self.assertEqual('201404', doc.find_first_value('Root.Version'))

        cases = [
            ('+5', 'example.com-voters-2002_2014-ca-county-201409'),
            ('-5', 'example.com-voters-2002_2014-ca-county-201399'),
            ('foobar', 'example.com-voters-2002_2014-ca-county-foobar'),
            (None, 'example.com-voters-2002_2014-ca-county'),
        ]

        for version, expected in cases:
            self.assertEqual(expected, doc.as_version(version))
Beispiel #25
0
def metatab_query_handler(m):
    """Handle the CLI query subcommands: dump a named resource, or the
    head (first 20 rows) of resources. *m* is the parsed CLI memo object.
    """
    if m.args.resource or m.args.head:

        # --head limits output to the first 20 rows; otherwise dump all.
        limit = 20 if m.args.head else None

        try:
            doc = MetatabDoc(m.mt_file, cache=m.cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return

        # NOTE(review): this reads m.resource while the guard above checks
        # m.args.resource — confirm both attributes exist and stay in sync.
        if m.resource:
            dump_resource(doc, m.resource, limit)
        else:
            dump_resources(doc)
Beispiel #26
0
def process_schemas(mt_file, cache, clean=False):
    """Intuit a schema (Table/Column terms) for each resource in a document.

    :param mt_file: Path/URL of the Metatab file to process and rewrite.
    :param cache: Download cache; not referenced in this body.
    :param clean: When True, clear the existing Schema section first.
    """
    from rowgenerators import SourceError
    from requests.exceptions import ConnectionError

    doc = MetatabDoc(mt_file)

    try:
        if clean:
            doc['Schema'].clean()
        else:
            doc['Schema']  # probe for existence only; KeyError -> create below

    except KeyError:
        doc.new_section('Schema', ['DataType', 'Altname', 'Description'])

    for r in doc.resources():

        # A resource may name an explicit schema; default to the resource name.
        schema_name = r.get_value('schema', r.get_value('name'))

        schema_term = doc.find_first(term='Table', value=schema_name, section='Schema')

        if schema_term:
            prt("Found table for '{}'; skipping".format(schema_name))
            continue

        path, name = extract_path_name(r.url)

        prt("Processing {}".format(r.url))

        # Sample at most 100 rows, honoring the resource's declared
        # header lines and start line.
        si = SelectiveRowGenerator(islice(r.row_generator, 100),
                                   headers=[int(i) for i in r.get_value('headerlines', '0').split(',')],
                                   start=int(r.get_value('startline', 1)))

        try:
            ti = TypeIntuiter().run(si)
        except SourceError as e:
            warn("Failed to process '{}'; {}".format(path, e))
            continue
        except ConnectionError as e:
            warn("Failed to download '{}'; {}".format(path, e))
            continue

        table = doc['Schema'].new_term('Table', schema_name)

        prt("Adding table '{}' ".format(schema_name))

        # One Column child per intuited column; record a sanitized alternate
        # name only when it differs from the original header.
        for i, c in enumerate(ti.to_rows()):
            raw_alt_name = alt_col_name(c['header'], i)
            alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

            table.new_child('Column', c['header'],
                            datatype=type_map.get(c['resolved_type'], c['resolved_type']),
                            altname=alt_name)

    write_doc(doc, mt_file)
Beispiel #27
0
    def test_update_name(self):
        """update_name() builds the name from Root.Dataset parts and reports
        when Root.Dataset is missing."""

        for fn in ('name.csv', 'name2.csv'):

            doc = MetatabDoc(test_data(fn))

            updates = doc.update_name()

            name = doc.find_first_value("Root.Name")

            # assertEqual: assertEquals is a deprecated unittest alias.
            self.assertEqual('example.com-foobar-2017-ca-people-1', name)
            self.assertEqual(['Changed Name'], updates)

            # Remove the dataset term; in some layouts it lives as a child
            # of Root.Name rather than as a removable top-level term.
            try:
                doc.remove_term(doc.find_first('Root.Dataset'))
            except ValueError:
                nv = doc.find_first('Root.Name')
                nv.remove_child(nv.find_first('Name.Dataset'))

            updates = doc.update_name()

            self.assertIn("No Root.Dataset, so can't update the name", updates)
Beispiel #28
0
def metaworld():
    """CLI entry point: publish a Metatab package to Data.World.

    Parses command-line arguments, resolves the package metadata URL,
    opens the document, then either prints package info (-i/--info) or
    sends the package to Data.World. Always exits the process.
    """
    import argparse
    parser = argparse.ArgumentParser(
        prog='metakan',
        description='Publish packages to Data.World, version {}'.format(
            _meta.__version__))

    parser.add_argument('-i',
                        '--info',
                        default=False,
                        action='store_true',
                        help="Show package information")

    parser.add_argument('metatabfile',
                        nargs='?',
                        default=DEFAULT_METATAB_FILE,
                        help='Path to a Metatab file')

    class MetapackCliMemo(object):
        # Bundles the parsed args with derived paths and URLs for handlers.
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            # Fall back to DEFAULT_METATAB_FILE in the current directory.
            self.mtfile_arg = args.metatabfile if args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            # The URL fragment selects a specific resource, if any.
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

    m = MetapackCliMemo(parser.parse_args(sys.argv[1:]))

    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        # NOTE(review): if err() returns instead of terminating, 'doc' below
        # would be unbound — confirm err() exits the process.
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))

    if m.args.info:
        package_info(doc)
    else:
        send_to_dw(doc)

    exit(0)
Beispiel #29
0
    def test_sections(self):
        """Section listing, deletion, and iteration over a document."""

        doc = MetatabDoc(test_data('example1.csv'))

        self.assertEqual(
            ['root', u'resources', u'contacts', u'notes', u'schema'],
            list(doc.sections.keys()))

        # Deleting a section removes it from the key listing.
        del doc['Resources']

        self.assertEqual(['root', u'contacts', u'notes', u'schema'],
                         list(doc.sections.keys()))

        notes = list(doc['notes'])

        # assertEqual: assertEquals is a deprecated unittest alias.
        self.assertEqual(2, len(notes))

        for sname, s in doc.sections.items():
            print(sname, s.value)
Beispiel #30
0
    def test_find(self):
        """find(), find_first() and derived-term lookups over example documents."""

        doc = MetatabDoc(test_data('example1.csv'))

        # assertEqual: assertEquals is a deprecated unittest alias.
        self.assertEqual('cdph.ca.gov-hci-registered_voters-county',
                         doc.find_first('Root.Identifier').value)

        doc = MetatabDoc(test_data('resources.csv'))

        # All of these term types derive from root.resource.
        self.assertEqual(
            {
                'root.downloadpage', 'root.supplementarydata', 'root.api',
                'root.citation', 'root.datafile', 'root.datadictionary',
                'root.image', 'root.reference', 'root.documentation',
                'root.homepage', 'root.webpage', 'root.sql', 'root.dsn'
            }, doc.derived_terms['root.resource'])

        self.assertEqual([
            'example1', 'example10', 'example2', 'example3', 'example4',
            'example5', 'example6', 'example7', 'example8', 'example9'
        ], sorted([t.name for t in doc.find('root.resource')]))

        self.assertEqual(['example1', 'example2'],
                         [t.name for t in doc.find('root.datafile')])