Ejemplo n.º 1
0
    def test_line_doc_parts(self):
        """Build one document from several partial line-oriented files and
        check the identifier, the term count and the References section."""

        doc = MetatabDoc(TextRowGenerator("Declare: metatab-latest"))

        part_files = (
            'line/line-oriented-doc-root.txt',
            'line/line-oriented-doc-contacts.txt',
            'line/line-oriented-doc-references-1.txt',
            'line/line-oriented-doc-references-2.txt',
            'line/line-oriented-doc-bib.txt',
        )

        for part_file in part_files:
            with open(test_data(part_file)) as f:
                part_text = f.read()

            # Every part is parsed with the accumulating document's own
            # resolver and loaded into that same document.
            parser = TermParser(TextRowGenerator(part_text),
                                resolver=doc.resolver,
                                doc=doc)
            doc.load_terms(parser)

        self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                         doc.get_value('Root.Identifier'))
        self.assertEqual(152, len(doc.terms))

        references = doc['References']
        self.assertEqual(5, len(list(references)))
        self.assertEqual(5, len(list(references.find('Root.Resource'))))
Ejemplo n.º 2
0
    def test_line_doc(self):
        """Load the complete line-oriented document in one pass and check the
        identifier, the term count and the References section."""

        doc = MetatabDoc(TextRowGenerator("Declare: metatab-latest"))

        with open(test_data('line/line-oriented-doc.txt')) as f:
            text = f.read()

        parser = TermParser(TextRowGenerator(text),
                            resolver=doc.resolver,
                            doc=doc)
        doc.load_terms(parser)

        self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                         doc.get_value('Root.Identifier'))
        self.assertEqual(152, len(doc.terms))

        references = doc['References']
        self.assertEqual(5, len(list(references)))
        self.assertEqual(5, len(list(references.find('Root.Reference'))))

        # References are Resources, so the generic term finds the same entries.
        as_resources = list(references.find('Root.Resource'))
        self.assertEqual(5, len(as_resources))

        rt = as_resources[0]

        print(type(rt))
Ejemplo n.º 3
0
    def test_line_doc_parts(self):
        """Build a Metapack document from partial line-oriented files and check
        the References and Resources sections, including the term classes."""

        doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest"))

        part_files = (
            'line/line-oriented-doc-root.txt',
            'line/line-oriented-doc-contacts.txt',
            'line/line-oriented-doc-datafiles.txt',
            'line/line-oriented-doc-references-1.txt',
            'line/line-oriented-doc-references-2.txt',
            'line/line-oriented-doc-bib.txt',
        )

        for part_file in part_files:
            with open(test_data(part_file)) as f:
                part_text = f.read()

            parser = TermParser(TextRowGenerator(part_text),
                                resolver=doc.resolver,
                                doc=doc)
            doc.load_terms(parser)

        self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                         doc.get_value('Root.Identifier'))
        self.assertEqual(157, len(doc.terms))

        # (section, section-specific term, class expected for its first entry).
        # Both the References and the Datafiles are also found through the
        # generic 'Root.Resource' term.
        expectations = (
            ('References', 'Root.Reference', Reference),
            ('Resources', 'Root.Datafile', Resource),
        )

        for section_name, term_name, term_class in expectations:
            section = doc[section_name]

            self.assertEqual(5, len(list(section)))
            self.assertEqual(5, len(list(section.find(term_name))))

            as_resources = list(section.find('Root.Resource'))
            self.assertEqual(5, len(as_resources))

            self.assertIsInstance(as_resources[0], term_class)

        doc._repr_html_()  # Check no exceptions
Ejemplo n.º 4
0
    def preprocess_cell(self, cell, resources, index):
        """Collect Metatab terms from one notebook cell.

        Three kinds of cell are handled:

        * A cell whose source starts with ``%%metatab``: the magic line is
          removed and the remainder is parsed as line-oriented Metatab and
          loaded into ``self.doc``.
        * A markdown cell tagged ``Title`` or ``Description``: its text is
          queued on ``self.extra_terms`` as a ``(section, term, value)`` tuple.
        * Any other cell is handed to the parent class's ``preprocess_cell``.

        :param cell: the notebook cell, read as a mapping
        :param resources: returned unchanged, or as returned by the parent
        :param index: cell position, forwarded to the parent implementation
        :return: the ``(cell, resources)`` pair
        """
        import re

        if not self.extra_terms:
            self.extra_terms = []

        source = cell['source']

        if source.startswith('%%metatab'):
            # Imported here so cells that carry no Metatab text do not need
            # the row generator at all.
            from metatab.rowgenerators import TextRowGenerator

            tp = TermParser(TextRowGenerator(re.sub(r'\%\%metatab.*\n', '', source)),
                            resolver=self.doc.resolver, doc=self.doc)

            self.doc.load_terms(tp)

        elif cell['cell_type'] == 'markdown':

            tags = cell['metadata'].get('tags', [])

            if 'Title' in tags:
                # Drop only the leading heading markers and the whitespace
                # around them; a '#' inside the title itself is kept.
                title = source.strip().lstrip('#').strip()
                self.extra_terms.append(('Root', 'Root.Title', title))

            elif 'Description' in tags:
                self.extra_terms.append(('Root', 'Root.Description', source.strip()))

        else:
            cell, resources = super().preprocess_cell(cell, resources, index)

        return cell, resources
Ejemplo n.º 5
0
    def test_write_line_doc(self):
        """Convert CSV files to text lines and back to text lines.

        For each sample file, the document is rendered as lines, re-parsed from
        those lines, and the two documents are compared as lines, as dicts and
        as CSV.
        """

        # Not named ``all``: that would shadow the builtin inside this method.
        csv_files = [
            'example1.csv', 'example2.csv', 'example1-web.csv', 'children.csv',
            'children2.csv', 'issue1.csv'
        ]

        # Show complete diffs when a comparison fails.
        self.maxDiff = None

        for csv_file in csv_files:

            doc1 = MetatabDoc(test_data(csv_file))

            doc1_lines = doc1.as_lines()

            print(doc1_lines)

            doc2 = MetatabDoc(TextRowGenerator(doc1_lines))

            self.assertEqual(doc1_lines, doc2.as_lines())

            self.compare_dict(doc1.as_dict(), doc2.as_dict())

            self.assertEqual(doc1.as_csv(), doc2.as_csv())
Ejemplo n.º 6
0
    def mt_doc(self):
        """Return the current metatab document from the shell's user namespace.

        If no document is stored under ``MT_DOC_VAR`` yet, an empty one is
        created with a package URL pointing at the current working directory,
        and given ``Resources`` and ``References`` sections.

        :return: the document stored in ``self.shell.user_ns[MT_DOC_VAR]``
        """

        if MT_DOC_VAR not in self.shell.user_ns:

            package_url = MetapackPackageUrl(
                "metapack+file:" + os.getcwd() + '/',
                downloader=Downloader.get_instance())

            inline_doc = MetapackDoc(
                TextRowGenerator("Declare: metatab-latest\n"),
                package_url=package_url)

            self.shell.user_ns[MT_DOC_VAR] = inline_doc

            # Each section is checked under its own name. Previously the
            # second check tested 'Resources' again, so 'References' was
            # never created.
            for section_name in ('Resources', 'References'):
                if section_name not in inline_doc:
                    inline_doc.new_section(section_name, ['Name', 'Description'])

            # Give all of the sections their standard args, to make the CSV versions of the doc
            # prettier
            for name, s in inline_doc.sections.items():
                try:
                    s.args = inline_doc.decl_sections[name.lower()]['args']
                except KeyError:
                    # No declared args for this section; keep what it has.
                    pass

        return self.shell.user_ns[MT_DOC_VAR]
Ejemplo n.º 7
0
    def add_term_lines(self, text):
        """Parse ``text`` as line-oriented Metatab and load the resulting terms
        into ``self.mt_doc``."""

        # Sanity check that the 'root.reference' term class is registered.
        assert 'root.reference' in TermParser.term_classes

        target_doc = self.mt_doc

        parser = TermParser(TextRowGenerator(text),
                            resolver=target_doc.resolver,
                            doc=target_doc)

        target_doc.load_terms(parser)
Ejemplo n.º 8
0
    def run(self, nb):
        """Create a fresh document for ``self.package_url``, preprocess the
        notebook ``nb`` into it, add the queued extra terms and return it."""

        assert str(self.package_url)

        declaration = TextRowGenerator("Declare: metatab-latest\n")
        self.doc = MetapackDoc(declaration,
                               package_url=parse_app_url(self.package_url))

        self.preprocess(nb, {})

        # Each queued entry is a (section, term, value) triple.
        for section_name, term_name, term_value in self.extra_terms:
            self.doc[section_name].get_or_new_term(term_name, term_value)

        return self.doc
Ejemplo n.º 9
0
def get_metatab_doc(nb_path):
    """Read a notebook and extract the metatab document. Only returns the first document.

    :param nb_path: path of the notebook file to read
    :return: a ``MetatabDoc`` built from the first cell whose source starts
        with ``%%metatab``, or ``None`` when the notebook has no such cell
    """

    from metatab.rowgenerators import TextRowGenerator
    from metatab import MetatabDoc

    # Notebook files are UTF-8 JSON; do not depend on the platform default.
    with open(nb_path, encoding='utf-8') as f:
        nb = nbformat.reads(f.read(), as_version=4)

    for cell in nb.cells:
        source = ''.join(cell['source']).strip()
        if source.startswith('%%metatab'):
            return MetatabDoc(TextRowGenerator(source))

    return None
Ejemplo n.º 10
0
    def x_test_metatab_line(self):
        """Process the schemas of the 'simple-text.txt' document and print the
        columns of its 'resource' resource.

        The name does not start with ``test``, so the default unittest loader
        does not collect this method.
        """
        from metatab.generate import TextRowGenerator
        from metatab.cli.core import process_schemas
        from metatab import MetatabDoc

        cli_init()

        generator = TextRowGenerator(test_data('simple-text.txt'),
                                     'simple-text.txt')
        doc = MetatabDoc(generator)

        process_schemas(doc)

        resource = doc.resource('resource')

        for column in resource.columns():
            print(column)
Ejemplo n.º 11
0
    def test_read_geo_packages(self):

        import warnings
        from requests.exceptions import HTTPError

        warnings.simplefilter("ignore")

        try:
            from publicdata.census.dataframe import CensusDataFrame
        except ImportError:
            return unittest.skip("Public data isn't installed")

        with open(test_data('line', 'line-oriented-doc.txt')) as f:
            text = f.read()

        doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest\n" + text))

        r = doc.reference('B09020')

        try:
            df = r.dataframe()
        except HTTPError:  # The Census reporter URLs fail sometimes.
            return unittest.skip("Census Reporter vanished")

        self.assertIsInstance(df, CensusDataFrame)

        r = doc.reference('sra_geo')

        gf = r.geoframe()

        self.assertEqual(41, len(gf.geometry.geom_type))

        self.assertEqual({'Polygon'}, set(gf.geometry.geom_type))

        r = doc.reference('ri_tracts')

        gf = r.geoframe()

        self.assertEqual(244, len(gf.geometry.geom_type))

        print(sorted(list(set(gf.geometry.geom_type))))

        self.assertEqual(['MultiPolygon', 'Polygon'],
                         sorted(list(set(gf.geometry.geom_type))))

        print(gf.head())
Ejemplo n.º 12
0
    def test_line_doc(self):
        """Read the 'tracts' reference as rows and as dataframes, then put the
        library directory of the 'incv' reference on ``sys.path``."""

        import sys
        from os.path import splitext, basename

        with open(test_data('line', 'line-oriented-doc.txt')) as f:
            text = f.read()

        doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest\n" + text))

        tracts_ref = doc.reference('tracts')

        self.assertEqual(628, len(list(tracts_ref)))

        # The same longitude checksum is expected from both loaders.
        tracts = tracts_ref.dataframe()
        self.assertEqual(-73427, tracts.lon.sum().astype(int))

        tracts = tracts_ref.read_csv()
        self.assertEqual(-73427, tracts.lon.sum().astype(int))

        tracts_ref.dataframe()

        # Test loading a Python Library from a package.

        ref = doc.reference('incv')

        self.assertIsNotNone(ref)

        inner_url = parse_app_url(ref.url).inner
        ref_resource = inner_url.clear_fragment().get_resource()

        # The path has to be a Metatab ZIP archive, and the root directory must be the same as
        # the name of the path
        pkg_name = splitext(basename(ref_resource.path))[0]

        lib_path = ref_resource.join(pkg_name).path

        if lib_path not in sys.path:
            sys.path.insert(0, lib_path)
Ejemplo n.º 13
0
    def preprocess_cell(self, cell, resources, index):
        """Rebuild ``self.doc`` from the text outputs of the cell whose metadata
        has ``mt_final_metatab`` set; every other cell is passed through.

        The cell and resources are always returned unchanged.
        """
        from metatab.rowgenerators import TextRowGenerator

        is_final = cell['metadata'].get('mt_final_metatab')

        if not (is_final and cell['outputs']):
            return cell, resources

        output_text = ''.join(output['text'] for output in cell['outputs'])

        self.doc = MetapackDoc(TextRowGenerator(output_text))

        # Give all of the sections their standard args, to make the CSV versions of the doc
        # prettier
        for section_name, section in self.doc.sections.items():
            try:
                section.args = self.doc.decl_sections[section_name.lower()]['args']
            except KeyError:
                pass

        return cell, resources
Ejemplo n.º 14
0
    def test_line_oriented(self):
        """Build a Metapack document from the line-oriented test file and check
        the identifier, the term count and the References section."""

        generator = TextRowGenerator(test_data('line', 'line-oriented-doc.txt'))
        doc = MetapackDoc(generator)

        self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                         doc.get_value('Root.Identifier'))
        self.assertEqual(153, len(doc.terms))

        references = doc['References']

        self.assertEqual(6, len(list(references)))
        self.assertEqual(6, len(list(references.find('Root.Reference'))))

        # References are Resources, so the generic term finds the same entries.
        as_resources = list(references.find('Root.Resource'))
        self.assertEqual(6, len(as_resources))

        rt = as_resources[0]

        self.assertIsInstance(rt, Reference)
Ejemplo n.º 15
0
def extract_notebook_metatab(nb_path: Path):
    """Build a Metapack document from the tagged cells of a notebook.

    The 'metadata', 'resources' and 'schema' cells supply the Metatab lines.
    The 'Title', 'Description' and 'readme' cells fill Root.Title,
    Root.Description and Root.Readme.
    """

    from metatab.rowgenerators import TextRowGenerator
    import nbformat

    with nb_path.open() as f:
        nb = nbformat.read(f, as_version=4)

    line_blocks = ['Declare: metatab-latest']
    for tag in ['metadata', 'resources', 'schema']:
        line_blocks.append(get_cell_source(nb, tag))

    doc = MetapackDoc(TextRowGenerator('\n'.join(line_blocks)))

    # The title cell is a markdown heading; drop the '#' markers around it.
    title = get_cell_source(nb, 'Title').strip('#').strip()
    doc['Root'].get_or_new_term('Root.Title').value = title

    description = get_cell_source(nb, 'Description')
    doc['Root'].get_or_new_term('Root.Description').value = description

    readme = get_cell_source(nb, 'readme')
    doc['Documentation'].get_or_new_term('Root.Readme').value = readme

    return doc
Ejemplo n.º 16
0
    def mt_add_dataframe(self, line, cell=''):
        """Add a dataframe to a metatab document's data files.

        The dataframe is looked up by name in the shell's user namespace. When
        a document and a reference URL are available, a ``Root.Datafile`` term
        is added to the ``Resources`` section and a schema table is built from
        the dataframe. If ``cell`` holds a Metatab ``Root.Table`` definition,
        its name, title, description and column descriptions are applied.

        :param line: the magic's argument string; parsed for ``dataframe_name``,
            ``name``, ``title``, ``materialize`` and ``dump``
        :param cell: optional cell body with line-oriented Metatab describing
            the table; when empty the call is treated as a line magic and the
            resulting table is always printed
        :return: None. Problems are reported through ``warn``.
        """
        from metapack.jupyter.core import process_schema
        from metatab.exc import ParserError

        args = parse_argstring(self.mt_add_dataframe, line)

        # With no cell body this is a line magic: always dump the generated
        # table so it can be copied into a cell.
        is_line = not cell
        if is_line:
            args.dump = True

        dataframe_name = args.dataframe_name[0]

        if '_material_dataframes' not in self.shell.user_ns:
            self.shell.user_ns['_material_dataframes'] = {}

        df = self.shell.user_ns[dataframe_name]

        try:
            cell_doc = MetapackDoc(
                TextRowGenerator("Declare: metatab-latest\n" + cell))
        except ParserError as e:
            warn('Failed to parse Metatab in cell: {} '.format(e))
            return

        cell_table = cell_doc.find_first('Root.Table')

        if cell_table and (args.name or args.title):
            warn(
                "The name and title arguments are ignored when the cell includes a Metatab table definition"
            )

        if cell_table:
            name = cell_table.get_value('name')
            title = cell_table.get_value('title', '')
            description = cell_table.get_value('description', '')
        else:
            name = None
            title = ''
            description = ''

        # Fall back to the command line arguments, then to the dataframe name.
        if not name:
            name = args.name or dataframe_name

        if not title:
            title = args.title or dataframe_name

        if not name:
            warn("Name must be set with .name property, or --name option")
            return

        title = title.strip("'").strip('"')

        try:
            doc = self.mt_doc
        except KeyError:
            doc = None

        if args.materialize:
            ref = 'file:data/{}.csv'.format(name)
            self.shell.user_ns['_material_dataframes'][dataframe_name] = ref

        elif doc is not None:
            ref = 'ipynb:notebooks/{}.ipynb#{}'.format(doc.as_version(None),
                                                       dataframe_name)

        else:
            ref = None

        table = None

        resource_term = None

        #
        # First, process the schema, extracting the columns from the dataframe.
        #

        if doc and ref:
            if 'Resources' not in doc:
                doc.new_section('Resources')

            resource_term = doc['Resources'].get_or_new_term(
                "Root.Datafile", ref)

            resource_term['name'] = name
            resource_term['title'] = title
            resource_term['description'] = description

            df = df.reset_index()

            table = process_schema(doc, doc.resource(name), df)

            if not table:
                table = doc['Schema'].find_first('Root.Table', name)

        #
        # Next, apply the names from  table description from the cell
        #

        # ``table`` stays None when no schema table was produced above;
        # without this guard ``table.find`` raised AttributeError.
        if cell_table and table is not None:

            cols_by_name = {c.name: c for c in cell_table.find('Table.Column')}

            # Built once, not on every iteration, for the positional fallback.
            cols_by_pos = list(cols_by_name.values())

            for i, c in enumerate(table.find('Table.Column')):

                # Prefer the cell column with the same name, else the one in
                # the same position.
                cell_column = cols_by_name.get(c.name)
                cell_col_by_pos = cols_by_pos[i] if i < len(cols_by_pos) else None

                if cell_column:
                    c.description = cell_column.description
                    c.name = cell_column.name
                elif cell_col_by_pos:
                    c.description = cell_col_by_pos.description
                    c.name = cell_col_by_pos.name

        if args.dump and table:
            print("Table:", resource_term.name)

            if resource_term and resource_term.title:
                print("Table.Title:", resource_term.get_value('title'))
                print("Table.Description:",
                      resource_term.get_value('description') or '')

            for c in table.find('Table.Column'):
                print("Table.Column:", c.name)
                print("  .Datatype:", c.datatype)
                print("  .Description:", c.description or '')

            if is_line:
                print(
                    "\nCopy the above into the cell, and change to a cell magic, with '%%' "
                )
Ejemplo n.º 17
0
    def test_version(self):
        """Check ``update_version``, ``_has_semver`` and ``update_name`` on
        documents that carry progressively more version terms."""

        from textwrap import dedent

        def doc_from(lines):
            # Build a document from an indented block of Metatab lines.
            return MetatabDoc(TextRowGenerator(dedent(lines)))

        # Empty Root.Version and no Major, Minor or Patch values: no version.
        doc = doc_from("""
            Root.Version:
            """)
        self.assertIsNone(doc.update_version())
        self.assertFalse(doc._has_semver())

        # A plain Root.Version value is returned as-is and is not a semver.
        doc = doc_from("""
                Root.Version: 10
                """)
        self.assertEqual("10", doc.update_version())
        self.assertFalse(doc._has_semver())

        # Only Patch is given; Major and Minor come out as 0.
        doc = doc_from("""
                Root.Version: 10
                Version.Patch: 5
                """)
        self.assertEqual("0.0.5", doc.update_version())
        self.assertTrue(doc._has_semver())

        # Major and Patch are given; Minor comes out as 0.
        doc = doc_from("""
                Root.Version: 10
                Version.Major: 2
                Version.Patch: 5
                """)
        self.assertEqual("2.0.5", doc.update_version())

        # All three parts are given, with an empty Root.Version and Root.Name.
        doc = doc_from("""
                Root.Name:
                Root.Origin: example.com
                Root.Dataset: foobar
                Root.Version:
                Version.Minor: 24
                Version.Major: 2
                Version.Patch: 5
                """)
        self.assertEqual("2.24.5", doc.update_version())

        # The generated name ends in major.minor; the patch number is left out.
        doc.update_name()
        self.assertEqual('example.com-foobar-2.24', doc.get_value('Root.Name'))