def test_line_doc_parts(self):
    """Build one document from several line-oriented fragments and check the totals.

    Every fragment is parsed with the same document's resolver and loaded
    into that document, so the terms accumulate across files.
    """
    doc = MetatabDoc(TextRowGenerator("Declare: metatab-latest"))

    fragment_names = (
        'line/line-oriented-doc-root.txt',
        'line/line-oriented-doc-contacts.txt',
        'line/line-oriented-doc-references-1.txt',
        'line/line-oriented-doc-references-2.txt',
        'line/line-oriented-doc-bib.txt',
    )

    for fragment_name in fragment_names:
        with open(test_data(fragment_name)) as fh:
            fragment = fh.read()

        parser = TermParser(TextRowGenerator(fragment),
                            resolver=doc.resolver,
                            doc=doc)
        doc.load_terms(parser)

    identifier = doc.get_value('Root.Identifier')
    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249', identifier)

    self.assertEqual(152, len(doc.terms))

    reference_terms = list(doc['References'])
    self.assertEqual(5, len(reference_terms))

    as_resources = list(doc['References'].find('Root.Resource'))
    self.assertEqual(5, len(as_resources))
def test_line_doc(self):
    """Load the complete line-oriented document and check its term and reference counts."""
    doc = MetatabDoc(TextRowGenerator("Declare: metatab-latest"))

    with open(test_data('line/line-oriented-doc.txt')) as fh:
        body = fh.read()

    parser = TermParser(TextRowGenerator(body),
                        resolver=doc.resolver,
                        doc=doc)
    doc.load_terms(parser)

    identifier = doc.get_value('Root.Identifier')
    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249', identifier)

    self.assertEqual(152, len(doc.terms))

    self.assertEqual(5, len(list(doc['References'])))

    by_reference = list(doc['References'].find('Root.Reference'))
    self.assertEqual(5, len(by_reference))

    # The same five terms are also found when searching for Root.Resource.
    by_resource = list(doc['References'].find('Root.Resource'))
    self.assertEqual(5, len(by_resource))

    first_term = list(doc['References'].find('Root.Resource'))[0]
    print(type(first_term))
def test_line_doc_parts(self):
    """Build a Metapack document from line-oriented fragments and check both resource sections.

    Each fragment is parsed with the document's resolver and loaded into the
    same document. The References and Resources sections are then checked
    for their term counts and for the class of their first term.
    """
    doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest"))

    fragment_names = (
        'line/line-oriented-doc-root.txt',
        'line/line-oriented-doc-contacts.txt',
        'line/line-oriented-doc-datafiles.txt',
        'line/line-oriented-doc-references-1.txt',
        'line/line-oriented-doc-references-2.txt',
        'line/line-oriented-doc-bib.txt',
    )

    for fragment_name in fragment_names:
        with open(test_data(fragment_name)) as fh:
            fragment = fh.read()

        parser = TermParser(TextRowGenerator(fragment),
                            resolver=doc.resolver,
                            doc=doc)
        doc.load_terms(parser)

    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249',
                     doc.get_value('Root.Identifier'))
    self.assertEqual(157, len(doc.terms))

    # (section name, the section's own term type, class expected for its terms)
    expectations = (
        ('References', 'Root.Reference', Reference),
        ('Resources', 'Root.Datafile', Resource),
    )

    for section_name, own_term, expected_class in expectations:
        self.assertEqual(5, len(list(doc[section_name])))
        self.assertEqual(5, len(list(doc[section_name].find(own_term))))

        # Searching for the generic Root.Resource term finds the same five terms.
        self.assertEqual(5, len(list(doc[section_name].find('Root.Resource'))))

        first_term = list(doc[section_name].find('Root.Resource'))[0]
        self.assertIsInstance(first_term, expected_class)

    doc._repr_html_()  # Only checks that rendering raises no exception
def preprocess_cell(self, cell, resources, index):
    """Collect Metatab terms from a single notebook cell.

    * A cell whose source starts with ``%%metatab`` has the magic line
      removed and the rest parsed as Metatab terms into ``self.doc``.
    * A markdown cell tagged ``Title`` or ``Description`` contributes a
      ``Root.Title`` / ``Root.Description`` entry to ``self.extra_terms``.
      Other markdown cells are left untouched.
    * Any other cell is handed to the parent class.

    :param cell: Notebook cell; accessed by key (``source``, ``cell_type``,
        ``metadata``).
    :param resources: Resources dict passed through the preprocessor chain.
    :param index: Position of the cell in the notebook.
    :return: ``(cell, resources)``
    """
    import re

    if not self.extra_terms:
        self.extra_terms = []

    source = cell['source']

    if source.startswith('%%metatab'):
        # Imported only where it is needed, so cells without inline
        # Metatab never pay for (or depend on) this import.
        from metatab.rowgenerators import TextRowGenerator

        term_text = re.sub(r'\%\%metatab.*\n', '', source)
        tp = TermParser(TextRowGenerator(term_text),
                        resolver=self.doc.resolver,
                        doc=self.doc)
        self.doc.load_terms(tp)

    elif cell['cell_type'] == 'markdown':
        tags = cell['metadata'].get('tags', [])

        if 'Title' in tags:
            # Remove only the heading markers around the text, then the
            # whitespace they leave behind. Deleting every '#' left a
            # leading space ("# Title" -> " Title") and also removed '#'
            # characters that belong to the title itself.
            title = source.strip().strip('#').strip()
            self.extra_terms.append(('Root', 'Root.Title', title))

        elif 'Description' in tags:
            self.extra_terms.append(
                ('Root', 'Root.Description', source.strip()))

    else:
        cell, resources = super().preprocess_cell(cell, resources, index)

    return cell, resources
def test_write_line_doc(self):
    """Round-trip CSV documents through the line format.

    Each CSV file is loaded, rendered as text lines, re-parsed from those
    lines and rendered again. The two line renderings, the dict forms and
    the CSV forms must all match.
    """
    # Named so that the builtin all() is not shadowed.
    csv_names = [
        'example1.csv',
        'example2.csv',
        'example1-web.csv',
        'children.csv',
        'children2.csv',
        'issue1.csv',
    ]

    self.maxDiff = None  # show complete diffs when a comparison fails

    for csv_name in csv_names:
        doc1 = MetatabDoc(test_data(csv_name))
        doc1_lines = doc1.as_lines()

        doc2 = MetatabDoc(TextRowGenerator(doc1_lines))
        doc2_lines = doc2.as_lines()

        self.assertEqual(doc1_lines, doc2_lines)
        self.compare_dict(doc1.as_dict(), doc2.as_dict())
        self.assertEqual(doc1.as_csv(), doc2.as_csv())
def mt_doc(self):
    """Return the notebook's current Metatab document, creating it if needed.

    The document is kept in the IPython user namespace under ``MT_DOC_VAR``.
    When no document is stored there yet, an empty one is created with a
    package URL pointing at the current working directory. The document is
    then given ``Resources`` and ``References`` sections if it lacks them,
    and every section receives the argument names declared for it.

    :return: The document stored under ``MT_DOC_VAR``.
    """
    user_ns = self.shell.user_ns

    if MT_DOC_VAR not in user_ns:
        package_url = MetapackPackageUrl(
            "metapack+file:" + os.getcwd() + '/',
            downloader=Downloader.get_instance())

        user_ns[MT_DOC_VAR] = MetapackDoc(
            TextRowGenerator("Declare: metatab-latest\n"),
            package_url=package_url)

    inline_doc = user_ns[MT_DOC_VAR]

    # Each section is checked under its own name. Testing 'Resources' twice
    # meant the second test was always false, so 'References' was never added.
    for section_name in ('Resources', 'References'):
        if section_name not in inline_doc:
            inline_doc.new_section(section_name, ['Name', 'Description'])

    # Give all of the sections their standard args, to make the CSV
    # versions of the doc prettier.
    for name, section in inline_doc.sections.items():
        try:
            section.args = inline_doc.decl_sections[name.lower()]['args']
        except KeyError:
            # No declaration for this section; keep its current args.
            pass

    return inline_doc
def add_term_lines(self, text):
    """Parse ``text`` as Metatab lines and load the terms into the current document.

    :param text: Line-oriented Metatab text.
    """
    assert 'root.reference' in TermParser.term_classes

    target_doc = self.mt_doc

    parser = TermParser(
        TextRowGenerator(text),
        resolver=target_doc.resolver,
        doc=target_doc,
    )

    target_doc.load_terms(parser)
def run(self, nb):
    """Preprocess a notebook and return the Metapack document built from it.

    A fresh document is stored on ``self.doc``, the notebook is run through
    ``self.preprocess``, and every ``(section, term, value)`` entry in
    ``self.extra_terms`` is then applied to the document.

    :param nb: The notebook to process.
    :return: The populated ``self.doc``.
    """
    assert str(self.package_url)

    declaration = TextRowGenerator("Declare: metatab-latest\n")
    self.doc = MetapackDoc(declaration,
                           package_url=parse_app_url(self.package_url))

    self.preprocess(nb, {})

    for section_name, term_name, term_value in self.extra_terms:
        section = self.doc[section_name]
        section.get_or_new_term(term_name, term_value)

    return self.doc
def get_metatab_doc(nb_path):
    """Read a notebook and return the Metatab document of its first ``%%metatab`` cell.

    :param nb_path: Path of the notebook file.
    :return: A ``MetatabDoc`` built from the first cell whose stripped source
        starts with ``%%metatab``, or ``None`` when no cell does.
    """
    from metatab import MetatabDoc
    from metatab.rowgenerators import TextRowGenerator

    # Notebook files are UTF-8 JSON; do not depend on the platform default.
    with open(nb_path, encoding='utf-8') as f:
        nb = nbformat.reads(f.read(), as_version=4)

    for cell in nb.cells:
        source = ''.join(cell['source']).strip()
        if source.startswith('%%metatab'):
            return MetatabDoc(TextRowGenerator(source))

    return None
def x_test_metatab_line(self):
    """Process the schemas of the simple line-oriented document and print one resource's columns.

    The ``x_`` prefix keeps the method from matching the usual ``test_``
    naming pattern.
    """
    from metatab.generate import TextRowGenerator
    from metatab.cli.core import process_schemas
    from metatab import MetatabDoc

    cli_init()

    row_source = TextRowGenerator(test_data('simple-text.txt'),
                                  'simple-text.txt')
    doc = MetatabDoc(row_source)

    process_schemas(doc)

    resource = doc.resource('resource')

    for column in resource.columns():
        print(column)
def test_read_geo_packages(self):
    """Load census and geographic references from the line-oriented document.

    The test is skipped when the optional ``publicdata`` package is missing
    or when fetching the ``B09020`` reference raises an ``HTTPError``.
    """
    import warnings
    from requests.exceptions import HTTPError

    warnings.simplefilter("ignore")

    try:
        from publicdata.census.dataframe import CensusDataFrame
    except ImportError:
        # unittest.skip() only builds a decorator, so returning it reported
        # the test as passed. skipTest() raises SkipTest and records a skip.
        self.skipTest("Public data isn't installed")

    with open(test_data('line', 'line-oriented-doc.txt')) as f:
        text = f.read()

    doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest\n" + text))

    r = doc.reference('B09020')

    try:
        df = r.dataframe()
    except HTTPError:
        # The Census reporter URLs fail sometimes.
        self.skipTest("Census Reporter vanished")

    self.assertIsInstance(df, CensusDataFrame)

    r = doc.reference('sra_geo')
    gf = r.geoframe()

    self.assertEqual(41, len(gf.geometry.geom_type))
    self.assertEqual({'Polygon'}, set(gf.geometry.geom_type))

    r = doc.reference('ri_tracts')
    gf = r.geoframe()

    self.assertEqual(244, len(gf.geometry.geom_type))
    self.assertEqual(['MultiPolygon', 'Polygon'],
                     sorted(set(gf.geometry.geom_type)))
def test_line_doc(self):
    """Read the 'tracts' reference several ways, then add the 'incv' package library to sys.path."""
    from os.path import splitext, basename
    import sys

    with open(test_data('line', 'line-oriented-doc.txt')) as fh:
        body = fh.read()

    doc = MetapackDoc(TextRowGenerator("Declare: metatab-latest\n" + body))

    tracts_ref = doc.reference('tracts')

    self.assertEqual(628, len(list(tracts_ref)))

    expected_lon_sum = -73427

    frame = tracts_ref.dataframe()
    self.assertEqual(expected_lon_sum, frame.lon.sum().astype(int))

    frame = tracts_ref.read_csv()
    self.assertEqual(expected_lon_sum, frame.lon.sum().astype(int))

    tracts_ref.dataframe()

    # Test loading a Python library from a package.
    ref = doc.reference('incv')
    self.assertIsNotNone(ref)

    inner_url = parse_app_url(ref.url).inner
    ref_resource = inner_url.clear_fragment().get_resource()

    # The path has to be a Metatab ZIP archive, and the root directory must
    # have the same name as the archive file without its extension.
    pkg_name = splitext(basename(ref_resource.path))[0]

    lib_path = ref_resource.join(pkg_name).path

    if lib_path not in sys.path:
        sys.path.insert(0, lib_path)
def preprocess_cell(self, cell, resources, index):
    """Rebuild ``self.doc`` from the outputs of the cell flagged ``mt_final_metatab``.

    When the cell's metadata has a truthy ``mt_final_metatab`` entry and the
    cell has outputs, the ``text`` of every output is concatenated and parsed
    as a Metapack document. Each section of that document is then given the
    argument names declared for it. All other cells pass through unchanged.

    :return: ``(cell, resources)``, unmodified.
    """
    from metatab.rowgenerators import TextRowGenerator

    is_final = cell['metadata'].get('mt_final_metatab')

    if is_final and cell['outputs']:
        output_text = ''.join(output['text'] for output in cell['outputs'])

        self.doc = MetapackDoc(TextRowGenerator(output_text))

        # Standard section args make the CSV versions of the doc prettier.
        for section_name, section in self.doc.sections.items():
            try:
                section.args = self.doc.decl_sections[section_name.lower()]['args']
            except KeyError:
                continue

    return cell, resources
def test_line_oriented(self):
    """Open the line-oriented document directly and check its term and reference counts."""
    doc_path = test_data('line', 'line-oriented-doc.txt')
    doc = MetapackDoc(TextRowGenerator(doc_path))

    identifier = doc.get_value('Root.Identifier')
    self.assertEqual('47bc1089-7584-41f0-b804-602ec42f1249', identifier)

    self.assertEqual(153, len(doc.terms))

    self.assertEqual(6, len(list(doc['References'])))

    by_reference = list(doc['References'].find('Root.Reference'))
    self.assertEqual(6, len(by_reference))

    # The same six terms are also found when searching for Root.Resource.
    by_resource = list(doc['References'].find('Root.Resource'))
    self.assertEqual(6, len(by_resource))

    first_term = list(doc['References'].find('Root.Resource'))[0]
    self.assertIsInstance(first_term, Reference)
def extract_notebook_metatab(nb_path: Path):
    """Build a Metapack document from the tagged cells of a notebook.

    The sources of the cells tagged ``metadata``, ``resources`` and
    ``schema`` are joined after a ``Declare`` line and parsed as a document.
    The title, description and readme terms are then set from the cells
    tagged ``Title``, ``Description`` and ``readme``.

    :param nb_path: Path of the notebook file.
    :return: The populated Metapack document.
    """
    from metatab.rowgenerators import TextRowGenerator
    import nbformat

    with nb_path.open() as fh:
        nb = nbformat.read(fh, as_version=4)

    term_blocks = ['Declare: metatab-latest']
    term_blocks += [
        get_cell_source(nb, tag)
        for tag in ['metadata', 'resources', 'schema']
    ]

    doc = MetapackDoc(TextRowGenerator('\n'.join(term_blocks)))

    # Heading markers are removed from the title cell before it is stored.
    title = get_cell_source(nb, 'Title').strip('#').strip()
    doc['Root'].get_or_new_term('Root.Title').value = title

    description = get_cell_source(nb, 'Description')
    doc['Root'].get_or_new_term('Root.Description').value = description

    readme = get_cell_source(nb, 'readme')
    doc['Documentation'].get_or_new_term('Root.Readme').value = readme

    return doc
def mt_add_dataframe(self, line, cell=''):
    """Add a dataframe to a metatab document's data files.

    The dataframe is looked up by name in the IPython user namespace. The
    cell body, if any, is parsed as Metatab and may supply a ``Root.Table``
    whose name, title, description and column descriptions are used instead
    of the command-line arguments.

    :param line: Argument string of the magic, parsed with ``parse_argstring``.
    :param cell: Cell body. When empty, the call is treated as a line magic
        and the resulting table definition is always printed.
    :return: None. Returns early, after a warning, when the cell cannot be
        parsed or no name can be determined.
    """
    from metapack.jupyter.core import process_schema
    from metatab.exc import ParserError

    args = parse_argstring(self.mt_add_dataframe, line)

    # No cell body means line-magic use; force the dump so the user gets the
    # table definition to copy into a cell.
    if not cell:
        is_line = True
        args.dump = True
    else:
        is_line = False

    dataframe_name = args.dataframe_name[0]

    # Maps dataframe names to the file reference they are materialized to.
    if '_material_dataframes' not in self.shell.user_ns:
        self.shell.user_ns['_material_dataframes'] = {}

    df = self.shell.user_ns[dataframe_name]

    try:
        cell_doc = MetapackDoc(
            TextRowGenerator("Declare: metatab-latest\n" + cell))
    except ParserError as e:
        warn('Failed to parse Metatab in cell: {} '.format(e))
        return

    cell_table = cell_doc.find_first('Root.Table')

    if cell_table and (args.name or args.title):
        warn(
            "The name and title arguments are ignored when the cell includes a Metatab table definition"
        )

    # Values from the cell's table definition take precedence.
    if cell_table:
        name = cell_table.get_value('name')
        title = cell_table.get_value('title', '')
        description = cell_table.get_value('description', '')
    else:
        name = None
        title = ''
        description = ''

    # Fall back to the arguments, then to the dataframe's variable name.
    if not name:
        name = args.name or dataframe_name

    if not title:
        title = args.title or dataframe_name

    if not name:
        warn("Name must be set with .name property, or --name option")
        return

    # Remove quote characters surrounding the title.
    title = title.strip("'").strip('"')

    try:
        doc = self.mt_doc
    except KeyError:
        doc = None

    # Choose the resource reference: a CSV file when materializing,
    # otherwise a fragment of the notebook itself.
    if args.materialize:
        ref = 'file:data/{}.csv'.format(name)
        self.shell.user_ns['_material_dataframes'][dataframe_name] = ref
    elif doc is not None:
        ref = 'ipynb:notebooks/{}.ipynb#{}'.format(doc.as_version(None), dataframe_name)
    else:
        ref = None

    table = None
    resource_term = None

    #
    # First, process the schema, extracting the columns from the dataframe.
    #
    if doc and ref:
        if 'Resources' not in doc:
            doc.new_section('Resources')

        resource_term = doc['Resources'].get_or_new_term(
            "Root.Datafile", ref)

        resource_term['name'] = name
        resource_term['title'] = title
        resource_term['description'] = description

        # Turn the index into ordinary columns before the schema is built.
        df = df.reset_index()

        table = process_schema(doc, doc.resource(name), df)

        # Fall back to a table of this name already in the Schema section.
        if not table:
            table = doc['Schema'].find_first('Root.Table', name)

    #
    # Next, apply the names and descriptions from the table definition in the cell
    #
    # NOTE(review): `table` is still None here when the `doc and ref` branch
    # above was skipped; `table.find` would then raise AttributeError -- confirm
    # whether that combination can occur.
    if cell_table:
        cols_by_name = {c.name: c for c in cell_table.find('Table.Column')}

        for i, c in enumerate(table.find('Table.Column')):
            # Match the cell's column by name first, then by position.
            cell_column = cols_by_name.get(c.name)
            try:
                cell_col_by_pos = list(cols_by_name.values())[i]
            except KeyError:
                cell_col_by_pos = None
            except IndexError:
                # The cell defines fewer columns than the table has.
                cell_col_by_pos = None

            if cell_column:
                c.description = cell_column.description
                c.name = cell_column.name
            elif cell_col_by_pos:
                c.description = cell_col_by_pos.description
                c.name = cell_col_by_pos.name

    # Print the table definition as Metatab lines.
    if args.dump and table:
        print("Table:", resource_term.name)
        if resource_term and resource_term.title:
            print("Table.Title:", resource_term.get_value('title'))
        print(
            "Table.Description:",
            resource_term.get_value('description')
            if resource_term.get_value('description') else '')

        for c in table.find('Table.Column'):
            print("Table.Column:", c.name)
            print(" .Datatype:", c.datatype)
            print(" .Description:", c.description or '')

        if is_line:
            print(
                "\nCopy the above into the cell, and change to a cell magic, with '%%' "
            )
def test_version(self):
    """Check ``update_version`` and ``update_name`` for several version layouts.

    Each case builds a document from literal Metatab text with one term per
    line, so that ``Version.*`` terms are parsed as terms of their own and
    not as part of the ``Root.Version`` value.
    """
    from textwrap import dedent

    # An empty Root.Version with no Major, Minor or Patch: nothing to build.
    doc = MetatabDoc(TextRowGenerator(dedent("""
        Root.Version:
        """)))
    self.assertIsNone(doc.update_version())
    self.assertFalse(doc._has_semver())

    # A plain version value without semantic parts is returned unchanged.
    doc = MetatabDoc(TextRowGenerator(dedent("""
        Root.Version: 10
        """)))
    self.assertEqual("10", doc.update_version())
    self.assertFalse(doc._has_semver())

    # Only a Patch part: Major and Minor are reported as 0.
    doc = MetatabDoc(TextRowGenerator(dedent("""
        Root.Version: 10
        Version.Patch: 5
        """)))
    self.assertEqual("0.0.5", doc.update_version())
    self.assertTrue(doc._has_semver())

    # Major and Patch parts: Minor is reported as 0.
    doc = MetatabDoc(TextRowGenerator(dedent("""
        Root.Version: 10
        Version.Major: 2
        Version.Patch: 5
        """)))
    self.assertEqual("2.0.5", doc.update_version())

    # All three parts, plus the terms the document name is built from.
    doc = MetatabDoc(TextRowGenerator(dedent("""
        Root.Name:
        Root.Origin: example.com
        Root.Dataset: foobar
        Root.Version:
        Version.Minor: 24
        Version.Major: 2
        Version.Patch: 5
        """)))
    self.assertEqual("2.24.5", doc.update_version())

    doc.update_name()

    # The generated name carries the major and minor parts of the version.
    self.assertEqual('example.com-foobar-2.24', doc.get_value('Root.Name'))