def test_datapackage_declare(self):
    """Round-trip a Metatab doc through JSON and validate it as a Data Package.

    Writes the dict form of datapackage_ex2.csv to a JSON file, asks the
    `datapackage` library to validate it, then prints the datapackage
    conversion of example1.csv.
    """
    import datapackage

    doc = MetatabDoc(test_data('datapackage_ex2.csv'))
    d = doc.as_dict()

    # Fixed scratch path; removed unused NamedTemporaryFile/unlink imports
    # and use a context manager so the handle is always closed.
    path = '/tmp/package.json'
    with open(path, 'w') as f:
        f.write(json.dumps(d, indent=4))

    try:
        dp = datapackage.DataPackage(path)
        dp.validate()
    except Exception:
        # Dump the offending JSON to aid debugging, then let the test fail.
        # (Was a bare `except:`; narrowed so Ctrl-C isn't intercepted.)
        with open(path) as f2:
            print(f2.read())
        raise

    print(path)

    doc = MetatabDoc(test_data('example1.csv'))

    from metatab.datapackage import convert_to_datapackage

    print(json.dumps(convert_to_datapackage(doc), indent=4))
def test_serializer(self):
    # NOTE(review): disabled via the immediate `return`; everything below is
    # dead exploratory code for the Serializer API, kept for reference.
    return
    doc = MetatabDoc(test_data('schema.csv'))
    d = doc.as_dict()

    s = Serializer()
    s.load_declarations(d)

    sections = defaultdict(list)

    # Dump the semi-flattened form of the document.
    for e in s.semiflatten(d):
        print(e)

    return

    for e in sorted(s.serialize(d)):
        # e[0] is a key tuple that may contain integer indexes; strip them.
        # `has_int` is computed but never used.
        has_int = any(isinstance(ki, int) for ki in e[0])
        key_no_int = tuple(ki for ki in e[0] if not isinstance(ki, int))
        print(key_no_int)

        # Use the last two key components as the term name, e.g. 'Parent.Child'.
        pr = '.'.join(key_no_int[-2:])

        t = Term(pr, e[1], row=0, col=0, file_name=None)

        # Look up which section the term belongs to; default to 'Root'.
        section = s.decl['terms'].get(t.join(), {}).get('section', 'Root')

        sections[section].append(t)

    return

    for k, v in sections.items():
        print("=====", k)
        for t in v:
            print(t)
def test_headers(self):
    """A document with explicit header rows parses the same as one without.

    Removed an unused local import of TermParser and CsvPathRowGenerator.
    """
    d1 = MetatabDoc(test_data('example1-headers.csv')).root.as_dict()
    d2 = MetatabDoc(test_data('example1.csv')).root.as_dict()

    self.compare_dict(d1, d2)
def test_sections(self):
    """Add a new section to an empty document and print the result."""
    document = MetatabDoc()
    document.new_section("SectionOne", ["A", "B", "C"])
    print(document.sections)
    print(document.as_dict())
def test_includes(self):
    """Included files contribute their terms to the including document."""
    doc = MetatabDoc(test_data('include1.csv'))
    d = doc.as_dict()

    for t in doc['root'].terms:
        print(t)

    print(d)

    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(['Include File 1', 'Include File 2', 'Include File 3'],
                     d['note'])

    # Each Include term's value should reference the included file path.
    self.assertTrue(any('include2.csv' in e for e in d['include']))
    self.assertTrue(any('include3.csv' in e for e in d['include']))
def scrape_page(mt_file, url):
    """Scrape data and documentation links from a web page into a Metatab file.

    :param mt_file: Path to the Metatab file to load and rewrite in place.
    :param url: URL of the page to scrape for data/documentation links.
    """
    from .util import scrape_urls_from_web_page

    doc = MetatabDoc(mt_file)

    # Record where the files were found.
    doc['resources'].new_term('DownloadPage', url)

    d = scrape_urls_from_web_page(url)

    # Iterate values only; the dict keys were never used (was `.items()`).
    for v in d['sources'].values():
        doc['Resources'].new_term('DataFile', v['url'],
                                  description=v.get('description'))

    for v in d['external_documentation'].values():
        # Pick a Documentation term name based on the kind of URL.
        term_name = classify_url(v['url'])
        doc['Documentation'].new_term(term_name, v['url'],
                                      description=v.get('description'))

    doc.write_csv(mt_file)
def errs(fn):
    # Parse *fn*, asserting that parsing raises IncludeError, then return
    # the parser's accumulated errors. Note `tp` remains bound after the
    # with-block: the exception raised while consuming the parser is
    # swallowed by assertRaises, and Python's `with` does not scope names.
    with self.assertRaises(IncludeError):
        doc = MetatabDoc()
        tp = TermParser(CsvPathRowGenerator(fn), doc=doc)
        _ = list(tp)
    return tp.errors_as_dict()
def test_children(self):
    """Child terms appear as properties of their parent in dict form."""
    doc = MetatabDoc(test_data('children.csv'))

    for t in doc.terms:
        print(t)

    import json
    print(json.dumps(doc.as_dict(), indent=4))

    for t in doc.as_dict()['parent']:
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual.
        self.assertEqual({'prop1': 'prop1',
                          'prop2': 'prop2',
                          '@value': 'parent'}, t)
def rewrite_resource_format(mt_file):
    """Move table schemas from a top-level 'schema' section onto child
    Schema terms of each resource, then rewrite the file.

    :param mt_file: Path to the Metatab file to rewrite in place.
    """
    doc = MetatabDoc(mt_file)

    if 'schema' in doc:
        # Map table name -> schema dict from the old-style section.
        table_schemas = {t.value: t.as_dict() for t in doc['schema']}
        del doc['schema']

        for resource in doc['resources']:
            s = resource.new_child('Schema', '')
            # BUG FIX: use .get('column', []) as well — a resource with no
            # matching schema previously hit {}['column'] and raised KeyError.
            columns = table_schemas.get(resource.find_value('name'),
                                        {}).get('column', [])
            for column in columns:
                c = s.new_child('column', column['name'])
                # Copy every property except the name, which is the term value.
                for k, v in column.items():
                    if k != 'name':
                        c.new_child(k, v)

        doc.write_csv(mt_file)
def test_new_parser(self):
    """Parse a short document, then dump its terms and declared terms."""
    import json

    document = MetatabDoc(test_data('short.csv'))

    for term in document.terms:
        print(term)

    print(json.dumps(document.decl_terms, indent=4))
def make_metatab_file(template='metatab'):
    """Create a MetatabDoc from one of the CSV templates shipped with metatab.

    :param template: Base name of the template file (without '.csv').
    :return: A new MetatabDoc loaded from the template.
    """
    from os.path import dirname

    from rowgenerators.util import fs_join as join

    import metatab.templates
    from metatab.doc import MetatabDoc

    # Templates live alongside the metatab.templates package.
    template_path = join(dirname(metatab.templates.__file__),
                         template + '.csv')

    return MetatabDoc(template_path)
def test_datapackage_convert(self):
    """Convert example1 to a datapackage descriptor and validate it."""
    import datapackage

    from metatab.datapackage import convert_to_datapackage

    descriptor = convert_to_datapackage(MetatabDoc(test_data('example1.csv')))
    print(json.dumps(descriptor, indent=4))

    # validate() raises if the descriptor is not a valid Data Package.
    datapackage.DataPackage(descriptor).validate()
def test_basic(self):
    """Load a document from CSV and iterate its terms and rows."""
    fn = test_data('example1.csv')
    # Removed a leftover debug override that replaced fn with a hard-coded
    # absolute path ('/Volumes/Storage/...'), which made the test fail on
    # any machine but the original author's.
    doc = MetatabDoc().load_csv(fn)

    # Materialize the terms to exercise the iterator.
    _ = list(doc.terms)

    for row in doc:
        print(row)
def test_parse_everything(self):
    """Parse each test document and compare against its stored JSON form.

    The JSON reference file is created on first run if it does not exist,
    so a missing reference self-heals (and should be committed).
    """
    # Renamed from `all`, which shadowed the builtin.
    file_names = [
        'example1.csv', 'example2.csv', 'example1-web.csv',
        'include1.csv', 'include2.csv', 'include3.csv',
        'children.csv', 'children2.csv',
        'datapackage_ex1.csv', 'datapackage_ex1_web.csv',
        'datapackage_ex2.csv', 'issue1.csv'
    ]

    # TODO(review): debug leftover limits the run to a single file; delete
    # this line to test the whole list above.
    file_names = ['example1.csv']

    for fn in file_names:
        print('Testing ', fn)
        path = test_data(fn)
        json_path = test_data('json', fn.replace('.csv', '.json'))

        # Removed `with open(path) as f:` — the handle was never used;
        # MetatabDoc opens the path itself.
        doc = MetatabDoc(path)
        d = doc.as_dict()

        if not exists(json_path):
            with open(json_path, 'w') as f:
                print("Writing", json_path)
                json.dump(d, f, indent=4)

        with open(json_path) as f:
            d2 = json.load(f)

        self.compare_dict(d, d2)
def test_versions(self):
    """as_version() bumps or replaces the Root.Version value."""
    doc = MetatabDoc(test_data('example1.csv'))

    self.assertEqual('201404', doc.find_first_value('Root.Version'))

    # Relative specs: '+5' adds to the numeric version, '-5' subtracts.
    self.assertEqual('201409',
                     doc.as_version('+5').find_first_value('Root.Version'))
    self.assertEqual('201399',
                     doc.as_version('-5').find_first_value('Root.Version'))

    # A non-numeric argument replaces the version outright.
    self.assertEqual(
        'foobar',
        doc.as_version('foobar').find_first_value('Root.Version'))
def test_sections(self):
    """Sections can be listed, deleted by name, and iterated."""
    doc = MetatabDoc(test_data('example1.csv'))

    self.assertEqual(['root', 'resources', 'contacts', 'notes', 'schema'],
                     list(doc.sections.keys()))

    # Deletion is case-insensitive: 'Resources' removes the 'resources' key.
    del doc['Resources']

    self.assertEqual(['root', 'contacts', 'notes', 'schema'],
                     list(doc.sections.keys()))

    notes = list(doc['notes'])

    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(2, len(notes))

    for sname, s in doc.sections.items():
        print(sname, s.value)
def test_generic_row_generation(self):
    """MetatabDoc loads from a Google Sheet URL and an XLSX fragment URL.

    NOTE: requires network access to Google Sheets and assets.metatab.org.
    """
    # Import kept even though the name is unused here — presumably the
    # import registers the generator for gs:// URLs; confirm before removing.
    from metatab import GenericRowGenerator

    # Google Spreadsheet, addressed via the gs:// scheme.
    url = 'gs://14_nfiTtSiMSjDes6BSiLU-Gsqy8DIdUxpMaH6DswcVQ'
    doc = MetatabDoc(url)
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual('Registered Voters, By County',
                     doc.find_first('root.title').value)

    # Excel workbook; the '#meta' fragment selects the metadata sheet.
    url = 'http://assets.metatab.org/examples/example-package.xls#meta'
    doc = MetatabDoc(url)
    self.assertEqual('17289303-73fa-437b-97da-2e1ed2cd01fd',
                     doc.find_first('root.identifier').value)
def test_declarations(self):
    """Declared terms/sections load the same three ways: from a document's
    Declare term, from a direct TermParser, and from a row generator."""
    doc = MetatabDoc(test_data('example1.csv'))

    d = {k: v for k, v in doc.decl_terms.items() if 'homepage' in k}

    self.assertEqual(17, len(d))
    self.assertIn("homepage.mediatype", d.keys())
    self.assertIn("homepage.hash", d.keys())
    self.assertIn("homepage.title", d.keys())

    # Direct use of the parser.
    ti = TermParser(
        CsvPathRowGenerator(declaration_path('metatab-latest')), False)
    ti.install_declare_terms()

    fn = test_data('example1.csv')  # Not actually used. Sets base directory.

    doc = MetatabDoc(
        MetatabRowGenerator([['Declare', 'metatab-latest']], fn))

    terms = doc.decl_terms
    self.assertIn('root.homepage', terms.keys())
    self.assertIn('documentation.description', terms.keys())
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(247, len(terms.keys()))

    sections = doc.decl_sections
    self.assertEqual(
        {'contacts', 'declaredterms', 'declaredsections', 'root',
         'resources', 'schemas', 'sources', 'documentation', 'data'},
        set(sections.keys()))

    # Use the Declare term inside the document itself.
    fn = test_data('example1.csv')
    doc = MetatabDoc(CsvPathRowGenerator(fn))

    d = doc._term_parser.declare_dict

    self.assertEqual({'terms', 'synonyms', 'sections'}, set(d.keys()))

    terms = d['terms']
    self.assertIn('root.homepage', terms.keys())
    self.assertIn('documentation.description', terms.keys())
    self.assertEqual(247, len(terms.keys()))

    sections = d['sections']
    self.assertEqual(
        {'contacts', 'declaredterms', 'declaredsections', 'root',
         'resources', 'schemas', 'sources', 'documentation', 'data'},
        set(sections.keys()))

    # Each section declares the argument columns for its terms.
    self.assertEqual(['Email', 'Organization', 'Tel', 'Url'],
                     sections['contacts']['args'])
    self.assertEqual(['TermValueName', 'ChildPropertyType', 'Section'],
                     sections['declaredterms']['args'])
    self.assertEqual(['DataType', 'ValueType', 'Description'],
                     sections['schemas']['args'])
def test_find(self):
    """find_first() locates a term by its fully-qualified name."""
    doc = MetatabDoc(test_data('example1.csv'))
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual('cdph.ca.gov-hci-registered_voters-county',
                     doc.find_first('Root.Identifier').value)
from collections import defaultdict

# NOTE(review): fragment that builds American Community Survey (ACS 2014,
# 5-year) metadata documents from library partitions. Several values
# computed here (table_meta_p, column_meta_p, sequences, defaultdict) are
# unused in the visible code — presumably consumed by code outside this
# view; confirm before deleting.

l = get_library()

table_meta_p = l.partition(
    'census.gov-acs_geofile-schemas-2009e-table_meta-2014-5')
column_meta_p = l.partition(
    'census.gov-acs_geofile-schemas-2009e-column_meta-2014-5')
sequence_p = l.partition(
    'census.gov-acs_geofile-schemas-2009e-table_sequence-2014-5')

# Map table_id -> (sequence number, start column, cell count), only for
# rows that actually have a start value.
sequences = {
    row.table_id: (row.sequence_number, row.start, row.table_cells)
    for row in sequence_p if row.start
}

# Top-level metadata document for the release; written out immediately.
root_doc = MetatabDoc()
root = root_doc.new_section('Root')
root.new_term('Declare', 'http://assets.metatab.org/census.csv')
root.new_term('Title', 'American Community Survey, 5 Year, 2009-2014')
root.new_term('Release', 5)
root.new_term('Year', 2014)
root.new_term('Include', 'acs20145-sources.csv')
root.new_term('Include', 'acs20145-schema.csv')
root_doc.write_csv('acs20145-metadata.csv')

# Separate document listing the sources, one row per (geography, state).
src_doc = MetatabDoc()
source_sec = src_doc.new_section('Sources', ['geography', 'state'])

from censuslib import ACS09TableRowGenerator as TableRowGenerator
def test_descendents(self):
    """all_terms yields every term in the document, including children."""
    doc = MetatabDoc(test_data('example1.csv'))
    # assertEquals is a deprecated alias (removed in Python 3.12); use
    # assertEqual. Also dropped redundant parentheses around len(...).
    self.assertEqual(144, len(list(doc.all_terms)))
def test_update_name(self):
    """update_name() rebuilds Root.Name and reports what changed.

    Removed unused imports (datapackage, convert_to_datapackage) and
    normalized the deprecated assertEquals alias to assertEqual.
    """
    for fn in ('name.csv', 'name2.csv'):
        doc = MetatabDoc(test_data(fn))

        updates = doc.update_name()

        name = doc.find_first_value("Root.Name")

        self.assertEqual('example.com-foobar-2017-ca-people-1', name)
        self.assertEqual(['Changed Name'], updates)

        # Remove Root.Dataset. In one of the fixtures it is presumably a
        # child of Root.Name, so remove_term raises ValueError and the
        # child is removed instead — TODO confirm against the fixtures.
        try:
            doc.remove_term(doc.find_first('Root.Dataset'))
        except ValueError:
            nv = doc.find_first('Root.Name')
            nv.remove_child(nv.find_first('Name.Dataset'))

        updates = doc.update_name()
        self.assertIn("No Root.Dataset, so can't update the name", updates)

        # With no name set, update_name falls back to the identifier.
        doc.find_first('Root.Name').value = None
        updates = doc.update_name()
        self.assertIn('Setting the name to the identifier', updates)

        # With neither a name nor an identifier, it gives up.
        doc.find_first('Root.Name').value = None
        doc.find_first('Root.Identifier').value = None

        updates = doc.update_name()
        self.assertIn(
            'Failed to find DatasetName term or Identity term. Giving up',
            updates)

        self.assertIsNone(doc.get_value('Root.Name'))