def test_entrypoints(self):
    """Check that get_generator() dispatches to the correct Source class."""
    from rowgenerators.generator.iterator import IteratorSource
    from rowgenerators.generator.generator import GeneratorSource
    from rowgenerators.generator.csv import CsvSource

    us = 'http://public.source.civicknowledge.com/example.com/sources/unicode-utf8.csv'

    def g():
        yield None

    # A plain iterable and a generator map to their dedicated source classes.
    self.assertIsInstance(get_generator([]), IteratorSource)
    self.assertIsInstance(get_generator(g()), GeneratorSource)

    # A fully resolved CSV url target maps to the CSV source.
    target = parse_app_url(us).get_resource().get_target()
    self.assertIsInstance(get_generator(target), CsvSource)
def test_zip_file(self):
    """Generate rows from the metadata file inside a metapack ZIP package."""
    us = 'metapack+http://library.metatab.org/example.com-simple_example-2017-us-1.zip#metadata.csv'

    rows = get_generator(us)

    print(list(rows))
def test_fixed(self):
    """Exercise fixed-width parsing driven by a Table schema."""
    from itertools import islice
    from rowgenerators import Table

    # Build the fixed-width schema as (name, type, width) triples.
    schema = Table()
    for name, kind, width in (('id', int, 6),
                              ('uuid', str, 34),
                              ('int', int, 3),
                              ('float', float, 14)):
        schema.add_column(name, kind, width)

    print(str(schema))

    parse = schema.make_fw_row_parser()

    u = parse_app_url(
        'fixed+file:/Volumes/Storage/Downloads/test_data/fixed/simple-example.txt'
    )

    print(u.get_resource())
    print(u.get_resource().get_target())

    g = get_generator(u, table=schema)
    print(type(g))

    # Only look at the first few rows.
    for row in islice(g, 10):
        print(row)
def test_program(self):
    """Run a script as a row generator, passing args, env vars and properties."""
    u = parse_app_url(script_path('rowgen.py'))
    u.scheme_extension = 'program'

    # Every kind of parameter -- long/short cli args, env var, properties --
    # gets the same value so they are easy to spot in the output.
    env = {name: 'a' for name in ('--long-arg', '-s', 'ENV_VAR', 'prop1', 'prop2')}

    g = get_generator(u, working_dir=dirname(u.path), env=env)
    print(type(g))

    rows = {}
    for row in g.iter_rp:
        key = row['type'] + '-' + row['k']
        rows[key] = row.v
        print(row)

    self.assertEqual('a', rows['prop-prop1'])
    self.assertEqual('{"prop1": "a", "prop2": "a"}', rows['env-PROPERTIES'])
def run_row_intuit(url, cache):
    """Run the RowIntuiter over the first 5000 rows, trying several encodings.

    BUG FIX: the original body returned unconditionally on the first loop
    iteration, so the encoding loop and the final ``raise`` were unreachable,
    and a decoding failure propagated instead of falling through to the next
    encoding. Each attempt is now guarded so the loop actually iterates.

    :param url: Source reference passed to get_generator().
    :param cache: Unused here; kept for interface compatibility with callers.
    :return: Tuple of (encoding, RowIntuiter result) for the first encoding
        that succeeds.
    :raise Exception: If every encoding fails.
    """
    for encoding in ('ascii', 'utf8', 'latin1'):
        try:
            # NOTE(review): the encoding is not actually applied to the
            # generator here -- presumably it should be set on a parsed url
            # (as the other run_row_intuit variant does); confirm.
            rows = list(islice(get_generator(url), 5000))
            return encoding, RowIntuiter().run(rows)
        except UnicodeError:
            # This encoding failed; try the next one.
            continue

    raise Exception('Failed to convert with any encoding')
def geo(self):
    """Return a geopandas dataframe with boundaries for the area"""
    from publicdata.censusreporter.url import CensusReporterURL
    from rowgenerators import get_generator
    from itertools import islice
    from metapack.jupyter.pandas import MetatabDataFrame

    # Guard clause: only CensusReporter urls carry a geo companion source.
    if not isinstance(self._url, CensusReporterURL):
        raise PublicDataException(
            "Dataframe doesn't have a CensusReporterURL, so can't find geo source"
        )

    # Resolve the geo companion url down to a concrete target.
    target = self._url.geo.get_resource().get_target()
    g = get_generator(target)

    # First emitted row is the header; everything after it is data.
    headers = next(islice(g, 0, 1))
    body = islice(g, 1, None)

    df = MetatabDataFrame(list(body), columns=headers, metatab_resource=self)

    return df.geo
def generator(self):
    """Return a row generator for this url's resolved target.

    NOTE: Hack! This used to be
        target = self.get_resource().get_target().inner
    """
    from rowgenerators import get_generator

    return get_generator(self.get_resource().get_target())
def get_row_generator(self, ref, cache=None):
    """Return a row generator for a reference"""
    from inspect import isgenerator
    from rowgenerators import get_generator

    g = get_generator(ref)

    # Guard clause: fail loudly when no generator could be constructed.
    if not g:
        raise GenerateError(
            "Cant figure out how to generate rows from {} ref: {}".format(type(ref), ref))

    return g
def generator(self):
    """ Return the generator for this URL, if the rowgenerator package is installed.

    :return: A row generator object.
    """
    from rowgenerators import get_generator

    # NOTE(review): get_target() is called again on the already-resolved
    # target -- presumably idempotent; confirm against the url classes.
    target = self.get_resource().get_target()

    return get_generator(target.get_target(), source_url=self)
def load_resource(self):
    """Load rows into a previously created resource table"""
    from rowgenerators import parse_app_url, get_generator

    # Early return keeps the load idempotent.
    if self.loaded:
        return

    target = parse_app_url(self.source_url).get_resource().get_target()
    g = get_generator(target)

    session = inspect(self).session
    session.bulk_insert_mappings(self.mapper, g.iter_dict)

    self.loaded = True
def test_sources(self):
    """Check the generator class and row count for each entry in sources.csv."""
    from csv import DictReader

    with open(data_path('sources.csv')) as f:
        for e in DictReader(f):

            # Rows without a url_class are placeholders; skip them.
            if not e['url_class']:
                print()
                continue

            u = parse_app_url(e['url'])

            r = u.get_resource()
            t = r.get_target()

            g = get_generator(t)

            # FIX: assertEquals is a deprecated alias, removed in Python 3.12.
            self.assertEqual(e['gen_class'], g.__class__.__name__)
            self.assertEqual(int(e['n_rows']), (len(list(g))))
def run_row_intuit(path, cache):
    """Intuit row structure from the first 5000 rows, trying encodings in turn.

    :param path: Source url string to parse and generate rows from.
    :param cache: Cache passed through to get_generator().
    :return: Tuple of (encoding, RowIntuiter result) for the first encoding
        that works.
    :raise RowIntuitError: If every encoding fails.
    """
    from tableintuit import RowIntuiter
    from itertools import islice
    from rowgenerators import TextEncodingError, get_generator

    for encoding in ('ascii', 'utf8', 'latin1'):
        try:
            u = parse_app_url(path)
            u.encoding = encoding

            sample = list(islice(get_generator(url=str(u), cache=cache), 5000))

            return encoding, RowIntuiter().run(list(sample))
        except (TextEncodingError, UnicodeEncodeError):
            # This encoding did not work; fall through to the next one.
            pass

    raise RowIntuitError('Failed to convert with any encoding')
def test_geo(self):
    """Resolve a shapefile url and check columns, headers and row count."""
    from rowgenerators.generator.shapefile import ShapefileSource
    from rowgenerators.appurl.shapefile import ShapefileUrl

    us = 'shape+http://s3.amazonaws.com/public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip'

    u = parse_app_url(us)

    r = u.get_resource()
    self.assertIsInstance(r, ShapefileUrl)

    t = r.get_target()
    self.assertIsInstance(t, ShapefileUrl)

    self.assertTrue(
        str(t).endswith(
            'public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip#SRA2010tiger.shp'
        ))

    g = get_generator(t)

    self.assertIsInstance(g, ShapefileSource)

    self.assertEqual([{
        'name': 'id',
        'type': 'int'
    }, {
        'name': 'SRA',
        'type': 'int'
    }, {
        'name': 'NAME',
        'type': 'str'
    }, {
        'name': 'geometry',
        'type': 'geometry_type'
    }], g.columns)

    self.assertEqual(['id', 'SRA', 'NAME', 'geometry'], g.headers)

    # FIX: assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(42, len(list(g)))
def test_geo(self):
    """Smoke-test shapefile resolution and row generation."""
    us = 'shape+http://s3.amazonaws.com/public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip'

    u = parse_app_url(us)

    r = u.get_resource()
    print(type(r), r)

    t = r.get_target()
    print(type(t), t)

    g = get_generator(t)
    print(type(g))
    print(g.columns)
    print(g.headers)

    # FIX: assertEquals is a deprecated alias, removed in Python 3.12.
    self.assertEqual(42, len(list(g)))
def test_resolve_resource_urls(self):
    """Test how resources are resolved in packages.

    - A name, for excel and CSV packages
    - a path, for ZIP and filesystem packages
    - a web url, for any kind of package
    """
    with open(test_data('packages.csv')) as f:
        # Start at 2 so the counter matches the CSV's 1-based line numbers
        # (line 1 is the header row).
        for line_no, rec in enumerate(DictReader(f), 2):

            # print(line_no, rec['url'], rec['target_file'])

            u = MetapackPackageUrl(rec['url'], downloader=Downloader())

            try:
                t = u.resolve_url(rec['target_file'])
                self.assertFalse(bool(rec['resolve_error']))
            except ResourceError:
                self.assertTrue(bool(rec['resolve_error']))
                continue
            except DownloadError:
                raise

            # Testing containment because t can have path in local filesystem, which changes depending on where
            # test is run
            # print(" ", t)
            self.assertTrue(rec['resolved_url'] in str(t),
                            (line_no, rec['resolved_url'], str(t)))

            try:
                g = get_generator(t.get_resource().get_target())
                self.assertEqual(101, len(list(g)))
                self.assertFalse(bool(rec['generate_error']))
            except DownloadError:
                raise
            except RowGeneratorError:
                self.assertTrue(bool(rec['generate_error']))
                continue
def test_notebook_url(self):
    """Execute a Jupyter notebook url and generate rows from its CSV output."""
    try:
        from metapack.appurl import JupyterNotebookUrl
        from metapack.jupyter.exec import execute_notebook
        from os.path import exists

        u = parse_app_url(test_data('notebooks', 'GenerateDataTest.ipynb'))
        self.assertIsInstance(u, JupyterNotebookUrl)

        # Execute the notebook, materializing the named dataframes as CSVs.
        execute_notebook(u.path, '/tmp/nbtest', ['dfa', 'dfb'], True)

        self.assertTrue(exists('/tmp/nbtest/dfa.csv'))
        self.assertTrue(exists('/tmp/nbtest/dfb.csv'))

        g = get_generator(parse_app_url('/tmp/nbtest/dfa.csv'))
        print(list(g))

    except ImportError:
        # BUG FIX: unittest.skip(reason) only constructs a decorator; calling
        # it inside a running test does nothing, so the test silently passed.
        # TestCase.skipTest() actually marks the test as skipped.
        self.skipTest("Missing pandas or jupyter client")
def generate_terms(self, ref, root, file_type=None):
    """A generator that yields term objects, handling includes and argument children.

    :param ref: Either a Source row generator, or a reference that
        get_generator() can resolve to one.
    :param root: Root term; used as the initial "last section" to re-assert
        after processing an include.
    :param file_type: File type label attached to the terms produced.
    """
    last_section = root  # The most recently seen section term.
    t = None  # The term currently being built; referenced by error handling.

    # A Source is already a row generator; anything else must be resolved.
    if isinstance(ref, Source):
        row_gen = ref
        ref_path = row_gen.__class__.__name__
    else:
        row_gen = get_generator(ref)
        ref_path = ref.path

    try:
        for line_n, row in enumerate(row_gen, 1):

            # Skip blank rows and rows whose first cell is a '#' comment.
            if not row or not row[0] or not row[0].strip() or row[0].strip(
            ).startswith('#'):
                continue

            tt = Term(
                row[0], None
            )  # Just to get the qualified name constructed property

            term_class = self.get_term_class(tt.join_lc)

            # Cell 1 is the value; remaining cells are the term's arguments.
            t = term_class(tt.join_lc,
                           row[1] if len(row) > 1 else '',
                           row[2:] if len(row) > 2 else [],
                           row=line_n,
                           col=1,
                           file_name=ref_path,
                           file_type=file_type,
                           doc=self.doc)

            # Why did we remove comments from values? It strips out Markdown
            #if t.value and str(t.value).startswith('#'):  # Comments are ignored
            #    continue

            if t.term_is('include') or t.term_is('declare'):

                # Resolve the included/declared document relative to this one.
                if t.term_is('include'):
                    resolved = self.find_include_doc(
                        dirname(ref_path), t.value.strip())
                else:
                    resolved = self.find_declare_doc(
                        dirname(ref_path), t.value.strip())

                # Guard against a document including itself.
                if row_gen.ref == resolved:
                    raise IncludeError(
                        "Include loop for '{}' ".format(resolved))

                yield t

                try:
                    sub_gen = get_generator(
                        resolved.get_resource().get_target())

                    # Recurse into the included document's terms.
                    for t in self.generate_terms(
                            sub_gen, root, file_type=t.record_term_lc):
                        yield t

                    if last_section:
                        yield last_section  # Re-assert the last section

                except IncludeError as e:
                    e.term = t
                    raise

                except (OSError, FileNotFoundError, GenerateError,
                        DownloadError) as e:
                    # Wrap lower-level failures as IncludeError, keeping the
                    # offending term attached for error reporting.
                    e = IncludeError("Failed to Include; {}".format(e))
                    e.term = t
                    raise e

                continue  # Already yielded the include/declare term, and includes can't have children

            elif t.term_is('section'):

                # If there is already a section in the document, emit the existing section,
                # rather than a new one.
                try:
                    last_section = self.doc[t.name]
                    t = last_section
                except (KeyError, TypeError):  # TypeError -> self.doc is None
                    last_section = t

            yield t

            # Yield any child terms, from the term row arguments
            if not t.term_is('section') and not t.term_is('header'):
                for col, value in enumerate(t.args, 0):
                    if str(value).strip():
                        term_name = t.record_term_lc + '.' + str(col)
                        term_class = self.get_term_class(term_name)
                        yield term_class(
                            term_name,
                            str(value), [],
                            row=line_n,
                            col=col + 2,  # The 0th argument starts in col 2
                            file_name=ref_path,
                            file_type=file_type,
                            parent=t)  #,
                            #doc=None,
                            #section=last_section)

    except IncludeError as e:
        # Annotate the error with the path of the file being processed.
        exc = IncludeError(str(e) + "; in '{}' ".format(ref_path))
        exc.term = e.term if hasattr(e, 'term') else None
        raise exc
def get_generator(self, cache=None, working_dir=None):
    """Delegate to rowgenerators.get_generator(), using this url as the source."""
    from rowgenerators import get_generator as _get_generator

    return _get_generator(self, cache, working_dir=working_dir)