def join_dir(self, s):
    """Join a path onto the directory portion of this URL's S3 key,
    returning a new s3:// app URL.

    URLs that already have a netloc are absolute and are returned unchanged.
    """
    try:
        rel_path = s.path
    except AttributeError:
        rel_path = parse_app_url(s).path

    # If there is a netloc, it's an absolute URL; nothing to join.
    if s.netloc:
        return s

    joined_key = join(dirname(self.key), rel_path)

    return parse_app_url('s3://{bucket}/{key}'.format(
        bucket=self.bucket_name.strip('/'),
        key=joined_key.lstrip('/')))
def test_fixed(self):
    """Smoke-test fixed-width parsing against a local sample file."""
    from itertools import islice
    from rowgenerators import Table

    # Build the row-processor table describing the fixed-width layout.
    table = Table()
    table.add_column('id', int, 6)
    table.add_column('uuid', str, 34)
    table.add_column('int', int, 3)
    table.add_column('float', float, 14)
    print(str(table))

    parse = table.make_fw_row_parser()

    # NOTE(review): hard-coded local path — this test only runs on the
    # author's machine; confirm whether it should use test_data() instead.
    url = parse_app_url(
        'fixed+file:/Volumes/Storage/Downloads/test_data/fixed/simple-example.txt'
    )

    print(url.get_resource())
    print(url.get_resource().get_target())

    gen = get_generator(url, table=table)
    print(type(gen))

    for row in islice(gen, 10):
        print(row)
def test_program(self):
    """Run the rowgen.py script as a 'program' URL and check that argument,
    property and environment values round-trip through the generator."""
    url = parse_app_url(script_path('rowgen.py'))
    url.scheme_extension = 'program'

    env = {
        '--long-arg': 'a',
        '-s': 'a',
        'ENV_VAR': 'a',
        'prop1': 'a',
        'prop2': 'a'
    }

    gen = get_generator(url, working_dir=dirname(url.path), env=env)
    print(type(gen))

    collected = {}
    for row in gen.iter_rp:
        collected[row['type'] + '-' + row['k']] = row.v
        print(row)

    self.assertEqual('a', collected['prop-prop1'])
    self.assertEqual('{"prop1": "a", "prop2": "a"}',
                     collected['env-PROPERTIES'])
def get_file(url_str):
    """Resolve *url_str* through the appurl machinery and return the
    downloaded target (a file-like reference)."""
    from appurl import parse_app_url

    return parse_app_url(url_str).get_resource().get_target()
def test_geo(self):
    """Smoke-test the geo accessor of a Census Reporter dataframe."""
    url = parse_app_url('censusreporter://B01001/140/05000US06073')

    frame = url.generator.dataframe()
    geo = frame.geo

    print(len(geo))
def test_app_urls(self):
    """Check that every URL in database_urls.csv parses to the expected
    driver and dialect."""
    with open(test_data('database_urls.csv')) as f:
        for e in DictReader(f):
            u = parse_app_url(e['url'])

            # assertEquals is a deprecated alias; use assertEqual.
            self.assertEqual(str(e['driver']), str(u.driver))
            self.assertEqual(str(e['dialect']), str(u.dialect))
def extract(resource, doc, *args, **kwargs):
    """Extract rows from an FFIEC disclosure file, from a collection of
    Root.References, for a given prefix

    This function is used as a program URL in a Root.DataFile term:

        Section: Resources
        DataFile: python:publicdata.ffiec#extract
        Datafile.Name: sb_loan_orig
        Datafile.Schema: cra_disclosure
        Datafile.Prefix: D1-1

    The schema for the table must be specified, because the rows are fixed
    width, so the schema must have a Column.Width for each column.

    The function also expects that all of the references in the document
    refer to FFIEC files, such as:

        Section: References
        Reference: https://www.ffiec.gov/cra/xls/15exp_discl.zip
        Reference.Name: discl_15
        Reference: https://www.ffiec.gov/cra/xls/14exp_discl.zip
        Reference.Name: discl_14
        Reference: https://www.ffiec.gov/cra/xls/13exp_discl.zip
        Reference.Name: discl_13
        Reference: https://www.ffiec.gov/cra/xls/12exp_discl.zip
        Reference.Name: discl_12
        Reference: https://www.ffiec.gov/cra/xls/11exp_discl.zip
        Reference.Name: discl_11
        Reference: https://www.ffiec.gov/cra/xls/10exp_discl.zip
        Reference.Name: discl_10
    """
    # When 'test' is set, only the first 10 lines of each file are scanned.
    test = bool(resource.get_value('test', False))
    prefix = resource.prefix

    table = resource.row_processor_table()

    yield table.headers

    parser = table.make_fw_row_parser(ignore_empty=True)

    for r in doc.references():
        print("Processing ", r.name)
        t = parse_app_url(r.url).get_resource().get_target()

        # 'rU' mode was removed in Python 3.11; universal newlines are the
        # default in text mode, so plain 'r' is equivalent.
        with open(t.path, 'r') as f:
            # Iterate the file object directly instead of materializing it
            # with readlines() — same lines, constant memory.
            lines = islice(f, 10) if test else f
            for line in lines:
                if not line.startswith(prefix + ' '):
                    continue
                yield parser(line)
def enumerate_contents(base_spec, cache_fs, callback=None):
    """Inspect the URL, and if it is a container (ZIP or Excel), inspect
    each of the contained files. Yields all of the lower-level URLs."""
    if not isinstance(base_spec, Url):
        base_spec = parse_app_url(url=base_spec)

    # Two levels of inspection: the container itself, then each entry.
    for contained in inspect(base_spec, cache_fs, callback=callback):
        yield from inspect(contained, cache_fs, callback=callback)
def test_entrypoints(self):
    """Check that get_generator dispatches lists, generators and CSV targets
    to the expected Source classes."""
    from rowgenerators.generator.iterator import IteratorSource
    from rowgenerators.generator.generator import GeneratorSource
    from rowgenerators.generator.csv import CsvSource

    us = 'http://public.source.civicknowledge.com/example.com/sources/unicode-utf8.csv'

    def g():
        yield None

    self.assertIsInstance(get_generator([]), IteratorSource)
    self.assertIsInstance(get_generator(g()), GeneratorSource)

    target = parse_app_url(us).get_resource().get_target()
    self.assertIsInstance(get_generator(target), CsvSource)
def get_resource(self):
    """Get the contents of resource and save it to the cache, returning a
    file-like object"""
    from appurl import parse_app_url

    self._resource = self._downloader.download(self.inner)

    # Re-parse the downloaded file's system path, carrying over the
    # fragment and scheme information from this URL.
    resolved = parse_app_url(self._resource.sys_path,
                             fragment=self.fragment,
                             fragment_query=self.fragment_query,
                             scheme_extension=self.scheme_extension,
                             downloader=self.downloader)

    return resolved
def __init__(self, url=None, downloader=None, **kwargs):
    """Build a Google Sheets ('gs') URL and its corresponding web URL."""
    super().__init__(url, downloader, **kwargs)

    self._proto = 'gs'

    # former without '://', latter with ':'
    self.key = self.path or self.netloc

    self.gid = self.target_file

    if self.gid:
        template = self.url_template + self.gid_siffix
        web_url = template.format(key=self.key, gid=self.gid)
    else:
        web_url = self.url_template.format(key=self.key)

    # NOTE(review): when gid is missing this produces '...-None.csv' in the
    # fragment — confirm that is intentional.
    web_url += "#target_file={}-{}.csv".format(self.key, self.gid)

    self.web_url = parse_app_url(web_url)
def get_resource(self):
    """Fetch the Census Reporter JSON payload, caching it, and return a
    CensusReporterJsonUrl pointing at the cached file."""
    cache = self.downloader.cache

    # Only hit the API when the response is not already cached.
    if not (cache and cache.exists(self.cache_key)):
        r = requests.get(self.resource_url)
        r.raise_for_status()
        data = r.json()

        if cache:
            cache.makedirs(dirname(self.cache_key), recreate=True)
            cache.settext(self.cache_key, json.dumps(data, indent=4))

    return parse_app_url(
        cache.getsyspath(self.cache_key),
        fragment=["/".join(self._parts), None],
    ).as_type(CensusReporterJsonUrl)
def test_google(self):
    """Walk a gs: URL through web_url, resource and target, printing rows."""
    url = 'gs:1qjjtkMqpxtkDp3qZlkF7P8Tm8VtfIwiWW-OqJ2J91yE#2038675149'

    gs_url = parse_app_url(url)

    web = gs_url.web_url
    print(type(web), web)

    resource = gs_url.get_resource()
    print(type(resource), resource.path)

    target = resource.get_target()
    print(type(target), target.path)

    for row in target.generator:
        print(row)
def test_sources(self):
    """Run every entry in sources.csv through the resolve/generate pipeline
    and check the generator class and row count."""
    from csv import DictReader

    with open(data_path('sources.csv')) as f:
        for e in DictReader(f):
            if not e['url_class']:
                print()
                continue

            u = parse_app_url(e['url'])
            r = u.get_resource()
            t = r.get_target()
            g = get_generator(t)

            # assertEquals is a deprecated alias; use assertEqual.
            self.assertEqual(e['gen_class'], g.__class__.__name__)
            self.assertEqual(int(e['n_rows']), len(list(g)))
def test_geo(self):
    """Resolve a shape+http URL and check the resulting ShapefileSource's
    columns, headers and row count."""
    from rowgenerators.generator.shapefile import ShapefileSource
    from rowgenerators.appurl.shapefile import ShapefileUrl

    us = 'shape+http://s3.amazonaws.com/public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip'

    u = parse_app_url(us)

    r = u.get_resource()
    self.assertIsInstance(r, ShapefileUrl)

    t = r.get_target()
    self.assertIsInstance(t, ShapefileUrl)

    self.assertTrue(
        str(t).endswith(
            'public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip#SRA2010tiger.shp'
        ))

    g = get_generator(t)

    self.assertIsInstance(g, ShapefileSource)

    self.assertEqual([{
        'name': 'id',
        'type': 'int'
    }, {
        'name': 'SRA',
        'type': 'int'
    }, {
        'name': 'NAME',
        'type': 'str'
    }, {
        'name': 'geometry',
        'type': 'geometry_type'
    }], g.columns)

    self.assertEqual(['id', 'SRA', 'NAME', 'geometry'], g.headers)

    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(42, len(list(g)))
def test_geo(self):
    """Resolve a shape+http URL, print the intermediate objects and check
    the row count."""
    us = 'shape+http://s3.amazonaws.com/public.source.civicknowledge.com/sangis.org/Subregional_Areas_2010.zip'

    u = parse_app_url(us)

    r = u.get_resource()
    print(type(r), r)

    t = r.get_target()
    print(type(t), t)

    g = get_generator(t)
    print(type(g))

    print(g.columns)
    print(g.headers)

    # assertEquals is a deprecated alias; use assertEqual.
    self.assertEqual(42, len(list(g)))
def test_basic(self):
    # Resolve a censusreporter:// URL and exercise the generator, the
    # dataframe accessor, and the margin-of-error arithmetic helpers.
    from publicdata.censusreporter.url import CensusReporterURL
    from publicdata.censusreporter.generator import CensusReporterSource

    u = parse_app_url('censusreporter://B01001/140/05000US06073')

    self.assertEqual(629, len(list(u.generator)))

    self.assertIsInstance(u, CensusReporterURL)
    self.assertIsInstance(u.generator, CensusReporterSource)

    B01001 = u.generator.dataframe()

    self.assertEqual(3223096.0, B01001.B01001001.sum())

    #print(B01001.titles.iloc[:2].T)

    cols = [
        'geoid',
        'B01001001',  # Total Population
        'B01001002',  # Total Male
        'B01001026',  # Total Female
        'B01001013',
        'B01001014',  # Males, 35-39 and 40-44
        'B01001037',
        'B01001038'  # Female, 35-39 and 40-44
    ]

    df = B01001[cols].copy()

    # sum_m / ratio / proportion return (value, margin-of-error) pairs.
    df['male_35_44'], df['male_35_44_m90'] = df.sum_m('B01001013', 'B01001014')

    df['female_35_44'], df['female_35_44_m90'] = df.sum_m('B01001037', 'B01001038')

    df['m_ratio'], df['m_ratio_m90'] = df.ratio('male_35_44', 'B01001002')

    print(len(df.proportion('male_35_44', 'female_35_44')))

    df['mf_proprtion'], df['mf_proprtion_m90'] = df.proportion('male_35_44', 'female_35_44')

    self.assertEqual(211707.0, df.female_35_44.dropna().sum())
    self.assertEqual(82, int(df.m_ratio.dropna().sum()))
def test_census_shapes(self):
    """Resolve a censusreportergeo:// URL to a shapefile resource and check
    the resource URL, types and row count."""
    from publicdata.censusreporter.url import CensusReporterShapeURL
    from rowgenerators.appurl.shapefile import ShapefileUrl, ShapefileShpUrl
    from rowgenerators.generator.shapefile import ShapefileSource

    u = parse_app_url('censusreportergeo://B01003/140/05000US06073')

    self.assertTrue(str(u.resource_url).endswith('&format=shp'))

    self.assertIsInstance(u, CensusReporterShapeURL)

    r = u.get_resource()
    self.assertIsInstance(r, ShapefileUrl)

    self.assertTrue(str(r).endswith('/latest.zip#.%2A%5C.shp%24'), str(r))

    g = r.generator
    self.assertIsInstance(g, ShapefileSource)

    # assertEquals is a deprecated alias; use assertEqual. The trailing
    # bare 'return' the original had was a no-op and is dropped.
    self.assertEqual(629, len(list(g)))
def resource_url(self):
    """Build the Census Reporter data-download URL for this table/geography."""
    download_url = (
        "http://{host}/1.0/data/download/latest?table_ids={table_id}&geo_ids={sl}|{geoid}&format=shp"
        .format(host=self.api_host,
                table_id=self.table_id,
                sl=self.summary_level,
                geoid=self.geoid))

    return parse_app_url(download_url, downloader=self.downloader)
def process_schema(doc, resource, df):
    """Add schema entries to a metatab doc from a dataframe.

    Intuits column types from *df* and adds a Table term (with Column
    children) to the doc's Schema section. Returns the new Table term, or
    None if the schema already exists or intuition fails.
    """
    from rowgenerators import SourceError
    from requests.exceptions import ConnectionError
    from metapack.cli.core import extract_path_name, type_map
    from metapack_build.core import alt_col_name
    from tableintuit import TypeIntuiter
    from rowgenerators.generator.python import PandasDataframeSource
    from appurl import parse_app_url

    # Ensure the doc has a Schema section to add the table to.
    try:
        doc['Schema']
    except KeyError:
        doc.new_section('Schema', ['DataType', 'Altname', 'Description'])

    schema_name = resource.get_value('schema', resource.get_value('name'))

    schema_term = doc.find_first(term='Table', value=schema_name, section='Schema')

    if schema_term:
        logger.info("Found table for '{}'; skipping".format(schema_name))
        return

    path, name = extract_path_name(resource.url)

    logger.info("Processing {}".format(resource.url))

    si = PandasDataframeSource(
        parse_app_url(resource.url),
        df,
        cache=doc._cache,
    )

    try:
        ti = TypeIntuiter().run(si)
    except SourceError as e:
        # logger.warn is a deprecated alias of logger.warning.
        logger.warning("Failed to process '{}'; {}".format(path, e))
        return
    except ConnectionError as e:
        logger.warning("Failed to download '{}'; {}".format(path, e))
        return

    table = doc['Schema'].new_term('Table', schema_name)

    logger.info("Adding table '{}' to metatab schema".format(schema_name))

    for i, c in enumerate(ti.to_rows()):
        raw_alt_name = alt_col_name(c['header'], i)
        # Only record an altname when it differs from the header.
        alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

        table.new_child(
            'Column', c['header'],
            datatype=type_map.get(c['resolved_type'], c['resolved_type']),
            altname=alt_name,
            description=df[c['header']].description
            if hasattr(df, 'description') and df[c['header']].description else '')

    return table
def __init__(self, url, name=None, proto=None, resource_format=None,
             target_file=None, target_segment=None, target_format=None,
             encoding=None, columns=None, generator_args=None,
             **kwargs):
    """
    The ``header_lines`` can be a list of header lines, or one of a few
    special values:

    * [0]. The header line is the first line in the dataset.
    * False. The header line is not specified, so it should be intuited.
    * None or 'none'. There is no header line, and it should not be intuited.

    :param url:
    :param name: An optional name for the source
    :param proto: Either the scheme of the url, or the scheme extension.
        One of http, https, gs, socrata. Forces how the URL is interpreted.
    :param target_format: Forces the file format, which may be either the
        downloaded resource, or an internal file in a ZIP archive, and is
        usually taken from the file extension. May be any typical extension
        string.
    :param file: A reference to an internal file in a Zip archive. May be a
        string, or a regular expression.
    :param segment: A reference to a worksheet in a spreadsheet. May be a
        string or a number.
    :param resource_format: The file format of the object the URL points to,
        such as a ZIP file, which may have an internal file of another type.
    :param encoding: The file encoding.
    :param kwargs: Stored and made available to generators
    :return:

    The segment may have one or two parameters. If it contains a ';', there
    are two parameters. The first will identify a spreadsheet file in an
    archive, and the second identifies a worksheet in the file.
    """

    if isinstance(url, Url):
        self._url = url
    else:
        # Formats are lower-cased before parsing; None passes through.
        self._url = parse_app_url(url,
                                  proto=proto,
                                  resource_format=resource_format.lower() if resource_format else resource_format,
                                  target_file=target_file,
                                  target_segment=target_segment,
                                  target_format=target_format.lower() if target_format else target_format,
                                  encoding=encoding)

    # Generate a name when the caller doesn't supply one.
    self.name = name if name else str(uuid4())
    self.columns = columns
    self.download_time = None  # Set externally

    self.generator_args = generator_args

    self.kwargs = kwargs
def get_generator(source, **kwargs):
    """Return a row-generating Source for *source*.

    *source* may be an existing Source (returned as-is), a URL string, an
    Url object, a generator, or any iterable. The candidate entry-point
    names are built from the source's format/scheme, then the matching
    generator class with the lowest priority value is instantiated.

    :raises RowGeneratorError: if the source type is unknown, no generator
        class matches, or instantiation fails.
    """
    # collections.Iterable was removed in Python 3.10; the abc module is
    # the supported location.
    from collections.abc import Iterable
    from rowgenerators import Source

    names = []

    if isinstance(source, Source):
        return source

    if isinstance(source, str):
        ref = parse_app_url(source).get_resource().get_target()
        try:
            names.append('.{}'.format(ref.target_format))
        except AttributeError:
            pass

    elif inspect.isgenerator(source):
        names.append('<generator>')
        ref = source

    elif isinstance(source, Iterable):
        names.append('<iterator>')
        ref = source

    elif hasattr(source, '__iter__'):
        names.append('<iterator>')
        ref = source

    elif isinstance(source, Url):
        ref = source

        # Each attribute is optional on a given Url subclass, so probe
        # them individually.
        try:
            names.append('.{}'.format(ref.target_format))
        except AttributeError:
            pass

        try:
            names.append('{}+'.format(ref.scheme_extension))
        except AttributeError:
            pass

        try:
            names.append('{}:'.format(ref.scheme))
        except AttributeError:
            pass

        try:
            names.append('<{}>'.format(ref.__class__.__name__))
        except AttributeError:
            pass

    else:
        raise RowGeneratorError("Unknown arg type for source: '{}'".format(
            type(source)))

    # Lowest priority value wins.
    classes = sorted([
        ep.load() for ep in iter_entry_points(group='rowgenerators')
        if ep.name in names
    ], key=lambda cls: cls.priority)

    if not classes:
        raise RowGeneratorError(
            "Can't find generator for source '{}' \nproto={}, resource_format={}, target_format={} "
            .format(source, ref.proto, ref.resource_format, ref.target_format))

    try:
        return classes[0](ref, **kwargs)
    except Exception as e:
        raise RowGeneratorError(
            "Failed to instantiate generator for class '{}', ref '{}'".format(
                classes[0], ref)) from e