def test_urls(self):
    import rowgenerators as rg

    gdf = rg.geoframe('censusgeo://CA/140')
    print(gdf.set_index('geoid').head())

    return  # NOTE: early return; the checks below are skipped

    u = rg.parse_app_url('census://CA/140/B17001')
    t = u.get_resource().get_target()
    print(t, t.year, t.release)
    self.assertEqual('census://CA/140/B17001', str(t))
    self.assertEqual(2016, t.year)
    self.assertEqual(5, t.release)

    u = rg.parse_app_url('census://2015/3/CA/140/B17001')
    t = u.get_resource().get_target()
    print(t, t.year, t.release)
    self.assertEqual('census://2015/3/CA/140/B17001', str(t))
    self.assertEqual(2015, t.year)
    self.assertEqual(3, t.release)

    gdf = t.geoframe()
    self.assertEqual(43.083, gdf.area.sum().round(3))

    gdf = rg.geoframe('census://CA/140/B17001')
    self.assertEqual(43.083, gdf.area.sum().round(3))

    gdf = rg.geoframe('censusgeo://CA/140')
    self.assertEqual(43.083, gdf.area.sum().round(3))
def __iter__(self): """Iterate the estimates and margins, interleaved""" yield self.file_headers for e, m in zip(parse_app_url(self.est_url).generator, parse_app_url(self.margin_url).generator): yield e[:6] + list(ileave(e[6:], m[6:]))
def __iter__(self):
    yield self.file_headers

    for e, m in zip(parse_app_url(self.est_url).generator,
                    parse_app_url(self.margin_url).generator):
        yield e[:6] + list(ileave(e[6:], m[6:]))
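# The two __iter__ implementations above pair each estimate row with the matching
# margin row and interleave their data columns after the six identifying columns.
# `ileave` is not defined in this section; the sketch below is a minimal assumption
# of what such an interleaving helper could look like, not the library's actual code.
from itertools import chain

def ileave(estimates, margins):
    # Interleave two equal-length sequences: [e0, m0, e1, m1, ...]
    return chain.from_iterable(zip(estimates, margins))

# With e = [id1, ..., id6, 10, 20] and m = [id1, ..., id6, 1, 2],
# e[:6] + list(ileave(e[6:], m[6:])) yields [id1, ..., id6, 10, 1, 20, 2].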
def test_geo_dataframe(self):
    u = parse_app_url('census://2016/5/RI/140/B01002')
    self.assertEqual(244, len(u.geoframe().geometry))

    u = parse_app_url('censusgeo://2016/5/RI/140')
    self.assertEqual(244, len(u.geoframe().geometry))
def __iter__(self):
    headers = list(parse_app_url(self.geo_header_url).generator)
    yield headers[0]

    t = parse_app_url(self.geo_url).get_resource().get_target()
    t.encoding = 'latin1'

    yield from t.generator
def mt_open_package(self, line):
    """Find the metatab file for this package, open it, and load it into the namespace."""
    from metapack.jupyter.ipython import open_package

    parse_argstring(self.mt_open_package, line)

    self.shell.user_ns[MT_DOC_VAR] = open_package(self.shell.user_ns)

    if self.mt_doc.package_url:
        parse_app_url(self.mt_doc.package_url)
def _load_documentation_files(self):
    from metapack_jupyter.exporters import DocumentationExporter

    notebook_docs = []

    # First find and remove notebooks from the docs. These will get processed to create
    # normal documents.
    try:
        for term in list(self.doc['Documentation'].find('Root.Documentation')):
            u = parse_app_url(term.value)

            if u is not None and u.target_format == 'ipynb' and u.proto == 'file':
                notebook_docs.append(term)
                self.doc.remove_term(term)
    except KeyError:
        self.warn("No documentation defined in metadata")

    # Process all of the normal files
    super()._load_documentation_files()

    fw = FilesWriter()
    fw.build_directory = join(self.package_path.path, 'docs')

    # Now, generate the notebook documents directly into the filesystem package
    for term in notebook_docs:
        de = DocumentationExporter(base_name=term.name or slugify(term.title))

        u = parse_app_url(term.value)

        nb_path = join(self.source_dir, u.path)  # Only works if the path is relative.

        try:
            output, resources = de.from_filename(nb_path)
            fw.write(output, resources, notebook_name=de.base_name + '_full')  # Write notebook html with inputs
            de.update_metatab(self.doc, resources)
        except Exception as e:
            from metapack.cli.core import warn
            warn("Failed to convert document for {}: {}".format(term.name, e))
def add_resource(mt_file, ref, cache):
    """Add a Resources entry, downloading and intuiting the file, and replacing
    entries with the same reference"""

    if isinstance(mt_file, MetapackDoc):
        doc = mt_file
    else:
        doc = MetapackDoc(mt_file)

    if 'Resources' not in doc:
        doc.new_section('Resources')

    doc['Resources'].args = [
        e for e in set(doc['Resources'].args + ['Name', 'StartLine', 'HeaderLines', 'Encoding']) if e]

    seen_names = set()

    u = parse_app_url(ref)

    # The web and file URLs don't list the same way.
    if u.proto == 'file':
        entries = u.list()
    else:
        entries = [ssu for su in u.list() for ssu in su.list()]

    for e in entries:
        add_single_resource(doc, e, cache=cache, seen_names=seen_names)

    write_doc(doc, mt_file)
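# A hedged usage sketch for add_resource above: pass a metatab file path (or a
# MetapackDoc), a source URL to intuit, and a rowgenerators cache. The import
# locations and the example URL here are illustrative assumptions, not verified paths.
#
# from metapack.cli.core import add_resource   # import location is an assumption
# from rowgenerators import get_cache
#
# add_resource('metadata.csv',
#              'http://example.com/downloads/data.zip',  # illustrative URL
#              cache=get_cache())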
def maybe_trial_build(m):
    """Update the metadata for a trial build, then restore it"""
    from shutil import copyfile

    if not m.args.trial:
        yield False, m.mt_file
        return

    if not m.doc._has_semver():
        raise MetapackError("To use trial builds, package must have a semantic version")

    prt('Building a trial')

    mt_file = Path(m.mt_file.fspath).parent.joinpath('trial.csv')

    copyfile(m.mt_file.fspath, mt_file)

    doc = MetapackDoc(mt_file)

    version = doc['Root'].find_first('Root.Version')
    vb = version.get_or_new_child('Version.Build')
    vb.value = 'trial'

    try:
        doc.update_name()
        doc.write()
        yield True, parse_app_url(str(mt_file), downloader)
    finally:
        mt_file.unlink()
def __init__(self, source_ref=None, package_root=None, callback=None, env=None):
    from metapack.doc import MetapackDoc

    self._downloader = source_ref._downloader
    self._cache = self._downloader.cache
    self._source_ref = source_ref
    self.source_dir = dirname(parse_app_url(self._source_ref).path)

    self.package_root = package_root
    self._callback = callback
    self._env = env if env is not None else {}

    self._source_doc = MetapackDoc(self._source_ref, cache=self._cache)  # this one stays constant
    self._doc = MetapackDoc(self._source_ref, cache=self._cache)  # this one gets edited

    self._last_write_path = None

    if not self.doc.find_first_value('Root.Name'):
        raise PackageError("Package must have Root.Name term defined")
def test_age_dimensions(self):
    """Check that no table with 'year' in its title fails to get a parsed age range"""

    tm = TableMeta(2016, 5)

    age_tables = []

    for t_id, table in tm.tables.items():
        if 'by age' in table.title.lower():
            age_tables.append(t_id)

    for at in age_tables:
        u = parse_app_url('census://2016/5/RI/40/{}'.format(at.lower()))
        g = u.generator
        t = g.table

        parse_errors = []

        for c in t.columns:
            if ('_m90' not in c.unique_id and 'year' in c.description
                    and not c.age_range
                    and '1 year ago' not in c.description
                    and 'year-round' not in c.description):
                parse_errors.append(c)

        for parse_error in parse_errors:
            print(parse_error.row)

        self.assertEqual(0, len(parse_errors))
def test_appurl_US(self):
    from rowgenerators import parse_app_url
    from rowgenerators.appurl.web.download import logger as download_logger
    from publicdata.census.files import logger

    logging.basicConfig()
    logger.setLevel(logging.DEBUG)

    # Iterate over all counties in the US
    u = parse_app_url('census://2016/5/US/county/B01003')

    rows = list(u.generator)

    states = set()
    counties = set()

    for row in rows[1:]:
        states.add(row[1])
        counties.add(row[3])

    from collections import Counter
    c = Counter(row[3] for row in rows[1:])

    for k, v in c.items():
        if v > 1:
            print(k, v)

    self.assertEqual(52, len(states))
    self.assertEqual(3220, len(counties))
    self.assertEqual(3220, len(rows[1:]))
def save(self, path=None):
    from metapack import MetapackPackageUrl

    # HACK ...
    if not self.doc.ref:
        self.doc._ref = self.package_path  # Really should not do this but ...

    self.check_is_ready()

    self.load_declares()

    self.doc.cleanse()

    self._load_resources()

    self._relink_documentation()

    self._clean_doc()

    if path is None:
        if self.package_path.inner.proto == 'file':
            path = self.package_path.path
        else:
            raise PackageError("Can't write doc to path: '{}'".format(path))

    self.doc['Root'].get_or_new_term('Root.Issued').value = datetime_now()

    self._last_write_path = path

    self.doc.write_csv(path)

    return parse_app_url(abspath(path)).as_type(MetapackPackageUrl)
def acs_dataframe(year, release, stateab, summary_level, table):
    """
    Return a dataframe with ACS data

    :param year: ACS year
    :param release: Release, either 5 or 1
    :param stateab: State abbreviation, or US
    :param summary_level: Summary level, either a number or string
    :param table: Table ID
    :return:
    """

    # NOTE: the body below exercises a fixed RI query with unittest-style assertions
    # ('self'), rather than using the parameters documented above.
    u = parse_app_url('census://2016/5/RI/140/B01002')

    print(type(u))

    g = u.generator

    rows = list(g)

    self.assertEqual(245, len(rows))

    df = u.generator.dataframe()

    self.assertEqual(9708, int(df['B01002_001'].sum()))
    self.assertEqual(809, int(df['B01002_001_m90'].sum()))
    self.assertEqual(9375, int(df['B01002_002'].sum()))
    self.assertEqual(1171, int(df['B01002_002_m90'].sum()))
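# The docstring above maps its parameters onto the census:// URL scheme used
# throughout this section. The sketch below shows that mapping; it is an assumption
# inferred from the URL pattern in the body, not the actual implementation.
def build_census_url(year, release, stateab, summary_level, table):
    # e.g. build_census_url(2016, 5, 'RI', 140, 'B01002') -> 'census://2016/5/RI/140/B01002'
    return 'census://{}/{}/{}/{}/{}'.format(year, release, stateab, summary_level, table)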
def load(self, url, load_all_resources=False):
    """Load a package and possibly one or all resources, from a url"""

    u = parse_app_url(url)

    d = MetapackDoc(u.clear_fragment())

    db_doc = self.document(name=d.get_value('Root.Name'))

    if not db_doc:
        self.add_doc(d)
        db_doc = self.document(name=d.get_value('Root.Name'))
        assert db_doc

    resources = []

    if load_all_resources:
        for r in self.resources(db_doc):
            self.load_resource(r)
            resources.append(r)
    elif u.target_file:
        r = self.resource(db_doc, u.target_file)
        self.load_resource(r)
        resources.append(r)

    return (db_doc, resources)
def resource_url(self):
    predicates = {}

    url = self.dataset.fetch_url(*self.target_file.split(','),
                                 geo_for=self.geo_for,
                                 geo_in=self.geo_in,
                                 **predicates)

    return parse_app_url(url, downloader=self.downloader)
def __init__(self, year, release, stusab, summary_level, seq):
    assert seq is not None

    super().__init__(year, release, stusab, summary_level, seq)

    # Url to the estimates
    self.est_url = seq_estimate_url(self.year, self.release, self.stusab, self.summary_level, self.seq)

    # Url to the margins
    self.margin_url = seq_margin_url(self.year, self.release, self.stusab, self.summary_level, self.seq)

    # Url to the file header, which includes fancy descriptions. The file is a
    # 2-row Excel file, intended to be used as the headers for the data files.
    # The first row is the column ids, and the second is the titles. The first
    # 6 columns are for STUSAB, SEQUENCE, LOGRECNO, etc., so they are cut off.
    self.header_url = seq_header_url(self.year, self.release, self.stusab, self.summary_level, self.seq)

    # There are only two rows in the file: the first is the file headers (column IDs)
    # and the second is longer descriptions.
    self._file_headers, _descriptions = list(parse_app_url(self.header_url).generator)

    # At least some of the fields have '%' as a separator instead of ' - '
    self._descriptions = [c.replace('%', ' -') for c in _descriptions]
def test_build_s3_package(self):
    from metapack_build.build import make_s3_csv_package

    cache = Downloader().cache

    fs_url = MetapackUrl(
        '/Volumes/Storage/proj/virt-proj/metapack/metapack/test-data/packages/example.com/'
        'example-package/_packages/example.com-example_data_package-2017-us-1/metadata.csv',
        downloader=downloader)

    # _, url, created = make_excel_package(fs_url, package_dir, get_cache(), {}, False)
    # _, url, created = make_zip_package(fs_url, package_dir, get_cache(), {}, False)
    # _, url, created = make_csv_package(fs_url, package_dir, get_cache(), {}, False)

    package_dir = parse_app_url('s3://test.library.civicknowledge.com/metatab',
                                downloader=downloader)

    _, url, created = make_s3_csv_package(fs_url, package_dir, cache, {}, False)

    print(url)
    print(created)
def _write_path(self, path):
    if path:
        u = parse_app_url(str(path))
    else:
        u = self.ref

    if u.scheme != 'file':
        raise MetatabError("Can't write file to URL '{}'".format(str(path)))

    path = u.fspath

    if path is None:
        try:
            path = pathlib.Path(self.ref.fspath)
        except AttributeError:
            if isinstance(self.ref, str):
                path = pathlib.Path(self.ref)
            else:
                path = pathlib.Path(DEFAULT_METATAB_FILE)

        return path
    else:
        return pathlib.Path(str(path))
def save(self, path=None):
    self.check_is_ready()

    # Reset the ref so that resource.resolved_url links to the resources as written in S3
    self._doc._ref = self.access_url.join('metatab.csv')

    # Copy all of the files from the Filesystem package
    for root, dirs, files in walk(self.source_dir):
        for f in files:
            source = join(root, f)
            rel = source.replace(self.source_dir, '').strip('/')

            with open(source, 'rb') as f:
                self.write_to_s3(rel, f)

    # Re-write the URLs for the datafiles
    for r in self.datafiles:
        r.url = self.bucket.access_url(r.url)
        # s3_url = self.bucket.private_access_url(r.url)
        # r.new_child('S3Url', s3_url)

    # Re-write the HTML index file.
    self._write_html()

    # Rewrite Documentation urls:
    for r in self.doc.find(['Root.Documentation', 'Root.Image']):
        url = parse_app_url(r.url)

        if url.proto == 'file':
            r.url = self.bucket.access_url(url.path)

    return self.access_url
def geo_url(self):
    """Return a url for the geofile for this Census file"""
    from geoid.acs import AcsGeoid

    us = tiger_url(self.year, self.summary_level, AcsGeoid.parse(self.geoid).stusab)

    return parse_app_url(us)
def test_geo_dataframe(self):
    u = parse_app_url('census://2016/5/RI/140/B01002')

    gdf = u.generator.geoframe

    print(gdf.head())
    print(gdf.geometry.head())
def _load_files(self):
    """Load other files"""

    def copy_dir(path):
        for (dr, _, files) in walk(path):
            for fn in files:

                if any([e in fn for e in self.excludes]):
                    continue

                relpath = dr.replace(self.source_dir, '').strip('/')
                src = parse_app_url(join(dr, fn))
                dest = join(relpath, fn)

                resource = src.get_resource()

                self._load_file(dest, resource.read())

    for term in self.resources(term='Root.Pythonlib'):

        uv = parse_app_url(term.value)
        ur = parse_app_url(self.source_dir)

        # In the case that the input doc is a file, and the ref is to a file,
        # try interpreting the file as relative.
        if ur.proto == 'file' and uv.proto == 'file':

            # Either a file or a directory
            path = join(self.source_dir, uv.path)

            if isdir(path):
                copy_dir(path)

        else:
            # Load it as a URL
            f = self._get_ref_contents(term)
            try:
                self._load_file(term.value, f.read())
            except Exception as e:
                raise PackageError("Failed to load file for '{}': {}".format(term.value, e))

    # Copy the whole notebooks directory, excluding some files.
    nb_dir = join(self.source_dir, 'notebooks')

    if exists(nb_dir) and isdir(nb_dir):
        copy_dir(nb_dir)
def shape_url(self):
    """Return the shapefile URL"""
    from geoid.acs import AcsGeoid

    us = tiger_url(self.year, self.summary_level, AcsGeoid.parse(self.geoid).stusab)

    return parse_app_url(us)
def run_url_scrape(args):
    m = MetapackCliMemo(args, downloader)

    from metapack.util import scrape_urls_from_web_page

    doc = m.doc
    url = m.args.url

    doc['resources'].new_term('DownloadPage', url)

    d = scrape_urls_from_web_page(url)

    if d.get('error'):
        err(d.get('error'))

    new_resources = 0
    new_documentation = 0

    if not args.no_resources:
        for k, v in d['sources'].items():
            u = parse_app_url(v['url'])
            t = doc['Resources'].new_term('DataFile', v['url'], name=u.fspath.stem,
                                          description=v.get('description'))
            new_resources += 1
            if args.verbose:
                prt(t, t.props)

    if not args.no_docs:
        for k, v in d['external_documentation'].items():
            term_name = classify_url(v['url'])
            u = parse_app_url(v['url'])
            t = doc['Documentation'].new_term(term_name, v['url'], name=u.fspath.stem,
                                              description=v.get('description'))
            new_documentation += 1
            if args.verbose:
                prt(t, t.props)

    prt("Added {} resource terms and {} documentation terms".format(new_resources, new_documentation))

    if not args.dry_run:
        write_doc(doc)
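# Inferred from the calls in run_url_scrape above, scrape_urls_from_web_page appears
# to return a dict shaped roughly like the sketch below. This is an inference from
# usage, not the function's documented contract; keys and values are illustrative.
#
# {
#     'error': None,  # or an error message string
#     'sources': {
#         'some-key': {'url': 'http://example.com/data.csv', 'description': '...'},
#     },
#     'external_documentation': {
#         'some-key': {'url': 'http://example.com/docs.html', 'description': '...'},
#     },
# }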
def test_titles(self):
    import rowgenerators as rg

    # df = rg.dataframe(f'census:/2017/1/CA/50/B22003')
    # print(df.titles.head().T)

    u = parse_app_url('census:/2017/1/CA/50/B22003')

    for e in u.generator.table.columns:
        print(e.row)
def test_appurl(self):
    from publicdata.census.util import sub_geoids, sub_summarylevel
    from rowgenerators import parse_app_url
    from publicdata.census.exceptions import CensusParsingException

    # self.assertEqual(245, list(parse_app_url('census://2016/5/RI/140/B17001').generator))
    # self.assertEqual(245, list(parse_app_url('census://RI/140/B17001').generator))

    with self.assertRaises(ValueError):
        sub_geoids('foobar')

    u = parse_app_url('census://RI/140/B17001')
    self.assertEqual('B17001', u.tableid)
    self.assertEqual('04000US44', u.geoid)

    u = parse_app_url('census://B17001/140/RI')
    self.assertEqual('B17001', u.tableid)
    self.assertEqual('04000US44', u.geoid)

    u = parse_app_url('census://140/RI/B17001')
    self.assertEqual('B17001', u.tableid)
    self.assertEqual('04000US44', u.geoid)

    with self.assertRaises(CensusParsingException):
        parse_app_url('census://B17001/Frop/140')

    with self.assertRaises(CensusParsingException):
        parse_app_url('census://BINGO/RI/140')
def resolved_url(self):
    """Return a URL to the PUMS file"""

    # '{year}/{release}-Year/csv_{record_type}{state}.zip'
    us = self.url_proto.format(year=self._year,
                               release=self._release,
                               record_type=self.record_type.lower(),
                               state=self._state.lower())

    return parse_app_url(us)
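# A worked example of the url_proto pattern in the comment above, assuming a
# person-level ('p') record type for Rhode Island. Only the pattern shown in the
# comment is filled in; any base-URL prefix is omitted because it is not shown here.
#
# '{year}/{release}-Year/csv_{record_type}{state}.zip'.format(
#     year=2016, release=5, record_type='p', state='ri')
# -> '2016/5-Year/csv_pri.zip'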
def test_create(self):
    from publicdata.nlsy import NLSY97

    u = parse_app_url('nlsy+file:test_data/test-package/')

    nlsy = u.nlsy

    print(nlsy)
def __init__(self, ref=None, decl=None, package_url=None, cache=None, resolver=None, clean_cache=False):
    self._input_ref = ref

    self._cache = cache if cache else get_cache()

    self.decl_terms = {}
    self.decl_sections = {}

    self.terms = []
    self.sections = OrderedDict()
    self.super_terms = {}
    self.derived_terms = {}
    self.errors = []
    self.package_url = package_url

    self.resolver = resolver or WebResolver()

    if decl is None:
        self.decls = []
    elif not isinstance(decl, MutableSequence):
        self.decls = [decl]
    else:
        self.decls = decl

    self.root = RootSectionTerm(doc=self)
    self.add_section(self.root)

    self.load_declarations(self.decls)

    if ref:
        try:
            self._ref = parse_app_url(ref)

            if self._ref.scheme == 'file':
                try:
                    self._mtime = getmtime(self._ref.path)
                except (FileNotFoundError, OSError):
                    self._mtime = 0
            else:
                self._mtime = 0

        except AppUrlError:
            # ref is probably a generator, not a string or Url
            self._ref = None

        self._term_parser = TermParser(ref, resolver=self.resolver, doc=self)

        try:
            self.load_terms(self._term_parser)
        except SourceError as e:
            raise MetatabError("Failed to load terms for document '{}': {}".format(self._ref, e))

    else:
        self._ref = None
        self._term_parser = None
        self._mtime = time()