def validate_path(schema, jsonfiles):
    schema = json.loads(schema.read())
    for path in utils.get_files(jsonfiles):
        path_components = utils.get_path_parts(path)
        regex = schema[path_components[0]]
        if not re.compile(regex).match(path):
            raise AssertionError('Path "%s" does not match spec "%s"' % (path, regex))
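# Minimal usage sketch for validate_path above (assumed, not from the source):
# the schema file maps a path's first component to a regex the full path must
# match. An in-memory file stands in for the schema handle; the file names and
# the regex are hypothetical, so the call is left commented out.
import io

_example_schema = io.StringIO('{"sources": "^sources/[a-z_]+/[a-z_]+[.]json$"}')
# validate_path(_example_schema, 'sources')  # would raise AssertionError on a mismatch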
def find_directory_in_tree(root, dirname):
    path_parts = utils.get_path_parts(dirname)
    cur_path = os.sep
    cur_node = root
    for part in path_parts:
        cur_path = os.path.join(cur_path, part)
        if part not in cur_node.children:
            # Note: rootdir, uid_map and gid_map are not parameters of this
            # function; they are presumably bound in an enclosing or module
            # scope in the original context.
            new_node = get_directory_node(rootdir, cur_path, uid_map, gid_map)
            cur_node.children[part] = new_node
        cur_node = cur_node.children[part]
    return cur_node
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    for path in utils.get_files(sources):
        pathparts = utils.get_path_parts(path)
        pathparts[0] = output.strip(os.sep)
        pathparts[-1] = pathparts[-1].replace('.json', '.geojson')

        outdir = os.sep.join(pathparts[:-1])
        outfile = os.sep.join(pathparts)

        source = utils.read_json(path)
        urlfile = urlparse(source['url']).path.split('/')[-1]

        if not hasattr(adapters, source['filetype']):
            utils.error('Unknown filetype', source['filetype'], '\n')
            continue

        if os.path.isfile(outfile) and not force:
            utils.error('Skipping', path, 'since generated file exists.',
                        'Use --force to regenerate.', '\n')
            continue

        utils.info('Downloading', source['url'])

        try:
            fp = utils.download(source['url'])
        except IOError:
            utils.error('Failed to download', source['url'], '\n')
            continue

        utils.info('Reading', urlfile)

        try:
            geojson = getattr(adapters, source['filetype']).read(fp, source['properties'])
        except IOError:
            utils.error('Failed to read', urlfile)
            continue
        finally:
            os.remove(fp.name)

        utils.make_sure_path_exists(outdir)
        utils.write_json(outfile, geojson)

        utils.success('Done. Processed to', outfile, '\n')
def ilist(self):
    """Return an iterator of hashes of all files in CAS.

    Return:
        The iterator of hashes. The iteration order is unspecified.
    """
    for (dirpath, dirnames, filenames) in os.walk(self._root):
        # All files in uncorrupted CAS were previously stored
        # by a .store() call, so we construct the hashes by
        # joining shards encoded in dirpath and the file names.
        path_parts = utils.get_path_parts(dirpath)
        shard_parts = path_parts[-self._sharding:]
        for hash_without_shards in filenames:
            full_hash = ''.join(shard_parts) + hash_without_shards
            yield full_hash
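# Hedged illustration of the sharded layout that ilist() above inverts; the helper
# name, the shard width, and the default sharding depth are assumptions, not part
# of the CAS implementation. With sharding=2 and width=2, a hash such as
# "deadbeef..." would live at <root>/de/ad/beef..., and ilist() rebuilds
# "de" + "ad" + "beef..." from the dirpath shards plus the file name.
import os

def _sharded_path_sketch(root, full_hash, sharding=2, width=2):
    shards = [full_hash[i * width:(i + 1) * width] for i in range(sharding)]
    remainder = full_hash[sharding * width:]
    return os.path.join(root, *shards, remainder)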
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    logging.getLogger('shapely.geos').setLevel(logging.WARNING)

    catalog_features = []
    failures = []
    path_parts_to_skip = len(utils.get_path_parts(output))
    success = True
    for path in utils.get_files(sources):
        try:
            utils.info("Processing " + path)
            pathparts = utils.get_path_parts(path)
            pathparts[0] = output.strip(os.sep)
            pathparts[-1] = pathparts[-1].replace('.json', '.geojson')

            outdir = os.sep.join(pathparts[:-1])
            outfile = os.sep.join(pathparts)

            source = utils.read_json(path)
            urlfile = urlparse(source['url']).path.split('/')[-1]

            if not hasattr(adapters, source['filetype']):
                utils.error('Unknown filetype', source['filetype'], '\n')
                failures.append(path)
                continue

            if os.path.isfile(outfile) and \
               os.path.getmtime(outfile) > os.path.getmtime(path) and not force:
                utils.error('Skipping', path, 'since generated file exists.',
                            'Use --force to regenerate.', '\n')
                with open(outfile, "rb") as f:
                    geojson = json.load(f)
                    properties = geojson['properties']
            else:
                utils.info('Downloading', source['url'])

                try:
                    fp = utils.download(source['url'])
                except IOError:
                    utils.error('Failed to download', source['url'], '\n')
                    failures.append(path)
                    continue

                utils.info('Reading', urlfile)

                if 'filter' in source:
                    filterer = BasicFilterer(source['filter'],
                                             source.get('filterOperator', 'and'))
                else:
                    filterer = None

                try:
                    geojson = getattr(adapters, source['filetype'])\
                        .read(fp, source['properties'],
                              filterer=filterer,
                              layer_name=source.get("layerName", None),
                              source_filename=source.get("filenameInZip", None))
                except IOError as e:
                    utils.error('Failed to read', urlfile, str(e))
                    failures.append(path)
                    continue
                except zipfile.BadZipfile as e:
                    utils.error('Unable to open zip file', source['url'])
                    failures.append(path)
                    continue
                finally:
                    # clean up the temp download, as in the fuller variants of this function
                    os.remove(fp.name)
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s [%(levelname)s] - %(message)s',
                        datefmt="%H:%M:%S")
    logging.getLogger('shapely.geos').setLevel(logging.WARNING)
    logging.getLogger('Fiona').setLevel(logging.WARNING)
    logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(
        logging.WARNING)
    requests.packages.urllib3.disable_warnings()
    # logging.getLogger('processing').setLevel(logging.DEBUG)

    catalog_features = []
    failures = []
    path_parts_to_skip = utils.get_path_parts(sources).index("sources") + 1
    success = True

    for path in utils.get_files(sources):
        try:
            logging.info("Processing " + path)
            pathparts = utils.get_path_parts(path)[path_parts_to_skip:]
            pathparts[-1] = pathparts[-1].replace('.json', '.geojson')

            outdir = os.path.join(output, *pathparts[:-1],
                                  pathparts[-1].replace('.geojson', ''))
            outfile = os.path.join(output, *pathparts)

            source = utils.read_json(path)
            urlfile = urlparse(source['url']).path.split('/')[-1]

            if not hasattr(adapters, source['filetype']):
                logging.error('Unknown filetype ' + source['filetype'])
                failures.append(path)
                continue

            read_existing = False
            if os.path.isfile(outfile):
                logging.info("Output file exists")
                if os.path.getmtime(outfile) > os.path.getmtime(path):
                    logging.info("Output file is up to date")
                    if not force:
                        read_existing = True
                        logging.warning('Skipping ' + path +
                                        ' since generated file exists. Use --force to regenerate.')
                else:
                    logging.info("Output is outdated, {} < {}".format(
                        datetime.datetime.fromtimestamp(os.path.getmtime(outfile)),
                        datetime.datetime.fromtimestamp(os.path.getmtime(path))))

            if read_existing:
                with open(outfile, "rb") as f:
                    geojson = json.load(f)
                    properties = geojson['properties']
            else:
                logging.info('Downloading ' + source['url'])

                try:
                    fp = utils.download(source['url'])
                except IOError:
                    logging.error('Failed to download ' + source['url'])
                    failures.append(path)
                    continue

                logging.info('Reading ' + urlfile)

                if 'filter' in source:
                    filterer = BasicFilterer(source['filter'],
                                             source.get('filterOperator', 'and'))
                else:
                    filterer = None

                try:
                    geojson = getattr(adapters, source['filetype'])\
                        .read(fp, source['properties'],
                              filterer=filterer,
                              layer_name=source.get("layerName", None),
                              source_filename=source.get("filenameInZip", None))
                except IOError as e:
                    logging.error('Failed to read ' + urlfile + " " + str(e))
                    failures.append(path)
                    continue
                except zipfile.BadZipfile as e:
                    logging.error('Unable to open zip file ' + source['url'])
                    failures.append(path)
                    continue
                finally:
                    os.remove(fp.name)

                if len(geojson['features']) == 0:
                    logging.error("Result contained no features for " + path)
                    continue

                excluded_keys = ['filetype', 'url', 'properties', 'filter', 'filenameInZip']
                properties = {k: v for k, v in list(source.items()) if k not in excluded_keys}
                properties['source_url'] = source['url']
                properties['feature_count'] = len(geojson['features'])
                logging.info("Generating demo point")
                properties['demo'] = geoutils.get_demo_point(geojson)

                geojson['properties'] = properties

                utils.make_sure_path_exists(os.path.dirname(outfile))

                # cleanup existing generated files
                if os.path.exists(outdir):
                    rmtree(outdir)
                filename_to_match, ext = os.path.splitext(pathparts[-1])
                output_file_dir = os.sep.join(utils.get_path_parts(outfile)[:-1])
                logging.info("looking for generated files to delete in " + output_file_dir)
                for name in os.listdir(output_file_dir):
                    base, ext = os.path.splitext(name)
                    if base == filename_to_match:
                        to_remove = os.path.join(output_file_dir, name)
                        logging.info("Removing generated file " + to_remove)
                        os.remove(to_remove)

                utils.write_json(outfile, geojson)

                logging.info("Generating label points")
                label_geojson = geoutils.get_label_points(geojson)
                label_path = outfile.replace('.geojson', '.labels.geojson')
                utils.write_json(label_path, label_geojson)

                logging.info('Done. Processed to ' + outfile)

            if "demo" not in properties:
                properties['demo'] = geoutils.get_demo_point(geojson)

            properties['path'] = "/".join(pathparts)

            catalog_entry = {
                'type': 'Feature',
                'properties': properties,
                'geometry': geoutils.get_union(geojson)
            }
            catalog_features.append(catalog_entry)

            if not os.path.exists(outdir) or not os.path.exists(
                    os.path.join(outdir, "units.json")):
                logging.info("Generated exploded GeoJSON to " + outdir)
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                # .json instead of .geojson, in case there is a unit named "source"
                utils.write_json(os.path.join(outdir, "source.json"), catalog_entry)
                units = []
                for feature in geojson['features']:
                    feature_id = str(feature['properties']['id'])
                    feature_id = feature_id.replace('/', '')
                    feature_filename = os.path.join(outdir, feature_id + ".geojson")
                    utils.write_json(feature_filename, feature)
                    units.append(feature['properties'])
                utils.write_json(os.path.join(outdir, "units.json"), units)
            else:
                logging.debug("exploded GeoJSON already exists, not generating")

        except Exception as e:
            logging.error(str(e))
            logging.exception("Error processing file " + path)
            failures.append(path)
            success = False

    catalog = {'type': 'FeatureCollection', 'features': catalog_features}
    utils.write_json(os.path.join(output, 'catalog.geojson'), catalog)

    if not success:
        logging.error("Failed sources: " + ", ".join(failures))
        sys.exit(-1)
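# Hedged sketch of the source JSON that the process() command above consumes; only
# keys the code actually reads are shown ('url', 'filetype', 'properties', plus the
# optional 'filter', 'filterOperator', 'layerName', 'filenameInZip'). The concrete
# values and the adapter name are made up for illustration.
_example_source = {
    "url": "https://example.com/data/boundaries.zip",  # fetched via utils.download()
    "filetype": "shapefile",                           # must match an attribute of `adapters`
    "properties": {"name": "NAME", "id": "GEOID"},     # forwarded to the adapter's read()
    "filenameInZip": "boundaries.shp",                 # optional: which file to read from the zip
    "layerName": "boundaries",                         # optional: which layer to read
    # any other keys are copied into the generated file's top-level properties
}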
def test_get_path_parts_4():
    path = '/abc'
    res = utils.get_path_parts(path)
    assert list(res) == ['abc']

def test_get_path_parts_3():
    path = '/'
    res = utils.get_path_parts(path)
    assert list(res) == []

def test_get_path_parts_2():
    path = '/foo/bar/baz/'
    with pytest.raises(ValueError):
        res = utils.get_path_parts(path)

def test_get_path_parts_1():
    path = '/foo/bar/baz'
    res = utils.get_path_parts(path)
    assert list(res) == ['foo', 'bar', 'baz']
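# Hedged sketch of what utils.get_path_parts would have to do to satisfy the four
# tests above, assuming a POSIX-style separator: split on os.sep, drop empty
# components, and reject a trailing separator. This is an inferred implementation,
# not the project's actual utils.get_path_parts.
import os

def _get_path_parts_sketch(path):
    if path != os.sep and path.endswith(os.sep):
        raise ValueError("path must not end with a separator: %r" % path)
    return [part for part in path.split(os.sep) if part]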
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    catalog_features = []
    failures = []
    path_parts_to_skip = len(utils.get_path_parts(output))
    success = True
    for path in utils.get_files(sources):
        try:
            utils.info("Processing " + path)
            pathparts = utils.get_path_parts(path)
            pathparts[0] = output.strip(os.sep)
            pathparts[-1] = pathparts[-1].replace('.json', '.geojson')

            outdir = os.sep.join(pathparts[:-1])
            outfile = os.sep.join(pathparts)

            source = utils.read_json(path)
            urlfile = urlparse(source['url']).path.split('/')[-1]

            if not hasattr(adapters, source['filetype']):
                utils.error('Unknown filetype', source['filetype'], '\n')
                failures.append(path)
                continue

            if os.path.isfile(outfile) and \
               os.path.getmtime(outfile) > os.path.getmtime(path) and not force:
                utils.error('Skipping', path, 'since generated file exists.',
                            'Use --force to regenerate.', '\n')
                with open(outfile, "rb") as f:
                    geojson = json.load(f)
                    properties = geojson['properties']
            else:
                utils.info('Downloading', source['url'])

                try:
                    fp = utils.download(source['url'])
                except IOError:
                    utils.error('Failed to download', source['url'], '\n')
                    failures.append(path)
                    continue

                utils.info('Reading', urlfile)

                if 'filter' in source:
                    filterer = BasicFilterer(source['filter'],
                                             source.get('filterOperator', 'and'))
                else:
                    filterer = None

                try:
                    geojson = getattr(adapters, source['filetype'])\
                        .read(fp, source['properties'],
                              filterer=filterer,
                              layer_name=source.get("layerName", None),
                              source_filename=source.get("filenameInZip", None))
                except IOError as e:
                    utils.error('Failed to read', urlfile, str(e))
                    failures.append(path)
                    continue
                except zipfile.BadZipfile as e:
                    utils.error('Unable to open zip file', source['url'])
                    failures.append(path)
                    continue
                finally:
                    # clean up the temp download, as in the fuller variants of this function
                    os.remove(fp.name)
def process(sources, output, force, force_summary):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] - %(message)s",
        datefmt="%H:%M:%S",
    )
    logging.getLogger("shapely.geos").setLevel(logging.WARNING)
    logging.getLogger("Fiona").setLevel(logging.WARNING)
    logging.getLogger("requests.packages.urllib3.connectionpool").setLevel(
        logging.WARNING)
    requests.packages.urllib3.disable_warnings()
    # logging.getLogger('processing').setLevel(logging.DEBUG)

    catalog_features = []
    failures = []
    path_parts_to_skip = utils.get_path_parts(sources).index("sources") + 1
    success = True

    for path in utils.get_files(sources):
        try:
            logging.info("Processing " + path)
            pathparts = utils.get_path_parts(path)[path_parts_to_skip:]
            pathparts[-1] = pathparts[-1].replace(".json", ".geojson")

            outdir = os.path.join(output, *pathparts[:-1],
                                  pathparts[-1].replace(".geojson", ""))
            outfile = os.path.join(output, *pathparts)

            source = utils.read_json(path)
            urlfile = urlparse(source["url"]).path.split("/")[-1]

            if not hasattr(adapters, source["filetype"]):
                logging.error("Unknown filetype " + source["filetype"])
                failures.append(path)
                continue

            read_existing = False
            if os.path.isfile(outfile):
                logging.info("Output file exists")
                if os.path.getmtime(outfile) > os.path.getmtime(path):
                    logging.info("Output file is up to date")
                    if not force:
                        read_existing = True
                        logging.warning("Skipping " + path +
                                        " since generated file exists. Use --force to regenerate.")
                else:
                    logging.info("Output is outdated, {} < {}".format(
                        datetime.datetime.fromtimestamp(os.path.getmtime(outfile)),
                        datetime.datetime.fromtimestamp(os.path.getmtime(path)),
                    ))

            if read_existing:
                with open(outfile, "rb") as f:
                    geojson = json.load(f)
                    properties = geojson["properties"]
            else:
                logging.info("Downloading " + source["url"])

                try:
                    fp = utils.download(source["url"])
                except IOError:
                    logging.error("Failed to download " + source["url"])
                    failures.append(path)
                    continue

                logging.info("Reading " + urlfile)

                if "filter" in source:
                    filterer = BasicFilterer(source["filter"],
                                             source.get("filterOperator", "and"))
                else:
                    filterer = None

                try:
                    geojson = getattr(adapters, source["filetype"]).read(
                        fp,
                        source["properties"],
                        filterer=filterer,
                        layer_name=source.get("layerName", None),
                        source_filename=source.get("filenameInZip", None),
                        merge_on=source.get("mergeOn", None),
                    )
                except IOError as e:
                    logging.error("Failed to read " + urlfile + " " + str(e))
                    failures.append(path)
                    continue
                except zipfile.BadZipfile as e:
                    logging.error("Unable to open zip file " + source["url"])
                    failures.append(path)
                    continue
                finally:
                    os.remove(fp.name)

                if len(geojson["features"]) == 0:
                    logging.error("Result contained no features for " + path)
                    continue

                # generate properties
                excluded_keys = [
                    "filetype",
                    "url",
                    "properties",
                    "filter",
                    "filenameInZip",
                ]
                properties = {k: v for k, v in list(source.items()) if k not in excluded_keys}
                properties["source_url"] = source["url"]
                properties["feature_count"] = len(geojson["features"])
                properties["demo"] = geoutils.get_demo_point(geojson)

                geojson["properties"] = properties
                if "bbox" not in geojson:
                    geojson["bbox"] = geoutils.get_bbox_from_geojson(geojson)

                utils.make_sure_path_exists(os.path.dirname(outfile))

                # cleanup existing generated files
                if os.path.exists(outdir):
                    rmtree(outdir)
                filename_to_match, ext = os.path.splitext(pathparts[-1])
                output_file_dir = os.sep.join(utils.get_path_parts(outfile)[:-1])
                logging.info("looking for generated files to delete in " + output_file_dir)
                for name in os.listdir(output_file_dir):
                    base, ext = os.path.splitext(name)
                    if base == filename_to_match:
                        to_remove = os.path.join(output_file_dir, name)
                        logging.info("Removing generated file " + to_remove)
                        os.remove(to_remove)

                utils.write_json(outfile, geojson)

                logging.info("Generating label points")
                label_geojson = geoutils.get_label_points(geojson)
                label_path = outfile.replace(".geojson", ".labels.geojson")
                utils.write_json(label_path, label_geojson)

                logging.info("Done. Processed to " + outfile)

            if "demo" not in properties:
                properties["demo"] = geoutils.get_demo_point(geojson)

            properties["path"] = "/".join(pathparts)

            catalog_entry = {
                "type": "Feature",
                "properties": properties,
                "geometry": geoutils.get_union(geojson),
                "bbox": geoutils.get_bbox_from_geojson(geojson),
            }
            catalog_features.append(catalog_entry)

            if (force_summary or not read_existing or not os.path.exists(outdir)
                    or not os.path.exists(os.path.join(outdir, "units.json"))
                    or not os.path.exists(os.path.join(outdir, "source.json"))):
                logging.info("Generated exploded GeoJSON to " + outdir)
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                units = []
                for feature in geojson["features"]:
                    if "bbox" not in feature:
                        feature["bbox"] = geoutils.get_bbox_from_geojson_geometry(
                            feature["geometry"])
                    feature_id = str(feature["properties"]["id"])
                    feature_id = feature_id.replace("/", "")
                    feature_filename = os.path.join(outdir, feature_id + ".geojson")
                    utils.write_json(feature_filename, feature)
                    units.append(feature["properties"])
                # source.json is just the catalog entry
                # units.json is the properties dicts from all of the units in an array
                # .json instead of .geojson, in case there is a unit named "source"
                utils.write_json(os.path.join(outdir, "source.json"), catalog_entry)
                utils.write_json(os.path.join(outdir, "units.json"), units)
            else:
                logging.debug("exploded GeoJSON already exists, not generating")

        except Exception as e:
            logging.error(str(e))
            logging.exception("Error processing file " + path)
            failures.append(path)
            success = False

    catalog = {"type": "FeatureCollection", "features": catalog_features}
    utils.write_json(os.path.join(output, "catalog.geojson"), catalog)

    if not success:
        logging.error("Failed sources: " + ", ".join(failures))
        sys.exit(-1)