def validate_path(schema, jsonfiles):
    schema = json.loads(schema.read())

    for path in utils.get_files(jsonfiles):
        path_components = utils.get_path_parts(path)

        regex = schema[path_components[0]]
        if not re.compile(regex).match(path):
            raise AssertionError('Path "%s" does not match spec "%s"' % (path, regex))
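
For illustration, a minimal, hypothetical schema that validate_path could consume: each key is a path's first component, and the value is a regular expression that the full path must match (the directory name and pattern below are assumptions, not taken from the source).

# Hypothetical contents of the schema file read above:
schema = {
    "sources": r"^sources/[a-z-]+/[a-z-]+\.json$"
}
# With this schema, validate_path raises AssertionError for any file under
# "sources/" whose full path does not match the pattern.
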
Example #2
def find_directory_in_tree(root, dirname):
    path_parts = utils.get_path_parts(dirname)
    cur_path = os.sep
    cur_node = root
    for part in path_parts:
        cur_path = os.path.join(cur_path, part)
        if part not in cur_node.children:
            # rootdir, uid_map, and gid_map are free variables, presumably
            # supplied by the enclosing scope in the original module.
            new_node = get_directory_node(rootdir, cur_path, uid_map, gid_map)
            cur_node.children[part] = new_node
        cur_node = cur_node.children[part]
    return cur_node
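
The walk lazily materializes missing directory nodes, trie-style, so repeated lookups share common prefixes. A minimal node type consistent with this usage might look like the sketch below (an assumption; the real get_directory_node presumably also fills in ownership from uid_map and gid_map).

class DirectoryNode:
    """Hypothetical tree node; children maps a path component to a child node."""
    def __init__(self, path):
        self.path = path
        self.children = {}
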
Example #3
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    for path in utils.get_files(sources):
        pathparts = utils.get_path_parts(path)
        pathparts[0] = output.strip(os.sep)
        pathparts[-1] = pathparts[-1].replace('.json', '.geojson')

        outdir = os.sep.join(pathparts[:-1])
        outfile = os.sep.join(pathparts)

        source = utils.read_json(path)
        urlfile = urlparse(source['url']).path.split('/')[-1]

        if not hasattr(adapters, source['filetype']):
            utils.error('Unknown filetype', source['filetype'], '\n')
            continue

        if os.path.isfile(outfile) and not force:
            utils.error('Skipping', path, 'since generated file exists.',
                        'Use --force to regenerate.', '\n')
            continue

        utils.info('Downloading', source['url'])

        try:
            fp = utils.download(source['url'])
        except IOError:
            utils.error('Failed to download', source['url'], '\n')
            continue

        utils.info('Reading', urlfile)

        try:
            geojson = getattr(adapters, source['filetype']).read(fp, source['properties'])
        except IOError:
            utils.error('Failed to read', urlfile)
            continue
        finally:
            os.remove(fp.name)

        utils.make_sure_path_exists(outdir)
        utils.write_json(outfile, geojson)

        utils.success('Done. Processed to', outfile, '\n')
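
The loop above relies on only a few keys in each source file. A hypothetical source, written here as a Python dict for brevity (all values are illustrative assumptions):

source = {
    "url": "https://example.com/data/parcels.zip",  # file to download
    "filetype": "shapefile",  # must name an adapter in the adapters module
    "properties": {           # passed through to the adapter's read()
        "id": "PARCEL_ID",
        "name": "OWNER_NAME",
    },
}
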
Example #4
    def ilist(self):
        """Return an iterator of hashes of all files in CAS.

        Return:
           The iterator of hashes. The iteration order is unspecified.
        """
        for (dirpath, dirnames, filenames) in os.walk(self._root):
            # All files in uncorrupted CAS were previously stored
            # by a .store() call, so we construct the hashes by
            # joining shards encoded in dirpath and the file names.
            path_parts = utils.get_path_parts(dirpath)
            shard_parts = path_parts[-self._sharding:]

            for hash_without_shards in filenames:
                full_hash = ''.join(shard_parts) + hash_without_shards
                yield full_hash
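
To make the shard arithmetic concrete, suppose self._sharding == 2 and a file was stored at <root>/ab/cd/ef0123 (a hypothetical layout):

path_parts = ['var', 'cas', 'ab', 'cd']  # from a dirpath of <root>/ab/cd
shard_parts = path_parts[-2:]            # sharding of 2 -> ['ab', 'cd']
assert ''.join(shard_parts) + 'ef0123' == 'abcdef0123'
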
Example #5
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    logging.getLogger('shapely.geos').setLevel(logging.WARNING)

    catalog_features = []
    failures = []
    path_parts_to_skip = len(utils.get_path_parts(output))
    success = True
    for path in utils.get_files(sources):
        try:
            utils.info("Processing " + path)
            pathparts = utils.get_path_parts(path)
            pathparts[0] = output.strip(os.sep)
            pathparts[-1] = pathparts[-1].replace('.json', '.geojson')
    
            outdir = os.sep.join(pathparts[:-1])
            outfile = os.sep.join(pathparts)
    
            source = utils.read_json(path)
            urlfile = urlparse(source['url']).path.split('/')[-1]
    
            if not hasattr(adapters, source['filetype']):
                utils.error('Unknown filetype', source['filetype'], '\n')
                failures.append(path)
                continue
    
            if os.path.isfile(outfile) and \
                os.path.getmtime(outfile) > os.path.getmtime(path) and not force:
                utils.error('Skipping', path, 'since generated file exists.',
                            'Use --force to regenerate.', '\n')
                with open(outfile, "rb") as f:
                    geojson = json.load(f)
                properties = geojson['properties']
            else:
                utils.info('Downloading', source['url'])
    
                try:
                    fp = utils.download(source['url'])
                except IOError:
                    utils.error('Failed to download', source['url'], '\n')
                    failures.append(path)
                    continue
    
                utils.info('Reading', urlfile)
    
                if 'filter' in source:
                    filterer = BasicFilterer(source['filter'], source.get('filterOperator', 'and'))
                else:
                    filterer = None
    
                try:
                    geojson = getattr(adapters, source['filetype'])\
                        .read(fp, source['properties'],
                            filterer=filterer,
                            layer_name=source.get("layerName", None),
                            source_filename=source.get("filenameInZip", None))
                except IOError as e:
                    utils.error('Failed to read', urlfile, str(e))
                    failures.append(path)
                    continue
                except zipfile.BadZipfile as e:
                    utils.error('Unable to open zip file', source['url'])
                    failures.append(path)
                    continue
                finally:
                    os.remove(fp.name)
Example #6
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s [%(levelname)s] - %(message)s',
                        datefmt="%H:%M:%S")

    logging.getLogger('shapely.geos').setLevel(logging.WARNING)
    logging.getLogger('Fiona').setLevel(logging.WARNING)
    logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(
        logging.WARNING)
    requests.packages.urllib3.disable_warnings()
    # logging.getLogger('processing').setLevel(logging.DEBUG)

    catalog_features = []
    failures = []
    path_parts_to_skip = utils.get_path_parts(sources).index("sources") + 1
    success = True
    for path in utils.get_files(sources):
        try:
            logging.info("Processing " + path)
            pathparts = utils.get_path_parts(path)[path_parts_to_skip:]
            pathparts[-1] = pathparts[-1].replace('.json', '.geojson')

            outdir = os.path.join(output, *pathparts[:-1],
                                  pathparts[-1].replace('.geojson', ''))
            outfile = os.path.join(output, *pathparts)

            source = utils.read_json(path)
            urlfile = urlparse(source['url']).path.split('/')[-1]

            if not hasattr(adapters, source['filetype']):
                logging.error('Unknown filetype ' + source['filetype'])
                failures.append(path)
                continue

            read_existing = False
            if os.path.isfile(outfile):
                logging.info("Output file exists")
                if os.path.getmtime(outfile) > os.path.getmtime(path):
                    logging.info("Output file is up to date")
                    if not force:
                        read_existing = True
                        logging.warning(
                            'Skipping ' + path +
                            ' since generated file exists. Use --force to regenerate.'
                        )
                else:
                    logging.info("Output is outdated, {} < {}".format(
                        datetime.datetime.fromtimestamp(
                            os.path.getmtime(outfile)),
                        datetime.datetime.fromtimestamp(
                            os.path.getmtime(path))))

            if read_existing:
                with open(outfile, "rb") as f:
                    geojson = json.load(f)
                properties = geojson['properties']
            else:
                logging.info('Downloading ' + source['url'])

                try:
                    fp = utils.download(source['url'])
                except IOError:
                    logging.error('Failed to download ' + source['url'])
                    failures.append(path)
                    continue

                logging.info('Reading ' + urlfile)

                if 'filter' in source:
                    filterer = BasicFilterer(
                        source['filter'], source.get('filterOperator', 'and'))
                else:
                    filterer = None

                try:
                    geojson = getattr(adapters, source['filetype'])\
                        .read(fp, source['properties'],
                            filterer=filterer,
                            layer_name=source.get("layerName", None),
                            source_filename=source.get("filenameInZip", None))
                except IOError as e:
                    logging.error('Failed to read ' + urlfile + " " + str(e))
                    failures.append(path)
                    continue
                except zipfile.BadZipfile as e:
                    logging.error('Unable to open zip file ' + source['url'])
                    failures.append(path)
                    continue
                finally:
                    os.remove(fp.name)
                if len(geojson['features']) == 0:
                    logging.error("Result contained no features for " + path)
                    continue
                excluded_keys = [
                    'filetype', 'url', 'properties', 'filter', 'filenameInZip'
                ]
                properties = {
                    k: v
                    for k, v in list(source.items()) if k not in excluded_keys
                }
                properties['source_url'] = source['url']
                properties['feature_count'] = len(geojson['features'])
                logging.info("Generating demo point")
                properties['demo'] = geoutils.get_demo_point(geojson)

                geojson['properties'] = properties

                utils.make_sure_path_exists(os.path.dirname(outfile))

                # clean up existing generated files
                if os.path.exists(outdir):
                    rmtree(outdir)
                filename_to_match, ext = os.path.splitext(pathparts[-1])
                output_file_dir = os.sep.join(
                    utils.get_path_parts(outfile)[:-1])
                logging.info("looking for generated files to delete in " +
                             output_file_dir)
                for name in os.listdir(output_file_dir):
                    base, ext = os.path.splitext(name)
                    if base == filename_to_match:
                        to_remove = os.path.join(output_file_dir, name)
                        logging.info("Removing generated file " + to_remove)
                        os.remove(to_remove)

                utils.write_json(outfile, geojson)

                logging.info("Generating label points")
                label_geojson = geoutils.get_label_points(geojson)
                label_path = outfile.replace('.geojson', '.labels.geojson')
                utils.write_json(label_path, label_geojson)

                logging.info('Done. Processed to ' + outfile)

            if not "demo" in properties:
                properties['demo'] = geoutils.get_demo_point(geojson)

            properties['path'] = "/".join(pathparts)
            catalog_entry = {
                'type': 'Feature',
                'properties': properties,
                'geometry': geoutils.get_union(geojson)
            }
            catalog_features.append(catalog_entry)

            if not os.path.exists(outdir) or not os.path.exists(
                    os.path.join(outdir, "units.json")):
                logging.info("Generated exploded GeoJSON to " + outdir)
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                # .json instead of .geojson, in case there is a unit named "source"
                utils.write_json(os.path.join(outdir, "source.json"),
                                 catalog_entry)
                units = []
                for feature in geojson['features']:
                    feature_id = str(feature['properties']['id'])
                    feature_id = feature_id.replace('/', '')
                    feature_filename = os.path.join(outdir,
                                                    feature_id + ".geojson")
                    utils.write_json(feature_filename, feature)
                    units.append(feature['properties'])
                utils.write_json(os.path.join(outdir, "units.json"), units)
            else:
                logging.debug(
                    "exploded GeoJSON already exists, not generating")

        except Exception as e:
            logging.error(str(e))
            logging.exception("Error processing file " + path)
            failures.append(path)
            success = False

    catalog = {'type': 'FeatureCollection', 'features': catalog_features}
    utils.write_json(os.path.join(output, 'catalog.geojson'), catalog)

    if not success:
        logging.error("Failed sources: " + ", ".join(failures))
        sys.exit(-1)
def test_get_path_parts_4():
    path = '/abc'
    res = utils.get_path_parts(path)
    assert list(res) == ['abc']
def test_get_path_parts_3():
    path = '/'
    res = utils.get_path_parts(path)
    assert list(res) == []
def test_get_path_parts_2():
    path = '/foo/bar/baz/'
    with pytest.raises(ValueError):
        res = utils.get_path_parts(path)
def test_get_path_parts_1():
    path = '/foo/bar/baz'
    res = utils.get_path_parts(path)
    assert list(res) == ['foo', 'bar', 'baz']
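
Taken together, these tests pin down the contract of utils.get_path_parts: leading separators are dropped, the root path yields an empty list, and a trailing separator raises ValueError. A minimal sketch consistent with all four tests (an assumption; the real implementation may differ):

import os

def get_path_parts(path):
    if path != os.sep and path.endswith(os.sep):
        raise ValueError('path must not end with a separator: %r' % path)
    return [part for part in path.split(os.sep) if part]
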
Example #11
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    catalog_features = []
    failures = []
    path_parts_to_skip = len(utils.get_path_parts(output))
    success = True
    for path in utils.get_files(sources):
        try:
            utils.info("Processing " + path)
            pathparts = utils.get_path_parts(path)
            pathparts[0] = output.strip(os.sep)
            pathparts[-1] = pathparts[-1].replace('.json', '.geojson')
    
            outdir = os.sep.join(pathparts[:-1])
            outfile = os.sep.join(pathparts)
    
            source = utils.read_json(path)
            urlfile = urlparse(source['url']).path.split('/')[-1]
    
            if not hasattr(adapters, source['filetype']):
                utils.error('Unknown filetype', source['filetype'], '\n')
                failures.append(path)
                continue
    
            if os.path.isfile(outfile) and \
                os.path.getmtime(outfile) > os.path.getmtime(path) and not force:
                utils.error('Skipping', path, 'since generated file exists.',
                            'Use --force to regenerate.', '\n')
                with open(outfile, "rb") as f:
                    geojson = json.load(f)
                properties = geojson['properties']
            else:
                utils.info('Downloading', source['url'])
    
                try:
                    fp = utils.download(source['url'])
                except IOError:
                    utils.error('Failed to download', source['url'], '\n')
                    failures.append(path)
                    continue
    
                utils.info('Reading', urlfile)
    
                if 'filter' in source:
                    filterer = BasicFilterer(source['filter'], source.get('filterOperator', 'and'))
                else:
                    filterer = None
    
                try:
                    geojson = getattr(adapters, source['filetype'])\
                        .read(fp, source['properties'],
                            filterer=filterer,
                            layer_name=source.get("layerName", None),
                            source_filename=source.get("filenameInZip", None))
                except IOError as e:
                    utils.error('Failed to read', urlfile, str(e))
                    failures.append(path)
                    continue
                except zipfile.BadZipfile as e:
                    utils.error('Unable to open zip file', source['url'])
                    failures.append(path)
                    continue
                finally:
                    os.remove(fp.name)
Example #12
def process(sources, output, force, force_summary):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] - %(message)s",
        datefmt="%H:%M:%S",
    )

    logging.getLogger("shapely.geos").setLevel(logging.WARNING)
    logging.getLogger("Fiona").setLevel(logging.WARNING)
    logging.getLogger("requests.packages.urllib3.connectionpool").setLevel(
        logging.WARNING)
    requests.packages.urllib3.disable_warnings()
    # logging.getLogger('processing').setLevel(logging.DEBUG)

    catalog_features = []
    failures = []
    path_parts_to_skip = utils.get_path_parts(sources).index("sources") + 1
    success = True
    for path in utils.get_files(sources):
        try:
            logging.info("Processing " + path)
            pathparts = utils.get_path_parts(path)[path_parts_to_skip:]
            pathparts[-1] = pathparts[-1].replace(".json", ".geojson")

            outdir = os.path.join(output, *pathparts[:-1],
                                  pathparts[-1].replace(".geojson", ""))
            outfile = os.path.join(output, *pathparts)

            source = utils.read_json(path)
            urlfile = urlparse(source["url"]).path.split("/")[-1]

            if not hasattr(adapters, source["filetype"]):
                logging.error("Unknown filetype " + source["filetype"])
                failures.append(path)
                continue

            read_existing = False
            if os.path.isfile(outfile):
                logging.info("Output file exists")
                if os.path.getmtime(outfile) > os.path.getmtime(path):
                    logging.info("Output file is up to date")
                    if not force:
                        read_existing = True
                        logging.warning(
                            "Skipping " + path +
                            " since generated file exists. Use --force to regenerate."
                        )
                else:
                    logging.info("Output is outdated, {} < {}".format(
                        datetime.datetime.fromtimestamp(
                            os.path.getmtime(outfile)),
                        datetime.datetime.fromtimestamp(
                            os.path.getmtime(path)),
                    ))

            if read_existing:
                with open(outfile, "rb") as f:
                    geojson = json.load(f)
                properties = geojson["properties"]
            else:
                logging.info("Downloading " + source["url"])

                try:
                    fp = utils.download(source["url"])
                except IOError:
                    logging.error("Failed to download " + source["url"])
                    failures.append(path)
                    continue

                logging.info("Reading " + urlfile)

                if "filter" in source:
                    filterer = BasicFilterer(
                        source["filter"], source.get("filterOperator", "and"))
                else:
                    filterer = None

                try:
                    geojson = getattr(adapters, source["filetype"]).read(
                        fp,
                        source["properties"],
                        filterer=filterer,
                        layer_name=source.get("layerName", None),
                        source_filename=source.get("filenameInZip", None),
                        merge_on=source.get("mergeOn", None),
                    )
                except IOError as e:
                    logging.error("Failed to read " + urlfile + " " + str(e))
                    failures.append(path)
                    continue
                except zipfile.BadZipfile as e:
                    logging.error("Unable to open zip file " + source["url"])
                    failures.append(path)
                    continue
                finally:
                    os.remove(fp.name)

                if (len(geojson["features"])) == 0:
                    logging.error("Result contained no features for " + path)
                    continue

                # generate properties
                excluded_keys = [
                    "filetype",
                    "url",
                    "properties",
                    "filter",
                    "filenameInZip",
                ]
                properties = {
                    k: v
                    for k, v in list(source.items()) if k not in excluded_keys
                }
                properties["source_url"] = source["url"]
                properties["feature_count"] = len(geojson["features"])
                properties["demo"] = geoutils.get_demo_point(geojson)
                geojson["properties"] = properties
                if "bbox" not in geojson:
                    geojson["bbox"] = geoutils.get_bbox_from_geojson(geojson)

                utils.make_sure_path_exists(os.path.dirname(outfile))

                # clean up existing generated files
                if os.path.exists(outdir):
                    rmtree(outdir)
                filename_to_match, ext = os.path.splitext(pathparts[-1])
                output_file_dir = os.sep.join(
                    utils.get_path_parts(outfile)[:-1])
                logging.info("looking for generated files to delete in " +
                             output_file_dir)
                for name in os.listdir(output_file_dir):
                    base, ext = os.path.splitext(name)
                    if base == filename_to_match:
                        to_remove = os.path.join(output_file_dir, name)
                        logging.info("Removing generated file " + to_remove)
                        os.remove(to_remove)

                utils.write_json(outfile, geojson)

                logging.info("Generating label points")
                label_geojson = geoutils.get_label_points(geojson)
                label_path = outfile.replace(".geojson", ".labels.geojson")
                utils.write_json(label_path, label_geojson)

                logging.info("Done. Processed to " + outfile)

            if not "demo" in properties:
                properties["demo"] = geoutils.get_demo_point(geojson)

            properties["path"] = "/".join(pathparts)
            catalog_entry = {
                "type": "Feature",
                "properties": properties,
                "geometry": geoutils.get_union(geojson),
                "bbox": geoutils.get_bbox_from_geojson(geojson),
            }
            catalog_features.append(catalog_entry)

            if (force_summary or not read_existing
                    or not os.path.exists(outdir)
                    or not os.path.exists(os.path.join(outdir, "units.json"))
                    or
                    not os.path.exists(os.path.join(outdir, "source.json"))):
                logging.info("Generated exploded GeoJSON to " + outdir)
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                units = []
                for feature in geojson["features"]:
                    if not "bbox" in feature:
                        feature[
                            "bbox"] = geoutils.get_bbox_from_geojson_geometry(
                                feature["geometry"])
                    feature_id = str(feature["properties"]["id"])
                    feature_id = feature_id.replace("/", "")
                    feature_filename = os.path.join(outdir,
                                                    feature_id + ".geojson")
                    utils.write_json(feature_filename, feature)
                    units.append(feature["properties"])
                # source.json is just the catalog entry
                # units.json is the properties dicts from all of the units in an array
                # .json instead of .geojson, in case there is a unit named "source"
                utils.write_json(os.path.join(outdir, "source.json"),
                                 catalog_entry)
                utils.write_json(os.path.join(outdir, "units.json"), units)
            else:
                logging.debug(
                    "exploded GeoJSON already exists, not generating")

        except Exception as e:
            logging.error(str(e))
            logging.exception("Error processing file " + path)
            failures.append(path)
            success = False

    catalog = {"type": "FeatureCollection", "features": catalog_features}
    utils.write_json(os.path.join(output, "catalog.geojson"), catalog)

    if not success:
        logging.error("Failed sources: " + ", ".join(failures))
        sys.exit(-1)
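
Based on the writes in the function above, a single processed source leaves an output tree roughly like the following (names are illustrative, not from the source):

# output/
#   catalog.geojson                       <- FeatureCollection of catalog entries
#   <region>/<name>.geojson               <- the processed GeoJSON
#   <region>/<name>.labels.geojson        <- label points
#   <region>/<name>/source.json           <- catalog entry for this source
#   <region>/<name>/units.json            <- properties of every feature
#   <region>/<name>/<feature-id>.geojson  <- one file per feature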