Example #1
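Example #1 exercises fast_import.fast_import_tables directly. Like the rest of the listing it is excerpted from the Sno test suite, so shared imports are not shown; a minimal sketch of what the snippets assume (the exact module paths are assumptions, not confirmed by this listing):

import os
import re
import subprocess

import pygit2
from osgeo import gdal, ogr

from sno import fast_import, gpkg, structure
from sno.ogr_import_source import OgrImportSource

Fixtures such as data_archive, cli_runner, geopackage, chdir and the H helper object are supplied by the project's pytest conftest.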
def test_fast_import(repo_version, data_archive, tmp_path, cli_runner, chdir):
    table = H.POINTS.LAYER
    with data_archive("gpkg-points") as data:
        # create an empty repository to import into
        repo_path = tmp_path / "data.sno"
        repo_path.mkdir()

        with chdir(repo_path):
            r = cli_runner.invoke(["init", "--repo-version", repo_version])
            assert r.exit_code == 0, r

            repo = pygit2.Repository(str(repo_path))

            source = OgrImportSource.open(data / "nz-pa-points-topo-150k.gpkg",
                                          table=table)

            fast_import.fast_import_tables(repo, [source])

            assert not repo.is_empty
            assert repo.head.name == "refs/heads/master"
            assert repo.head.shorthand == "master"

            dataset = structure.RepositoryStructure(repo)[table]

            # has a single commit
            assert len([c for c in repo.walk(repo.head.target)]) == 1
            assert dataset.version == int(repo_version)
            assert list(dataset.meta_items())

            # has the right number of features
            feature_count = sum(1 for f in dataset.features())
            assert feature_count == source.feature_count
Example #2
def test_feature_find_decode_performance(
    profile,
    repo_version,
    archive,
    source_gpkg,
    table,
    data_archive,
    data_imported,
    geopackage,
    benchmark,
    request,
):
    """ Check single-feature decoding performance """
    param_ids = H.parameter_ids(request)
    benchmark.group = (
        f"test_feature_find_decode_performance - {profile} - {param_ids[-1]}")

    repo_path = data_imported(archive, source_gpkg, table, repo_version)
    repo = pygit2.Repository(str(repo_path))
    tree = repo.head.peel(pygit2.Tree) / "mytable"
    dataset = structure.RepositoryStructure(repo)["mytable"]

    assert dataset.__class__.__name__ == f"Dataset{repo_version}"
    assert dataset.version == int(repo_version)

    with data_archive(archive) as data:
        db = geopackage(f"{data / source_gpkg}")
        dbcur = db.cursor()
        num_rows = dbcur.execute(
            f"SELECT COUNT(*) FROM {table};").fetchone()[0]
        pk_field = gpkg.pk(db, table)
        pk = dbcur.execute(
            f"SELECT {pk_field} FROM {table} ORDER BY {pk_field} LIMIT 1 OFFSET {min(97,num_rows-1)};"
        ).fetchone()[0]

    if profile == "get_feature":
        benchmark(dataset.get_feature, pk)

    elif profile == "feature_to_dict":
        feature_path = dataset.encode_1pk_to_path(pk, relative=True)
        feature_data = memoryview(tree / feature_path)

        # TODO: try to avoid two sets of code for two dataset versions -
        # either by making their interfaces more similar, or by deleting v1
        if repo_version == "1":
            benchmark(dataset.repo_feature_to_dict, feature_path, feature_data)
        elif repo_version == "2":
            benchmark(dataset.get_feature,
                      path=feature_path,
                      data=feature_data)
    else:
        raise NotImplementedError(f"Unknown profile: {profile}")
Example #3
def _import_check(repo_path,
                  table,
                  source_gpkg,
                  geopackage,
                  repo_version=None):
    repo = pygit2.Repository(str(repo_path))
    dataset = structure.RepositoryStructure(repo)[table]

    if repo_version is not None:
        assert dataset.version == int(repo_version)

    db = geopackage(source_gpkg)
    num_rows = db.cursor().execute(
        f"SELECT COUNT(*) FROM {table};").fetchone()[0]

    o = subprocess.check_output(["git", "ls-tree", "-r", "-t", "HEAD", table])
    print("\n".join(l.decode("utf8") for l in o.splitlines()[:20]))

    if dataset.version == 1:
        re_paths = (
            r"^\d{6} blob [0-9a-f]{40}\t%s/.sno-table/[0-9a-f]{2}/[0-9a-f]{2}/([^/]+)$"
            % table)
    elif dataset.version == 2:
        re_paths = r"^\d{6} blob [0-9a-f]{40}\t%s/.sno-dataset/feature/.*$" % table
    else:
        raise NotImplementedError(dataset.version)

    git_paths = re.findall(re_paths, o.decode("utf-8"), re.MULTILINE)
    assert len(git_paths) == num_rows

    num_features = sum(1 for _ in dataset.features())
    assert num_features == num_rows

    return dataset
Example #4
def test_shp_import_meta(
    data_archive,
    tmp_path,
    cli_runner,
    request,
):
    with data_archive('gpkg-polygons') as data:
        # convert to SHP using OGR
        source_filename = tmp_path / "nz_waca_adjustments.shp"
        gdal.VectorTranslate(
            str(source_filename),
            gdal.OpenEx(str(data / 'nz-waca-adjustments.gpkg')),
            format='ESRI Shapefile',
            layers=['nz_waca_adjustments'],
        )

        # now import the SHP
        repo_path = tmp_path / "repo"
        r = cli_runner.invoke(
            ["init", "--import", source_filename,
             str(repo_path)])
        assert r.exit_code == 0, r

        # now check metadata
        path = "nz_waca_adjustments"
        repo = pygit2.Repository(str(repo_path))
        dataset = structure.RepositoryStructure(repo)[path]

        meta_items = dict(dataset.meta_items())
        assert set(meta_items) == {
            'description',
            'schema.json',
            'title',
            'crs/EPSG:4167.wkt',
        }
        schema = dataset.get_meta_item('schema.json')
        for col in schema:
            col.pop('id')
        assert schema == [
            {
                'name': 'FID',
                'dataType': 'integer',
                'primaryKeyIndex': 0,
                'size': 64
            },
            {
                'name': 'geom',
                'dataType': 'geometry',
                'primaryKeyIndex': None,
                'geometryType': 'POLYGON',
                'geometryCRS': 'EPSG:4167',
            },
            {
                'name': 'date_adjus',
                'dataType': 'date',
                'primaryKeyIndex': None
            },
            {
                'name': 'survey_ref',
                'dataType': 'text',
                'primaryKeyIndex': None
            },
            {
                'name': 'adjusted_n',
                'dataType': 'integer',
                'primaryKeyIndex': None,
                'size': 32,
            },
        ]
Example #5
def _test_pg_import(tmp_path,
                    cli_runner,
                    chdir,
                    *,
                    table_name,
                    pk_name="id",
                    pk_size=64,
                    import_args=()):
    repo_path = tmp_path / "repo"
    r = cli_runner.invoke(['init', repo_path, "--repo-version=2"])
    assert r.exit_code == 0, r
    with chdir(repo_path):
        r = cli_runner.invoke([
            'import',
            os.environ['SNO_POSTGRES_URL'],
            table_name,
            *import_args,
        ])
        assert r.exit_code == 0, r
    # now check metadata
    repo = pygit2.Repository(str(repo_path))
    dataset = structure.RepositoryStructure(repo)[table_name]

    meta_items = dict(dataset.meta_items())
    assert set(meta_items.keys()) == {
        'description',
        'schema.json',
        'title',
        'crs/EPSG:4167.wkt',
    }
    schema = dataset.get_meta_item('schema.json')
    for col in schema:
        col.pop('id')
    assert schema == [
        {
            'name': pk_name,
            'dataType': 'integer',
            'primaryKeyIndex': 0,
            'size': pk_size,
        },
        {
            'name': 'geom',
            'dataType': 'geometry',
            'primaryKeyIndex': None,
            'geometryType': 'MULTIPOLYGON',
            'geometryCRS': 'EPSG:4167',
        },
        {
            'name': 'date_adjusted',
            'dataType': 'timestamp',
            'primaryKeyIndex': None
        },
        {
            'name': 'survey_reference',
            'dataType': 'text',
            'primaryKeyIndex': None
        },
        {
            'name': 'adjusted_nodes',
            'dataType': 'integer',
            'primaryKeyIndex': None,
            'size': 32,
        },
    ]
Example #6
def test_import_from_non_gpkg(
    repo_version,
    archive,
    source_gpkg,
    table,
    data_archive,
    tmp_path,
    cli_runner,
    chdir,
    geopackage,
    request,
    source_format,
    source_ogr_driver,
):
    """
    Import something else into a Sno repository.
    """
    param_ids = H.parameter_ids(request)

    with data_archive(archive) as data:
        db = geopackage(f"{data / source_gpkg}")
        dbcur = db.cursor()
        if param_ids[-1] == "empty":
            with db:
                print(f"emptying table {table}...")
                dbcur.execute(f"DELETE FROM {table};")

        num_rows = dbcur.execute(
            f"SELECT COUNT(*) FROM {table};").fetchone()[0]

        if param_ids[-1] == "empty":
            assert num_rows == 0

        # First, import the original GPKG to one repo
        gpkg_repo_path = tmp_path / "gpkg"
        gpkg_repo_path.mkdir()
        with chdir(gpkg_repo_path):
            r = cli_runner.invoke(["init"])
            assert r.exit_code == 0, r
            r = cli_runner.invoke(["import", data / source_gpkg, table])
            assert r.exit_code == 0, r

        gpkg_repo = pygit2.Repository(str(gpkg_repo_path))
        gpkg_dataset = structure.RepositoryStructure(gpkg_repo)[table]

        # convert to a new format using OGR
        source_filename = tmp_path / f"data.{source_format.lower()}"
        gdal.VectorTranslate(
            str(source_filename),
            gdal.OpenEx(str(data / source_gpkg)),
            format=source_ogr_driver,
            layers=[table],
        )
        repo_path = tmp_path / "non-gpkg"
        repo_path.mkdir()
        with chdir(repo_path):
            r = cli_runner.invoke(["init", "--repo-version", repo_version])
            assert r.exit_code == 0, r

            repo = pygit2.Repository(str(repo_path))
            assert repo.is_bare
            assert repo.is_empty

            # Import from SHP/TAB/something into sno
            r = cli_runner.invoke([
                "import",
                str(source_filename),
                f"data:{table}",
            ])
            assert r.exit_code == 0, r

            assert not repo.is_empty
            assert repo.head.name == "refs/heads/master"
            assert repo.head.shorthand == "master"

            # has a single commit
            assert len([c for c in repo.walk(repo.head.target)]) == 1

            dataset = _import_check(repo_path, table, f"{data / source_gpkg}",
                                    geopackage, repo_version)

            assert dataset.__class__.__name__ == f"Dataset{repo_version}"
            assert int(float(dataset.version)) == int(repo_version)

            # Compare the meta items to the GPKG-imported ones
            repo = pygit2.Repository(str(repo_path))
            dataset = structure.RepositoryStructure(repo)[table]

            if dataset.version == 1:
                _compare_ogr_and_gpkg_meta_items(dataset, gpkg_dataset)
            elif dataset.version == 2:
                # TODO: Dataset2 needs to store more metadata.
                pass

            if num_rows > 0:
                # compare the first feature in the repo against the source DB
                key, got_feature = next(dataset.features())
                fid = dataset.decode_path_to_1pk(key)

                src_ds = ogr.Open(str(source_filename))
                src_layer = src_ds.GetLayer(0)
                assert src_layer.GetFeatureCount() == num_rows

                f = src_layer.GetFeature(fid)
                expected_feature = {
                    f.GetFieldDefnRef(i).GetName(): f.GetField(i)
                    for i in range(f.GetFieldCount())
                }
                if 'date_adjus' in expected_feature:
                    expected_feature['date_adjus'] = expected_feature[
                        'date_adjus'].replace('/', '-')
                expected_feature['FID'] = f.GetFID()
                if src_layer.GetGeomType() != ogr.wkbNone:
                    g = f.GetGeometryRef()
                    if g:
                        g.AssignSpatialReference(src_layer.GetSpatialRef())
                    expected_feature['geom'] = ogr_to_gpkg_geom(g)

                assert normalise_feature(got_feature) == expected_feature
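Example #6 compares the feature read back from the repository against the source row via two helpers that are not shown in this listing, ogr_to_gpkg_geom and normalise_feature. A minimal sketch of the latter, assuming its only job is to turn the dataset row into a plain dict with bytes-typed geometry so it compares cleanly against the OGR-derived dict (the real helper may do more):

def normalise_feature(row):
    # Hypothetical stand-in: coerce a dataset feature to a plain dict and
    # make the geometry value a bytes object for comparison purposes.
    row = dict(row)
    geom = row.get("geom")
    if isinstance(geom, memoryview):
        row["geom"] = bytes(geom)
    return row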
Example #7
def test_pg_import(
    postgis_layer,
    data_archive,
    tmp_path,
    cli_runner,
    request,
    chdir,
):
    with postgis_layer('gpkg-polygons', 'nz-waca-adjustments.gpkg',
                       'nz_waca_adjustments'):
        repo_path = tmp_path / "repo"
        r = cli_runner.invoke(['init', repo_path])
        assert r.exit_code == 0, r
        with chdir(repo_path):
            r = cli_runner.invoke([
                'import', os.environ['SNO_POSTGRES_URL'], 'nz_waca_adjustments'
            ])
            assert r.exit_code == 0, r
        # now check metadata
        path = "nz_waca_adjustments"
        repo = pygit2.Repository(str(repo_path))
        dataset = structure.RepositoryStructure(repo)[path]

        meta_items = dict(dataset.iter_meta_items(include_hidden=True))
        assert set(meta_items.keys()) == {
            'fields/geom',
            'version',
            'fields/id',
            'gpkg_geometry_columns',
            'gpkg_spatial_ref_sys',
            'fields/adjusted_nodes',
            'primary_key',
            'gpkg_contents',
            'fields/survey_reference',
            'fields/date_adjusted',
            'sqlite_table_info',
        }
        assert meta_items['sqlite_table_info'] == [
            {
                'cid': 0,
                'name': 'id',
                'type': 'INTEGER',
                'notnull': 1,
                'dflt_value': None,
                'pk': 1,
            },
            {
                'cid': 1,
                'name': 'geom',
                'type': 'MULTIPOLYGON',
                'notnull': 0,
                'dflt_value': None,
                'pk': 0,
            },
            {
                'cid': 2,
                'name': 'date_adjusted',
                'type': 'DATETIME',
                'notnull': 0,
                'dflt_value': None,
                'pk': 0,
            },
            {
                'cid': 3,
                'name': 'survey_reference',
                'type': 'TEXT(50)',
                'notnull': 0,
                'dflt_value': None,
                'pk': 0,
            },
            {
                'cid': 4,
                'name': 'adjusted_nodes',
                'type': 'MEDIUMINT',
                'notnull': 0,
                'dflt_value': None,
                'pk': 0,
            },
        ]
        contents = meta_items['gpkg_contents']
        assert contents == {
            'table_name': 'nz_waca_adjustments',
            'description': '',
            'data_type': 'features',
            'identifier': '',
            'srs_id': 4167,
        }
Example #8
def test_shp_import_meta(
    data_archive,
    tmp_path,
    cli_runner,
    request,
):
    with data_archive('gpkg-polygons') as data:
        # convert to SHP using OGR
        source_filename = tmp_path / "nz_waca_adjustments.shp"
        gdal.VectorTranslate(
            str(source_filename),
            gdal.OpenEx(str(data / 'nz-waca-adjustments.gpkg')),
            format='ESRI Shapefile',
            layers=['nz_waca_adjustments'],
        )

        # now import the SHP
        repo_path = tmp_path / "repo"
        r = cli_runner.invoke(
            ["init", "--import", source_filename,
             str(repo_path)])
        assert r.exit_code == 0, r

        # now check metadata
        path = "nz_waca_adjustments"
        repo = pygit2.Repository(str(repo_path))
        dataset = structure.RepositoryStructure(repo)[path]

        meta_items = dict(dataset.iter_meta_items(include_hidden=True))
        assert set(meta_items) == {
            'gpkg_contents',
            'gpkg_geometry_columns',
            'gpkg_spatial_ref_sys',
            'primary_key',
            'sqlite_table_info',
            'version',
            'fields/FID',
            'fields/adjusted_n',
            'fields/date_adjus',
            'fields/geom',
            'fields/survey_ref',
        }
        assert meta_items['sqlite_table_info'] == [
            {
                'cid': 0,
                'name': 'FID',
                'type': 'INTEGER',
                'notnull': 1,
                'dflt_value': None,
                'pk': 1,
            },
            {
                'cid': 1,
                'name': 'geom',
                'type': 'POLYGON',
                'notnull': 0,
                'dflt_value': None,
                'pk': 0,
            },
            {
                'cid': 2,
                'name': 'date_adjus',
                'type': 'DATE',
                'notnull': 0,
                'dflt_value': None,
                'pk': 0,
            },
            {
                'cid': 3,
                'name': 'survey_ref',
                'type': 'TEXT(50)',
                'notnull': 0,
                'dflt_value': None,
                'pk': 0,
            },
            {
                'cid': 4,
                'name': 'adjusted_n',
                'type': 'MEDIUMINT',
                'notnull': 0,
                'dflt_value': None,
                'pk': 0,
            },
        ]
Example #9
def _test_pg_import(tmp_path,
                    cli_runner,
                    chdir,
                    *,
                    table_name,
                    pk_name="id",
                    pk_size=64,
                    import_args=()):
    repo_path = tmp_path / "repo"
    r = cli_runner.invoke(["init", repo_path, "--repo-version=2"])
    assert r.exit_code == 0, r
    with chdir(repo_path):
        r = cli_runner.invoke([
            "import",
            os.environ["SNO_POSTGRES_URL"],
            table_name,
            *import_args,
        ])
        assert r.exit_code == 0, r
    # now check metadata
    repo = pygit2.Repository(str(repo_path))
    dataset = structure.RepositoryStructure(repo)[table_name]

    meta_items = dict(dataset.meta_items())
    assert set(meta_items.keys()) == {
        "description",
        "schema.json",
        "title",
        "crs/EPSG:4167.wkt",
    }
    schema = without_ids(dataset.get_meta_item("schema.json"))
    assert schema == [
        {
            "name": pk_name,
            "dataType": "integer",
            "primaryKeyIndex": 0,
            "size": pk_size,
        },
        {
            "name": "geom",
            "dataType": "geometry",
            "primaryKeyIndex": None,
            "geometryType": "MULTIPOLYGON",
            "geometryCRS": "EPSG:4167",
        },
        {
            "name": "date_adjusted",
            "dataType": "timestamp",
            "primaryKeyIndex": None
        },
        {
            "name": "survey_reference",
            "dataType": "text",
            "primaryKeyIndex": None
        },
        {
            "name": "adjusted_nodes",
            "dataType": "integer",
            "primaryKeyIndex": None,
            "size": 32,
        },
    ]
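Examples #9 and #10 compare schemas through a without_ids helper rather than popping the generated 'id' key by hand as Examples #4 and #5 do. A minimal sketch under that assumption:

def without_ids(schema):
    # Hypothetical stand-in mirroring the manual col.pop('id') loop in
    # Examples #4 and #5: drop the auto-generated 'id' from each column
    # so the schema can be compared structurally.
    return [{k: v for k, v in col.items() if k != "id"} for col in schema]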
Example #10
def test_shp_import_meta(
    data_archive,
    tmp_path,
    cli_runner,
    request,
):
    with data_archive("gpkg-polygons") as data:
        # convert to SHP using OGR
        source_filename = tmp_path / "nz_waca_adjustments.shp"
        gdal.VectorTranslate(
            str(source_filename),
            gdal.OpenEx(str(data / "nz-waca-adjustments.gpkg")),
            format="ESRI Shapefile",
            layers=["nz_waca_adjustments"],
        )

        # now import the SHP
        repo_path = tmp_path / "repo"
        r = cli_runner.invoke(
            ["init", "--import", source_filename,
             str(repo_path)])
        assert r.exit_code == 0, r

        # now check metadata
        path = "nz_waca_adjustments"
        repo = pygit2.Repository(str(repo_path))
        dataset = structure.RepositoryStructure(repo)[path]

        meta_items = dict(dataset.meta_items())
        assert set(meta_items) == {
            "description",
            "schema.json",
            "title",
            "crs/EPSG:4167.wkt",
        }
        schema = without_ids(dataset.get_meta_item("schema.json"))
        assert schema == [
            {
                "name": "FID",
                "dataType": "integer",
                "primaryKeyIndex": 0,
                "size": 64
            },
            {
                "name": "geom",
                "dataType": "geometry",
                "primaryKeyIndex": None,
                "geometryType": "POLYGON",
                "geometryCRS": "EPSG:4167",
            },
            {
                "name": "date_adjus",
                "dataType": "date",
                "primaryKeyIndex": None
            },
            {
                "name": "survey_ref",
                "dataType": "text",
                "primaryKeyIndex": None
            },
            {
                "name": "adjusted_n",
                "dataType": "integer",
                "primaryKeyIndex": None,
                "size": 32,
            },
        ]