Example #1
def test_raw_feature_roundtrip():
    legend = Legend(["a"], ["b", "c", "d", "e", "f"])
    schema = abcdef_schema()
    empty_dataset = TableV3.new_dataset_for_writing(DATASET_PATH, schema,
                                                    MemoryRepo())
    legend_path, legend_data = empty_dataset.encode_legend(legend)

    raw_feature_dict = {
        "e": "eggs",
        "a": 123,
        "f": None,
        "d": 5.0,
        "c": True,
        "b": b"bytes",
    }
    feature_path, feature_data = empty_dataset.encode_raw_feature_dict(
        raw_feature_dict, legend, schema=schema)
    tree = MemoryTree({legend_path: legend_data, feature_path: feature_data})

    tableV3 = TableV3(tree / DATASET_PATH, DATASET_PATH, MemoryRepo())
    roundtripped = tableV3.get_raw_feature_dict(path=feature_path)
    assert roundtripped is not raw_feature_dict
    assert roundtripped == raw_feature_dict

    empty_feature_dict = {
        "a": 123,
        "b": None,
        "c": None,
        "d": None,
        "e": None,
        "f": None,
    }
    _, empty_feature_data = empty_dataset.encode_raw_feature_dict(
        empty_feature_dict,
        legend,
        schema=schema,
    )
    tree = MemoryTree({
        legend_path: legend_data,
        feature_path: empty_feature_data
    })

    tableV3 = TableV3(tree / DATASET_PATH, DATASET_PATH, MemoryRepo())
    roundtripped = tableV3.get_raw_feature_dict(path=feature_path)
    # Overwriting the old feature with an empty feature at the same path only
    # clears the non-pk values, since the pk values are part of the path.
    assert roundtripped == {
        "a": 123,
        "b": None,
        "c": None,
        "d": None,
        "e": None,
        "f": None,
    }
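
The test above treats Legend as an opaque codec, but the behaviour it asserts has a simple shape: a legend is the ordered lists of pk and non-pk column ids, only the non-pk values go into the feature blob, and the pk values travel in the feature's path (which is why the overwrite at the end can only clear the non-pk values). A minimal sketch of that idea - a hypothetical MiniLegend, not Kart's actual Legend, assuming the msgpack package:

import msgpack


class MiniLegend:
    # Illustrative stand-in for Legend: ordered pk and non-pk column ids.
    def __init__(self, pk_columns, non_pk_columns):
        self.pk_columns = list(pk_columns)
        self.non_pk_columns = list(non_pk_columns)

    def encode(self, raw_feature_dict):
        # Only non-pk values are serialized; pk values live in the path.
        return msgpack.packb(
            [raw_feature_dict[c] for c in self.non_pk_columns])

    def decode(self, pk_values, data):
        feature = dict(zip(self.pk_columns, pk_values))
        feature.update(zip(self.non_pk_columns, msgpack.unpackb(data)))
        return feature


legend = MiniLegend(["a"], ["b", "c", "d", "e", "f"])
blob = legend.encode(
    {"a": 123, "b": b"bytes", "c": True, "d": 5.0, "e": "eggs", "f": None})
assert legend.decode([123], blob) == {
    "a": 123, "b": b"bytes", "c": True, "d": 5.0, "e": "eggs", "f": None}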
Example #2
def test_pk_encoder_int_pk():
    schema = Schema.from_column_dicts([{
        "name": "mypk",
        "dataType": "integer",
        "size": 64,
        "id": "abc123",
        "primaryKeyIndex": 0,
    }])
    ds = TableV3.new_dataset_for_writing("mytable", schema, MemoryRepo())
    e = ds.feature_path_encoder
    assert isinstance(e, IntPathEncoder)
    assert e.encoding == "base64"
    assert e.branches == 64
    assert e.levels == 4

    with pytest.raises(TypeError):
        ds.encode_1pk_to_path("Dave")
    with pytest.raises(TypeError):
        ds.encode_1pk_to_path(0.1)

    assert ds.encode_1pk_to_path(
        0) == "mytable/.table-dataset/feature/A/A/A/A/kQA="
    assert ds.encode_1pk_to_path(
        1) == "mytable/.table-dataset/feature/A/A/A/A/kQE="
    assert ds.encode_1pk_to_path(
        -1) == "mytable/.table-dataset/feature/_/_/_/_/kf8="
    assert (ds.encode_1pk_to_path(1181) ==
            "mytable/.table-dataset/feature/A/A/A/S/kc0EnQ==")
    # The path tree wraps around for very large PKs, but encoding doesn't break.
    assert (ds.encode_1pk_to_path(
        64**5) == "mytable/.table-dataset/feature/A/A/A/A/kc5AAAAA")
    assert (ds.encode_1pk_to_path(-(64**5)) ==
            "mytable/.table-dataset/feature/A/A/A/A/kdLAAAAA")
Example #3
def test_legend_roundtrip():
    orig = Legend(["a", "b", "c"], ["d", "e", "f"])

    roundtripped = Legend.loads(orig.dumps())

    assert roundtripped is not orig
    assert roundtripped == orig

    empty_dataset = TableV3.new_dataset_for_writing(DATASET_PATH, None,
                                                    MemoryRepo())
    path, data = empty_dataset.encode_legend(orig)
    tree = MemoryTree({path: data})

    tableV3 = TableV3(tree / DATASET_PATH, DATASET_PATH, MemoryRepo())
    roundtripped = tableV3.get_legend(orig.hexhash())

    assert roundtripped is not orig
    assert roundtripped == orig
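
Two properties carry this test: dumps/loads must be a faithful serialization, and hexhash must be a content address, so equal legends always encode to the same path. A generic sketch of that pattern, with hypothetical serialization details rather than Kart's exact format:

import hashlib

import msgpack


def dumps(pk_columns, non_pk_columns):
    return msgpack.packb([pk_columns, non_pk_columns])


def loads(data):
    pk_columns, non_pk_columns = msgpack.unpackb(data)
    return pk_columns, non_pk_columns


def hexhash(data):
    # Content-addressing: identical legends land at identical paths.
    return hashlib.sha256(data).hexdigest()


data = dumps(["a", "b", "c"], ["d", "e", "f"])
assert loads(data) == (["a", "b", "c"], ["d", "e", "f"])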
Example #4
def test_feature_roundtrip(gen_uuid):
    schema = Schema([
        ColumnSchema(gen_uuid(), "geom", "geometry", None, **GEOM_TYPE_INFO),
        ColumnSchema(gen_uuid(), "id", "integer", 1, size=64),
        ColumnSchema(gen_uuid(), "artist", "text", 0, length=200),
        ColumnSchema(gen_uuid(), "recording", "blob", None),
    ])
    empty_dataset = TableV3.new_dataset_for_writing(DATASET_PATH, schema,
                                                    MemoryRepo())
    schema_path, schema_data = empty_dataset.encode_schema(schema)
    legend_path, legend_data = empty_dataset.encode_legend(schema.legend)

    # encode_feature also accepts a feature tuple, but mostly we use dicts.
    feature_tuple = ("010100000087BF756489EF5C4C", 7, "GIS Choir", b"MP3")
    # When encoding dicts, values are matched to columns by key - so insertion order doesn't matter.
    feature_dict = {
        "artist": "GIS Choir",
        "recording": b"MP3",
        "id": 7,
        "geom": "010100000087BF756489EF5C4C",
    }

    feature_path, feature_data = empty_dataset.encode_feature(
        feature_tuple, schema)
    feature_path2, feature_data2 = empty_dataset.encode_feature(
        feature_dict, schema)
    # Either encode method should give the same result.
    assert (feature_path, feature_data) == (feature_path2, feature_data2)

    tree = MemoryTree({
        schema_path: schema_data,
        legend_path: legend_data,
        feature_path: feature_data
    })

    tableV3 = TableV3(tree / DATASET_PATH, DATASET_PATH, MemoryRepo())
    roundtripped_feature = tableV3.get_feature(path=feature_path)
    assert roundtripped_feature is not feature_dict
    assert roundtripped_feature == feature_dict
    # We guarantee that the dict iterates in row-order.
    assert tuple(roundtripped_feature.values()) == feature_tuple
Example #5
def test_schema_roundtrip(gen_uuid):
    orig = Schema([
        ColumnSchema(gen_uuid(), "geom", "geometry", None, **GEOM_TYPE_INFO),
        ColumnSchema(gen_uuid(), "id", "integer", 1, size=64),
        ColumnSchema(gen_uuid(), "artist", "text", 0, length=200),
        ColumnSchema(gen_uuid(), "recording", "blob", None),
    ])

    roundtripped = Schema.loads(orig.dumps())

    assert roundtripped is not orig
    assert roundtripped == orig

    empty_dataset = TableV3.new_dataset_for_writing(DATASET_PATH, None,
                                                    MemoryRepo())
    path, data = empty_dataset.encode_schema(orig)
    tree = MemoryTree({path: data})

    tableV3 = TableV3(tree / DATASET_PATH, DATASET_PATH, MemoryRepo())
    roundtripped = tableV3.schema

    assert roundtripped is not orig
    assert roundtripped == orig
Example #6
def test_write_feature_performance(
    archive,
    source_gpkg,
    table,
    data_archive,
    tmp_path,
    cli_runner,
    chdir,
    benchmark,
    request,
):
    """ Per-feature import performance. """
    param_ids = H.parameter_ids(request)

    with data_archive(archive) as data:
        # set up an empty repo to import into
        repo_path = tmp_path / "repo"
        repo_path.mkdir()

        benchmark.group = f"test_write_feature_performance - {param_ids[-1]}"

        with chdir(repo_path):
            r = cli_runner.invoke(["init"])
            assert r.exit_code == 0, r

            repo = KartRepo(repo_path)

            source = TableImportSource.open(data / source_gpkg, table=table)
            with source:
                dataset = TableV3.new_dataset_for_writing(
                    table, source.schema, MemoryRepo())
                feature_iter = itertools.cycle(list(source.features()))

                index = pygit2.Index()

                encode_kwargs = {"schema": source.schema}

                def _write_feature():
                    feature = next(feature_iter)
                    dest_path, dest_data = dataset.encode_feature(
                        feature, **encode_kwargs)
                    blob_id = repo.create_blob(dest_data)
                    entry = pygit2.IndexEntry(f"{dataset.path}/{dest_path}",
                                              blob_id,
                                              pygit2.GIT_FILEMODE_BLOB)
                    index.add(entry)

                benchmark(_write_feature)
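
The benchmark deliberately stops at index.add: it measures encoding plus index insertion, never tree writing. For completeness, the in-memory index can be flushed to a real git tree with pygit2's standard API - a throwaway sketch, not part of the benchmark:

import tempfile

import pygit2

repo = pygit2.init_repository(tempfile.mkdtemp())  # scratch repo
index = pygit2.Index()
blob_id = repo.create_blob(b"feature bytes")
index.add(pygit2.IndexEntry(
    "mytable/.table-dataset/feature/A/A/A/A/kQA=",
    blob_id,
    pygit2.GIT_FILEMODE_BLOB,
))
tree_id = index.write_tree(repo)  # writes all accumulated entries as a tree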
Example #7
def test_pk_encoder_string_pk():
    schema = Schema.from_column_dicts([{
        "name": "mypk",
        "dataType": "text",
        "id": "abc123"
    }])
    ds = TableV3.new_dataset_for_writing("mytable", schema, MemoryRepo())
    e = ds.feature_path_encoder
    assert isinstance(e, MsgpackHashPathEncoder)
    assert e.encoding == "base64"
    assert e.branches == 64
    assert e.levels == 4
    assert ds.encode_1pk_to_path(
        "") == "mytable/.table-dataset/feature/I/6/M/_/kaA="
    assert (ds.encode_1pk_to_path("Dave") ==
            "mytable/.table-dataset/feature/s/v/7/j/kaREYXZl")
Example #8
def test_schema_change_roundtrip(gen_uuid):
    old_schema = Schema([
        ColumnSchema(gen_uuid(), "ID", "integer", 0),
        ColumnSchema(gen_uuid(), "given_name", "text", None),
        ColumnSchema(gen_uuid(), "surname", "text", None),
        ColumnSchema(gen_uuid(), "date_of_birth", "date", None),
    ])
    new_schema = Schema([
        ColumnSchema(old_schema[0].id, "personnel_id", "integer", 0),
        ColumnSchema(gen_uuid(), "tax_file_number", "text", None),
        ColumnSchema(old_schema[2].id, "last_name", "text", None),
        ColumnSchema(old_schema[1].id, "first_name", "text", None),
        ColumnSchema(gen_uuid(), "middle_names", "text", None),
    ])
    # Updating the schema without updating features is only possible
    # if the old and new schemas have the same primary key columns:
    assert old_schema.is_pk_compatible(new_schema)

    feature_tuple = (7, "Joe", "Bloggs", "1970-01-01")
    feature_dict = {
        "given_name": "Joe",
        "surname": "Bloggs",
        "date_of_birth": "1970-01-01",
        "ID": 7,
    }

    empty_dataset = TableV3.new_dataset_for_writing(DATASET_PATH, old_schema,
                                                    MemoryRepo())
    feature_path, feature_data = empty_dataset.encode_feature(
        feature_tuple, old_schema)
    feature_path2, feature_data2 = empty_dataset.encode_feature(
        feature_dict, old_schema)
    # Either encode method should give the same result.
    assert (feature_path, feature_data) == (feature_path2, feature_data2)

    # The dataset should store only the current schema, but all legends.
    schema_path, schema_data = empty_dataset.encode_schema(new_schema)
    new_legend_path, new_legend_data = empty_dataset.encode_legend(
        new_schema.legend)
    old_legend_path, old_legend_data = empty_dataset.encode_legend(
        old_schema.legend)
    tree = MemoryTree({
        schema_path: schema_data,
        new_legend_path: new_legend_data,
        old_legend_path: old_legend_data,
        feature_path: feature_data,
    })

    tableV3 = TableV3(tree / DATASET_PATH, DATASET_PATH, MemoryRepo())
    # Old columns that are not present in the new schema are gone.
    # New columns that are not present in the old schema are filled with None.
    roundtripped = tableV3.get_feature(path=feature_path)
    assert roundtripped == {
        "personnel_id": 7,
        "tax_file_number": None,
        "last_name": "Bloggs",
        "first_name": "Joe",
        "middle_names": None,
    }
    # We guarantee that the dict iterates in row-order.
    assert tuple(roundtripped.values()) == (7, None, "Bloggs", "Joe", None)
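
The mechanics behind this roundtrip: the feature blob records which legend encoded it, and columns are tracked by id rather than name, so on read the stored values are matched to the current schema's column ids. Renamed columns keep their values, dropped columns disappear, and added columns surface as None. A compressed, illustrative sketch of that id-matching step (not Kart's code):

def adapt(old_values_by_col_id, new_schema_columns):
    # new_schema_columns: (column id, name) pairs in row order.
    # .get() returns None for ids the old schema never had.
    return {name: old_values_by_col_id.get(col_id)
            for col_id, name in new_schema_columns}


old = {"id-1": 7, "id-2": "Joe", "id-3": "Bloggs", "id-4": "1970-01-01"}
new_columns = [
    ("id-1", "personnel_id"),
    ("id-5", "tax_file_number"),
    ("id-3", "last_name"),
    ("id-2", "first_name"),
    ("id-6", "middle_names"),
]
assert adapt(old, new_columns) == {
    "personnel_id": 7,
    "tax_file_number": None,
    "last_name": "Bloggs",
    "first_name": "Joe",
    "middle_names": None,
}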