Code example #1
def test_csv_mac_newlines():
    # Test a CSV file with old Mac-style newlines (\r)

    with open(os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv"),
              "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "utf-8"
        assert options.header is True

        data = list(reader)
        assert len(data) == 5
        assert data[0] == ["fruit_id", "timestamp", "name"]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1,
                        name="fruit_id",
                        pg_type="integer",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=2,
                        name="timestamp",
                        pg_type="timestamp",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=3,
                        name="name",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
        ]
Code example #2
def test_table_schema_params_to_dict():
    assert table_schema_params_to_dict({
        "fruits": (
            [
                TableColumn(ordinal=1,
                            name="fruit_id",
                            pg_type="integer",
                            is_pk=False,
                            comment=None),
                TableColumn(
                    ordinal=2,
                    name="name",
                    pg_type="character varying",
                    is_pk=False,
                    comment=None,
                ),
            ],
            {
                "key": "value"
            },
        ),
        "vegetables": (
            [
                TableColumn(ordinal=1,
                            name="vegetable_id",
                            pg_type="integer",
                            is_pk=False,
                            comment=None),
                TableColumn(
                    ordinal=2,
                    name="name",
                    pg_type="character varying",
                    is_pk=False,
                    comment=None,
                ),
            ],
            {
                "key": "value"
            },
        ),
    }) == {
        "fruits": {
            "schema": {
                "fruit_id": "integer",
                "name": "character varying"
            },
            "options": {
                "key": "value"
            },
        },
        "vegetables": {
            "schema": {
                "name": "character varying",
                "vegetable_id": "integer"
            },
            "options": {
                "key": "value"
            },
        },
    }
Code example #3
File: test_socrata.py  Project: yanyu510/splitgraph
def test_socrata_column_deduplication():
    assert dedupe_sg_schema(
        [
            TableColumn(1, "normal_col", "some_type", True),
            TableColumn(
                2,
                "long_col_but_not_unique_until_the_59th_char_somewhere_there_yep_this_is_different",
                "some_type",
                False,
            ),
            TableColumn(3, "long_col_but_still_unique" * 3, "some_type", False),
            TableColumn(
                4,
                "long_col_but_not_unique_until_the_59th_char_somewhere_there_and_this_is_even_more_so",
                "some_type",
                False,
            ),
            TableColumn(
                5,
                "long_col_but_not_unique_until_the_59th_char_somewhere_there_and_wow_yep_were_done",
                "some_type",
                False,
            ),
        ]
    ) == [
        TableColumn(ordinal=1, name="normal_col", pg_type="some_type", is_pk=True, comment=None),
        TableColumn(
            ordinal=2,
            name="long_col_but_not_unique_until_the_59th_char_somewhere_there_000",
            pg_type="some_type",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=3,
            name="long_col_but_still_uniquelong_col_but_still_uniquelong_col_but_",
            pg_type="some_type",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=4,
            name="long_col_but_not_unique_until_the_59th_char_somewhere_there_001",
            pg_type="some_type",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=5,
            name="long_col_but_not_unique_until_the_59th_char_somewhere_there_002",
            pg_type="some_type",
            is_pk=False,
            comment=None,
        ),
    ]
Code example #4
def socrata_to_sg_schema(metadata: Dict[str, Any]) -> Tuple[TableSchema, Dict[str, str]]:
    try:
        col_names = metadata["resource"]["columns_field_name"]
        col_types = metadata["resource"]["columns_datatype"]
    except KeyError:
        raise ValueError("Invalid Socrata metadata!")

    col_desc = metadata["resource"].get("columns_description") or [None] * len(col_names)

    # Prepend the Socrata :id column that we can order on and use as PK.
    col_names = [":id"] + col_names
    col_types = ["text"] + col_types
    col_desc = ["Socrata column ID"] + col_desc

    result = [
        TableColumn(i, n, _socrata_to_pg_type(t), False, d)
        for i, (n, t, d) in enumerate(zip(col_names, col_types, col_desc))
    ]

    # Truncate Socrata column names to 63 characters and calculate
    # a map of Splitgraph columns to Socrata columns.
    result_deduped = dedupe_sg_schema(result)

    sg_to_socrata_cols = {
        d.name: r.name for r, d in zip(result, result_deduped) if d.name != r.name
    }

    return result_deduped, sg_to_socrata_cols
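
A quick sketch of how this behaves on minimal input (the metadata dict below is hypothetical, and the call assumes _socrata_to_pg_type and dedupe_sg_schema from this module are in scope):

# Two short column names, no descriptions in the metadata.
metadata = {
    "resource": {
        "columns_field_name": ["name", "salary"],
        "columns_datatype": ["text", "number"],
    }
}
schema, sg_to_socrata_cols = socrata_to_sg_schema(metadata)
assert schema[0].name == ":id"   # the prepended PK/ordering column
assert sg_to_socrata_cols == {}  # nothing was truncated or renamed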
Code example #5
def infer_sg_schema(
    sample: List[Tuple[str, ...]],
    override_types: Optional[Dict[str, str]] = None,
    primary_keys: Optional[List[str]] = None,
):
    override_types = override_types or {}
    primary_keys = primary_keys or []
    result: TableSchema = []

    header = sample[0]
    columns = list(zip(*sample[1:]))
    if len(columns) != len(header):
        raise ValueError(
            "Malformed CSV: header has %d columns, rows have %d columns" %
            (len(header), len(columns)))

    for i, (c_name, c_sample) in enumerate(zip(header, columns)):
        pg_type = override_types.get(c_name, _infer_column_schema(c_sample))

        result.append(
            TableColumn(
                ordinal=i + 1,
                name=c_name,
                pg_type=pg_type,
                is_pk=(c_name in primary_keys),
            ))

    return result
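
A minimal usage sketch (the sample rows here are hypothetical; it assumes _infer_column_schema maps integer-looking samples to "integer" and falls back to "character varying", as the CSV tests in this listing suggest):

sample = [
    ("fruit_id", "name"),  # header row
    ("1", "apple"),
    ("2", "orange"),
]
schema = infer_sg_schema(sample, primary_keys=["fruit_id"])
# Expected, per the tests above:
# [TableColumn(1, "fruit_id", "integer", True, None),
#  TableColumn(2, "name", "character varying", False, None)]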
Code example #6
    def get_full_table_schema(self, schema: str,
                              table_name: str) -> "TableSchema":
        """
        Generates a list of (column ordinal, name, data type, is_pk, column comment),
        used to detect schema changes like columns being dropped/added/renamed or type changes.

        NB this doesn't work for temporary tables (pg_temp) and returns an empty schema.
        """
        assert schema != "pg_temp"

        results = self.run_sql(
            SQL("SELECT c.attnum, c.attname, "
                "pg_catalog.format_type(c.atttypid, c.atttypmod), "
                "col_description('{}.{}'::regclass, c.attnum) "
                "FROM pg_attribute c JOIN pg_class t ON c.attrelid = t.oid "
                "JOIN pg_namespace n ON t.relnamespace = n.oid "
                "WHERE n.nspname = %s AND t.relname = %s AND NOT c.attisdropped "
                "AND c.attnum >= 0 ORDER BY c.attnum ").format(
                    Identifier(schema), Identifier(table_name)),
            (schema, table_name),
        )

        def _convert_type(ctype):
            # We don't keep a lot of type information, so e.g. char(5) gets turned into char,
            # which defaults to char(1).
            return ctype if ctype != "character" else "character varying"

        # Do we need to make sure the PK has the same type + ordinal position here?
        pks = [pk for pk, _ in self.get_primary_keys(schema, table_name)]

        return [
            TableColumn(o, n, _convert_type(dt), (n in pks), c)
            for o, n, dt, c in results
        ]
Code example #7
def test_schema_changes(pg_repo_local, test_case):
    action, expected_new_schema = test_case

    assert (
        pg_repo_local.engine.get_full_table_schema(pg_repo_local.to_schema(), "fruits")
        == OLD_SCHEMA
    )
    pg_repo_local.run_sql(action)
    pg_repo_local.commit_engines()
    assert (
        pg_repo_local.engine.get_full_table_schema(pg_repo_local.to_schema(), "fruits")
        == expected_new_schema
    )

    head = pg_repo_local.head
    new_head = pg_repo_local.commit()

    # Test that the new image was stored as a new object with the new schema.
    assert len(new_head.get_table("fruits").objects) == 1
    new_snap = new_head.get_table("fruits").objects[0]
    assert pg_repo_local.engine.get_object_schema(new_snap) == _drop_comments(
        expected_new_schema
        + [TableColumn(expected_new_schema[-1].ordinal + 1, SG_UD_FLAG, "boolean", False)]
    )

    head.checkout()
    assert (
        pg_repo_local.engine.get_full_table_schema(pg_repo_local.to_schema(), "fruits")
        == OLD_SCHEMA
    )

    new_head.checkout()
    assert pg_repo_local.engine.get_full_table_schema(
        pg_repo_local.to_schema(), "fruits"
    ) == _reassign_ordinals(expected_new_schema)
Code example #8
def test_mount_rename_table(local_engine_empty):
    tables = {
        "fruits_renamed": (
            [
                TableColumn(ordinal=1,
                            name="fruit_id",
                            pg_type="integer",
                            is_pk=False,
                            comment=None),
                TableColumn(
                    ordinal=2,
                    name="name",
                    pg_type="character varying",
                    is_pk=False,
                    comment=None,
                ),
            ],
            {
                "table_name": "fruits"
            },
        )
    }
    handler = PostgreSQLDataSource(
        engine=local_engine_empty,
        credentials={
            "username": "******",
            "password": "******"
        },
        params={
            "host": "pgorigin",
            "port": 5432,
            "dbname": "origindb",
            "remote_schema": "public"
        },
        tables=tables,
    )

    preview = handler.preview(tables)
    assert preview == {
        "fruits_renamed": [{
            "fruit_id": 1,
            "name": "apple"
        }, {
            "fruit_id": 2,
            "name": "orange"
        }],
    }
Code example #9
File: fdw.py  Project: splitgraph/splitgraph
    def get_table_schema(self, table_name, table_schema):
        # Add the "_id" column to the schema if it's not already there.
        if any(c.name == "_id" for c in table_schema):
            return table_schema

        return table_schema + [
            TableColumn(table_schema[-1].ordinal + 1, "_id", "NAME", False)
        ]
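
A hedged illustration of the method above (here fdw stands for an instance of the FDW class this method is defined on):

schema = [TableColumn(1, "value", "text", False)]
augmented = fdw.get_table_schema("some_table", schema)
# augmented == schema + [TableColumn(2, "_id", "NAME", False)];
# calling it again on `augmented` returns it unchanged, since an
# "_id" column is now present.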
Code example #10
def test_csv_dialect_encoding_inference():
    # Test CSV dialect inference with:
    #  - win-1252 encoding (will autodetect with chardet)
    #  - Windows line endings
    #  - different separator
    #  - first column name missing

    with open(os.path.join(INGESTION_RESOURCES_CSV, "encoding-win-1252.csv"),
              "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "Windows-1252"
        assert options.header is True
        # NB we don't extract everything from the sniffed dialect, just the delimiter and the
        # quotechar. The sniffer also returns doublequote and skipinitialspace.
        assert options.delimiter == ";"

        data = list(reader)

        assert data == [
            ["", "DATE", "TEXT"],
            ["1", "01/07/2021", "Pañamao"],
            ["2", "06/11/2018", "–"],
            ["3", "28/05/2018", "División"],
        ]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1,
                        name="col_1",
                        pg_type="integer",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=2,
                        name="DATE",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=3,
                        name="TEXT",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
        ]
Code example #11
File: test_mounting.py  Project: dazzag24/splitgraph
def test_mount_elasticsearch(local_engine_empty):
    # No ES running in this stack: this is just a test that we can instantiate the FDW.
    repo = Repository("test", "es_mount")
    try:
        mount(
            repo.to_schema(),
            "elasticsearch",
            dict(
                username=None,
                password=None,
                server="elasticsearch",
                port=9200,
                table_spec={
                    "table_1": {
                        "schema": {
                            "id": "text",
                            "@timestamp": "timestamp",
                            "query": "text",
                            "col_1": "text",
                            "col_2": "boolean",
                        },
                        "index": "index-pattern*",
                        "rowid_column": "id",
                        "query_column": "query",
                    }
                },
            ),
        )

        assert get_engine().get_full_table_schema(repo.to_schema(), "table_1") == [
            TableColumn(ordinal=1, name="id", pg_type="text", is_pk=False, comment=None),
            TableColumn(
                ordinal=2,
                name="@timestamp",
                pg_type="timestamp without time zone",
                is_pk=False,
                comment=None,
            ),
            TableColumn(ordinal=3, name="query", pg_type="text", is_pk=False, comment=None),
            TableColumn(ordinal=4, name="col_1", pg_type="text", is_pk=False, comment=None),
            TableColumn(ordinal=5, name="col_2", pg_type="boolean", is_pk=False, comment=None),
        ]

    finally:
        repo.delete()
Code example #12
def test_pandas_basic_insert(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    ingestion_test_repo.commit()

    assert ingestion_test_repo.head.get_table("test_table").table_schema == [
        TableColumn(1, "fruit_id", "bigint", True),
        TableColumn(2, "timestamp", "timestamp without time zone", False),
        TableColumn(3, "name", "text", False),
    ]

    assert ingestion_test_repo.run_sql(
        "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id"
    ) == [
        (1, dt(2018, 1, 1, 0, 11, 11), "apple"),
        (2, dt(2018, 1, 2, 0, 22, 22), "orange"),
        (3, dt(2018, 1, 3, 0, 33, 33), "mayonnaise"),
        (4, dt(2018, 1, 4, 0, 44, 44), "mustard"),
    ]
Code example #13
def dedupe_sg_schema(schema_spec: TableSchema, prefix_len: int = 59) -> TableSchema:
    """
    Some Socrata schemas have columns that are longer than 63 characters
    where the first 63 characters are the same between several columns
    (e.g. odn.data.socrata.com). This routine renames columns in a schema
    to make sure this can't happen (by giving duplicates a number suffix).
    """

    # We truncate the column name to 59 to leave space for the underscore
    # and 3 digits (max PG identifier is 63 chars)
    prefix_counts: Dict[str, int] = {}
    columns_nums: List[Tuple[str, int]] = []

    for column in schema_spec:
        column_short = column.name[:prefix_len]
        count = prefix_counts.get(column_short, 0)
        columns_nums.append((column_short, count))
        prefix_counts[column_short] = count + 1

    result = []
    for (_, position), column in zip(columns_nums, schema_spec):
        column_short = column.name[:prefix_len]
        count = prefix_counts[column_short]
        if count > 1:
            result.append(
                TableColumn(
                    column.ordinal,
                    f"{column_short}_{position:03d}",
                    column.pg_type,
                    column.is_pk,
                    column.comment,
                )
            )
        else:
            result.append(
                TableColumn(
                    column.ordinal,
                    column.name[:POSTGRES_MAX_IDENTIFIER],
                    column.pg_type,
                    column.is_pk,
                    column.comment,
                )
            )
    return result
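
A small sketch of the collision case this guards against (assumes the function above and TableColumn are in scope; POSTGRES_MAX_IDENTIFIER is PostgreSQL's 63-character identifier limit):

cols = [
    TableColumn(1, "x" * 59 + "_first", "text", False),
    TableColumn(2, "x" * 59 + "_second", "text", False),
]
deduped = dedupe_sg_schema(cols)
# Both names share the same 59-character prefix, so they become
# "xxx..._000" and "xxx..._001" (59 + 4 = 63 characters) and stay
# unique after PostgreSQL's 63-character identifier truncation.
assert deduped[0].name.endswith("_000")
assert deduped[1].name.endswith("_001")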
Code example #14
def test_pandas_no_processing_insert(ingestion_test_repo):
    # Make sure everything still works when we don't have a PK.
    df = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "base_df.csv"))
    df_to_table(df, ingestion_test_repo, "test_table")
    ingestion_test_repo.commit()

    assert ingestion_test_repo.head.get_table("test_table").table_schema == [
        TableColumn(1, "fruit_id", "bigint", False),
        TableColumn(2, "timestamp", "text", False),
        TableColumn(3, "name", "text", False),
    ]

    assert ingestion_test_repo.run_sql(
        "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id"
    ) == [
        (1, "2018-01-01 00:11:11", "apple"),
        (2, "2018-01-02 00:22:22", "orange"),
        (3, "2018-01-03 00:33:33", "mayonnaise"),
        (4, "2018-01-04 00:44:44", "mustard"),
    ]
Code example #15
def _mount_mysql(repository):
    # We don't use this one in tests beyond basic mounting, so no point importing it.
    mount(
        repository.to_schema(),
        "mysql_fdw",
        dict(
            host="mysqlorigin",
            port=3306,
            username="******",
            password="******",
            dbname="mysqlschema",
        ),
        tables={
            "mushrooms": (
                [
                    TableColumn(1, "mushroom_id", "integer", False),
                    TableColumn(2, "name", "character varying (20)", False),
                    TableColumn(3, "discovery", "timestamp", False),
                    TableColumn(4, "friendly", "boolean", False),
                    TableColumn(5, "binary_data", "bytea", False),
                    TableColumn(6, "varbinary_data", "bytea", False),
                ],
                {},
            )
        },
    )
Code example #16
def test_dict_to_table_schema_params():
    assert dict_to_table_schema_params(
        {
            k: ExternalTableRequest.parse_obj(v)
            for k, v in {
                "fruits": {
                    "schema": {"fruit_id": "integer", "name": "character varying"},
                    "options": {"key": "value"},
                },
                "vegetables": {
                    "schema": {"name": "character varying", "vegetable_id": "integer"},
                    "options": {"key": "value"},
                },
            }.items()
        }
    ) == {
        "fruits": (
            [
                TableColumn(
                    ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None
                ),
                TableColumn(
                    ordinal=2, name="name", pg_type="character varying", is_pk=False, comment=None
                ),
            ],
            {"key": "value"},
        ),
        "vegetables": (
            [
                TableColumn(
                    ordinal=1, name="name", pg_type="character varying", is_pk=False, comment=None
                ),
                TableColumn(
                    ordinal=2, name="vegetable_id", pg_type="integer", is_pk=False, comment=None
                ),
            ],
            {"key": "value"},
        ),
    }
Code example #17
def test_mount_force_schema(local_engine_empty):
    _mount_postgres(
        PG_MNT,
        tables={"fruits": {
            "schema": {
                "fruit_id": "character varying"
            }
        }})
    assert get_engine().table_exists(PG_MNT.to_schema(), "fruits")
    assert get_engine().get_full_table_schema(
        PG_MNT.to_schema(), "fruits") == [
            TableColumn(1, "fruit_id", "character varying", False, None)
        ]
Code example #18
def generate_table(repository: "Repository", table_name: str, size: int) -> None:
    """
    Creates a table with an integer primary key and a string value.

    :param repository: Checked-out Repository to create the table in.
    :param table_name: Name of the table to generate
    :param size: Number of rows in the table.
    """
    from psycopg2.sql import SQL
    from psycopg2.sql import Identifier

    repository.engine.create_table(
        repository.to_schema(),
        table_name,
        [
            TableColumn(1, "key", "integer", True, "Some key"),
            TableColumn(2, "value", "varchar", False, "Some value"),
        ],
    )
    repository.engine.run_sql_batch(
        SQL("INSERT INTO {} VALUES (%s, %s)").format(Identifier(table_name)),
        [(i, _hash(i)) for i in range(size)],
        schema=repository.to_schema(),
    )
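
A hedged usage note (repo stands for any checked-out Repository; _hash is the helper this function references):

generate_table(repo, "demo", size=3)
# "demo" now holds rows (0, _hash(0)), (1, _hash(1)), (2, _hash(2)),
# with "key" as a commented integer primary key.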
Code example #19
File: querying.py  Project: cxz/splitgraph
def socrata_to_sg_schema(metadata: Dict[str, Any]) -> TableSchema:
    try:
        col_names = metadata["resource"]["columns_field_name"]
        col_types = metadata["resource"]["columns_datatype"]
    except KeyError:
        raise ValueError("Invalid Socrata metadata!")

    col_desc = metadata["resource"].get(
        "columns_description") or [None] * len(col_names)

    # Prepend the Socrata :id column that we can order on and use as PK.
    col_names = [":id"] + col_names
    col_types = ["text"] + col_types
    col_desc = ["Socrata column ID"] + col_desc

    result = [
        TableColumn(i, n, _socrata_to_pg_type(t), False, d)
        for i, (n, t, d) in enumerate(zip(col_names, col_types, col_desc))
    ]

    return dedupe_sg_schema(result)
Code example #20
def test_singer_ingestion_schema_change(local_engine_empty):
    # Run the initial ingestion and then another one where we've changed the user_id in
    # stargazers to be a string.

    runner = CliRunner(mix_stderr=False)

    with open(os.path.join(INGESTION_RESOURCES, "singer/initial.json"),
              "r") as f:
        result = runner.invoke(singer_target, [TEST_REPO + ":latest"],
                               input=f,
                               catch_exceptions=False)

    assert result.exit_code == 0

    with open(os.path.join(INGESTION_RESOURCES, "singer/schema_change.json"),
              "r") as f:
        result = runner.invoke(singer_target, [TEST_REPO + ":latest"],
                               input=f,
                               catch_exceptions=False)

    assert result.exit_code == 0

    assert json.loads(result.stdout) == {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "stargazers": {
                    "since": "2020-10-14T11:06:42.565793Z"
                },
            }
        }
    }
    repo = Repository.from_schema(TEST_REPO)

    assert len(repo.images()) == 2
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == ["releases", "stargazers"]
    image.checkout()

    assert repo.run_sql("SELECT COUNT(1) FROM releases",
                        return_shape=ResultShape.ONE_ONE) == 6
    assert repo.run_sql("SELECT COUNT(1) FROM stargazers",
                        return_shape=ResultShape.ONE_ONE) == 6

    assert repo.run_sql(
        "SELECT user_id, starred_at FROM stargazers ORDER BY user_id") == [
            ("100001", datetime(2018, 10, 17, 22, 14, 12)),
            ("100002", datetime(2018, 11, 6, 11, 26, 16)),
            ("100003", datetime(2018, 12, 11, 16, 0, 42)),
            ("100004", datetime(2020, 10, 11, 21, 9, 30)),
            ("100005", datetime(2019, 4, 18, 2, 40, 47)),
            ("string_user_id", datetime(2019, 4, 18, 2, 40, 47)),
        ]

    # Releases unchanged -- same table
    assert image.get_table("releases").table_schema == _RELEASES_SCHEMA
    assert image.get_table("releases").objects == [
        "o160e0b0db4ad7e7eb7c4db26bf8183461f65968be64b8594c7cc71fbf5ff2a"
    ]

    assert image.get_table("stargazers").table_schema == [
        TableColumn(
            ordinal=0,
            name="_sdc_repository",
            pg_type="character varying",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=1,
            name="starred_at",
            pg_type="timestamp without time zone",
            is_pk=False,
            comment=None,
        ),
        TableColumn(ordinal=2,
                    name="user",
                    pg_type="jsonb",
                    is_pk=False,
                    comment=None),
        TableColumn(ordinal=3,
                    name="user_id",
                    pg_type="character varying",
                    is_pk=True,
                    comment=None),
    ]

    # Stargazers: the schema migrated, so a new object was written
    assert image.get_table("stargazers").objects == [
        "o9e54958076c86d854ad21da17239daecaec839e84daee8ff9ca5dcecd84cdd"
    ]
Code example #21
def test_singer_tap_mysql_introspection(local_engine_empty):
    source = _source(local_engine_empty)
    assert source.introspect() == {
        "mushrooms": (
            [
                TableColumn(
                    ordinal=0,
                    name="discovery",
                    pg_type="timestamp without time zone",
                    is_pk=False,
                    comment=None,
                ),
                TableColumn(ordinal=1,
                            name="friendly",
                            pg_type="boolean",
                            is_pk=False,
                            comment=None),
                TableColumn(ordinal=2,
                            name="mushroom_id",
                            pg_type="integer",
                            is_pk=True,
                            comment=None),
                TableColumn(ordinal=3,
                            name="name",
                            pg_type="character varying",
                            is_pk=False,
                            comment=None),
            ],
            {},
        )
    }

    singer_config = source.get_singer_config()
    assert singer_config == {
        "host": "localhost",
        "password": "******",
        "port": 3306,
        "replication_method": "INCREMENTAL",
        "user": "******",
    }

    # Binary datatypes aren't supported by tap-mysql, but we make sure the
    # discovered catalog is aware of them (it marks them as unsupported).
    singer_catalog = source._run_singer_discovery(singer_config)
    assert singer_catalog == {
        "streams": [{
            "metadata": mock.ANY,
            "schema": {
                "properties": {
                    "discovery": {
                        "format": "date-time",
                        "inclusion": "available",
                        "type": ["null", "string"],
                    },
                    "friendly": {
                        "inclusion": "available",
                        "type": ["null", "boolean"]
                    },
                    "mushroom_id": {
                        "inclusion": "automatic",
                        "maximum": 2147483647,
                        "minimum": -2147483648,
                        "type": ["null", "integer"],
                    },
                    "name": {
                        "inclusion": "available",
                        "maxLength": 20,
                        "type": ["null", "string"],
                    },
                    "binary_data": {
                        "description": "Unsupported column type binary(7)",
                        "inclusion": "unsupported",
                    },
                    "varbinary_data": {
                        "description": "Unsupported column type varbinary(16)",
                        "inclusion": "unsupported",
                    },
                },
                "type": "object",
            },
            "stream": "mushrooms",
            "table_name": "mushrooms",
            "tap_stream_id": "mysqlschema-mushrooms",
        }]
    }

    assert sorted(singer_catalog["streams"][0]["metadata"],
                  key=lambda m: m["breadcrumb"]) == [
                      {
                          "breadcrumb": [],
                          "metadata": {
                              "database-name": "mysqlschema",
                              "is-view": False,
                              "row-count": 2,
                              "selected-by-default": False,
                              "table-key-properties": ["mushroom_id"],
                          },
                      },
                      {
                          "breadcrumb": ["properties", "binary_data"],
                          "metadata": {
                              "selected-by-default": False,
                              "sql-datatype": "binary(7)"
                          },
                      },
                      {
                          "breadcrumb": ["properties", "discovery"],
                          "metadata": {
                              "selected-by-default": True,
                              "sql-datatype": "datetime"
                          },
                      },
                      {
                          "breadcrumb": ["properties", "friendly"],
                          "metadata": {
                              "selected-by-default": True,
                              "sql-datatype": "tinyint(1)"
                          },
                      },
                      {
                          "breadcrumb": ["properties", "mushroom_id"],
                          "metadata": {
                              "selected-by-default": True,
                              "sql-datatype": "int(11)"
                          },
                      },
                      {
                          "breadcrumb": ["properties", "name"],
                          "metadata": {
                              "selected-by-default": True,
                              "sql-datatype": "varchar(20)",
                          },
                      },
                      {
                          "breadcrumb": ["properties", "varbinary_data"],
                          "metadata": {
                              "selected-by-default": False,
                              "sql-datatype": "varbinary(16)"
                          },
                      },
                  ]

    selected_catalog = source.build_singer_catalog(singer_catalog, tables=None)
    assert select_breadcrumb(selected_catalog["streams"][0], []) == {
        "database-name": "mysqlschema",
        "is-view": False,
        "replication-key": "mushroom_id",
        "replication-method": "INCREMENTAL",
        "row-count": 2,
        "selected": True,
        "selected-by-default": False,
        "table-key-properties": ["mushroom_id"],
    }
Code example #22
def test_mount_introspection_preview(local_engine_empty):
    handler = PostgreSQLDataSource(
        engine=local_engine_empty,
        credentials={
            "username": "******",
            "password": "******"
        },
        params={
            "host": "pgorigin",
            "port": 5432,
            "dbname": "origindb",
            "remote_schema": "public"
        },
    )

    tables = handler.introspect()

    assert tables == {
        "fruits": (
            [
                TableColumn(ordinal=1,
                            name="fruit_id",
                            pg_type="integer",
                            is_pk=False,
                            comment=None),
                TableColumn(ordinal=2,
                            name="name",
                            pg_type="character varying",
                            is_pk=False,
                            comment=None),
            ],
            {
                "schema_name": "public",
                "table_name": "fruits"
            },
        ),
        "vegetables": (
            [
                TableColumn(ordinal=1,
                            name="vegetable_id",
                            pg_type="integer",
                            is_pk=False,
                            comment=None),
                TableColumn(ordinal=2,
                            name="name",
                            pg_type="character varying",
                            is_pk=False,
                            comment=None),
            ],
            {
                "schema_name": "public",
                "table_name": "vegetables"
            },
        ),
    }

    preview = handler.preview(tables=tables)
    assert preview == {
        "fruits": [{
            "fruit_id": 1,
            "name": "apple"
        }, {
            "fruit_id": 2,
            "name": "orange"
        }],
        "vegetables": [
            {
                "name": "potato",
                "vegetable_id": 1
            },
            {
                "name": "carrot",
                "vegetable_id": 2
            },
        ],
    }
Code example #23
from splitgraph.core.repository import Repository
from splitgraph.core.types import TableColumn
from splitgraph.engine import ResultShape
from splitgraph.ingestion.singer.commandline import singer_target
from splitgraph.ingestion.singer.data_source import GenericSingerDataSource, MySQLSingerDataSource
from splitgraph.ingestion.singer.db_sync import select_breadcrumb

TEST_REPO = "test/singer"
TEST_TAP = os.path.join(INGESTION_RESOURCES, "singer/fake_tap.py")

_STARGAZERS_SCHEMA = [
    TableColumn(
        ordinal=0,
        name="_sdc_repository",
        pg_type="character varying",
        is_pk=False,
        comment=None,
    ),
    TableColumn(
        ordinal=1,
        name="starred_at",
        pg_type="timestamp without time zone",
        is_pk=False,
        comment=None,
    ),
    TableColumn(ordinal=2,
                name="user",
                pg_type="jsonb",
                is_pk=False,
                comment=None),
Code example #24
File: db_sync.py  Project: Trase/splitgraph
def _get_sg_schema(flattened_schema, primary_key) -> TableSchema:
    return [
        TableColumn(i, name, column_type(schema_property), name in primary_key, None)
        for i, (name, schema_property) in enumerate(flattened_schema.items())
    ]
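
A short sketch of the helper above (the flattened Singer schema is hypothetical; column_type is assumed to map JSON-schema properties to PG types, as the Singer tests in this listing show):

flattened = {
    "user_id": {"type": ["null", "integer"]},
    "starred_at": {"type": ["null", "string"], "format": "date-time"},
}
_get_sg_schema(flattened, primary_key=["user_id"])
# Ordinals start at 0 here (matching _STARGAZERS_SCHEMA elsewhere in
# this listing), and "user_id" is flagged as the primary key.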
Code example #25
File: test_socrata.py  Project: cxz/splitgraph
def test_socrata_mounting(local_engine_empty):
    with open(os.path.join(INGESTION_RESOURCES, "socrata/find_datasets.json"),
              "r") as f:
        socrata_meta = json.load(f)

    socrata = MagicMock(spec=Socrata)
    socrata.datasets.return_value = socrata_meta
    with mock.patch("sodapy.Socrata", return_value=socrata):

        mount_socrata(
            "test/pg_mount",
            None,
            None,
            None,
            None,
            "example.com",
            {"some_table": "xzkq-xp2w"},
            "some_token",
        )

    assert local_engine_empty.get_full_table_schema(
        "test/pg_mount", "some_table"
    ) == [
        TableColumn(ordinal=1,
                    name=":id",
                    pg_type="text",
                    is_pk=False,
                    comment="Socrata column ID"),
        TableColumn(
            ordinal=2,
            name="full_or_part_time",
            pg_type="text",
            is_pk=False,
            comment=
            "Whether the employee was employed full- (F) or part-time (P).",
        ),
        TableColumn(ordinal=3,
                    name="hourly_rate",
                    pg_type="numeric",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(ordinal=4,
                    name="salary_or_hourly",
                    pg_type="text",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(
            ordinal=5,
            name="job_titles",
            pg_type="text",
            is_pk=False,
            comment="Title of employee at the time when the data was updated.",
        ),
        TableColumn(ordinal=6,
                    name="typical_hours",
                    pg_type="numeric",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(ordinal=7,
                    name="annual_salary",
                    pg_type="numeric",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(ordinal=8,
                    name="name",
                    pg_type="text",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(
            ordinal=9,
            name="department",
            pg_type="text",
            is_pk=False,
            comment="Department where employee worked.",
        ),
    ]
Code example #26
File: base.py  Project: Trase/splitgraph
from splitgraph.core.types import (
    TableColumn,
    Credentials,
    Params,
    TableInfo,
    SyncState,
    MountError,
    IntrospectionResult,
)
from splitgraph.engine import ResultShape

if TYPE_CHECKING:
    from splitgraph.engine.postgres.engine import PostgresEngine
    from splitgraph.core.repository import Repository

INGESTION_STATE_TABLE = "_sg_ingestion_state"
INGESTION_STATE_SCHEMA = [
    TableColumn(1, "timestamp", "timestamp", True, None),
    TableColumn(2, "state", "json", False, None),
]


class DataSource(ABC):
    params_schema: Dict[str, Any]
    credentials_schema: Dict[str, Any]
    table_params_schema: Dict[str, Any]

    supports_mount = False
    supports_sync = False
    supports_load = False

    @classmethod
    @abstractmethod
Code example #27
def test_csv_data_source_s3(local_engine_empty):
    source = CSVDataSource(
        local_engine_empty,
        credentials={
            "s3_access_key": "minioclient",
            "s3_secret_key": "supersecure",
        },
        params={
            "s3_endpoint": "objectstorage:9000",
            "s3_secure": False,
            "s3_bucket": "test_csv",
            "s3_object_prefix": "some_prefix/",
        },
    )

    schema = source.introspect()

    assert len(schema.keys()) == 4
    assert schema["fruits.csv"] == (
        [
            TableColumn(ordinal=1,
                        name="fruit_id",
                        pg_type="integer",
                        is_pk=False,
                        comment=None),
            TableColumn(
                ordinal=2,
                name="timestamp",
                pg_type="timestamp without time zone",
                is_pk=False,
                comment=None,
            ),
            TableColumn(ordinal=3,
                        name="name",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=4,
                        name="number",
                        pg_type="integer",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=5,
                        name="bignumber",
                        pg_type="bigint",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=6,
                        name="vbignumber",
                        pg_type="numeric",
                        is_pk=False,
                        comment=None),
        ],
        {
            "s3_object": "some_prefix/fruits.csv",
            "autodetect_dialect": False,
            "autodetect_encoding": False,
            "autodetect_header": False,
            "delimiter": ",",
            "encoding": "utf-8",
            "header": True,
            "quotechar": '"',
        },
    )
    assert schema["encoding-win-1252.csv"] == (
        [
            TableColumn(ordinal=1,
                        name="col_1",
                        pg_type="integer",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=2,
                        name="DATE",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
            TableColumn(ordinal=3,
                        name="TEXT",
                        pg_type="character varying",
                        is_pk=False,
                        comment=None),
        ],
        {
            "s3_object": "some_prefix/encoding-win-1252.csv",
            "autodetect_dialect": False,
            "autodetect_encoding": False,
            "autodetect_header": False,
            "delimiter": ";",
            "encoding": "Windows-1252",
            "header": True,
            "quotechar": '"',
        },
    )
    assert len(schema["rdu-weather-history.csv"][0]) == 28

    assert schema["not_a_csv.txt"] == MountError(
        table_name="not_a_csv.txt",
        error="ValueError",
        error_text="Malformed CSV: header has 7 columns, rows have 0 columns",
    )

    schema = unwrap(schema)[0]

    # Add a nonexistent file to the schema with malformed params to check preview error reporting
    schema["doesnt_exist"] = (
        [],
        {
            "s3_object": "doesnt_exist"
        },
    )
    schema["exists_but_broken"] = (
        # Force a schema that doesn't work for this CSV
        [TableColumn(1, "col_1", "date", False)],
        {
            "s3_object": "some_prefix/fruits.csv"
        },
    )

    preview = source.preview(schema)
    assert len(preview.keys()) == 5
    assert len(preview["fruits.csv"]) == 4
    assert len(preview["encoding-win-1252.csv"]) == 3
    assert len(preview["rdu-weather-history.csv"]) == 10
    assert preview["doesnt_exist"] == MountError(table_name="doesnt_exist",
                                                 error="minio.error.S3Error",
                                                 error_text=mock.ANY)
    assert preview["exists_but_broken"] == MountError(
        table_name="exists_but_broken",
        error="psycopg2.errors.InvalidDatetimeFormat",
        error_text='invalid input syntax for type date: "1"',
    )

    try:
        source.mount("temp_data")

        assert local_engine_empty.run_sql(
            'SELECT COUNT(1) FROM temp_data."fruits.csv"') == [(4, )]

        # Test NULL "inference" for numbers
        assert (local_engine_empty.run_sql(
            'SELECT number FROM temp_data."fruits.csv"',
            return_shape=ResultShape.MANY_ONE,
        ) == [1, 2, None, 4])

        assert local_engine_empty.run_sql(
            'SELECT COUNT(1) FROM temp_data."rdu-weather-history.csv"') == [
                (4633, )
            ]

        assert local_engine_empty.run_sql(
            'SELECT "TEXT" FROM temp_data."encoding-win-1252.csv"') == [
                ("Pañamao", ), ("–", ), ("División", )
            ]
    finally:
        local_engine_empty.delete_schema("temp_data")
Code example #28
def test_csv_data_source_multiple(local_engine_empty):
    # End-to-end version of test_csv_introspection_multiple, checking that things
    # like table params get serialized and deserialized properly.

    url = MINIO.presigned_get_object("test_csv",
                                     "some_prefix/rdu-weather-history.csv")

    credentials = {
        "s3_access_key": "minioclient",
        "s3_secret_key": "supersecure",
    }

    params = {
        "s3_endpoint": "objectstorage:9000",
        "s3_secure": False,
        "s3_bucket": "test_csv",
        # Put this delimiter in as a canary to make sure table params override server params.
        "delimiter": ",",
    }

    tables = {
        # Pass an empty table schema to denote we want to introspect it
        "from_url": ([], {
            "url": url
        }),
        "from_s3_rdu": ([], {
            "s3_object": "some_prefix/rdu-weather-history.csv"
        }),
        "from_s3_encoding": ([], {
            "s3_object": "some_prefix/encoding-win-1252.csv"
        }),
        "from_url_broken": ([], {
            "url": "invalid_url"
        }),
        "from_s3_broken": ([], {
            "s3_object": "invalid_object"
        }),
    }

    source = CSVDataSource(
        local_engine_empty,
        credentials,
        params,
        tables,
    )

    schema = source.introspect()

    assert schema == {
        "from_url": (
            mock.ANY,
            {
                "autodetect_dialect": False,
                "url": url,
                "quotechar": '"',
                "header": True,
                "encoding": "utf-8",
                "delimiter": ";",
                "autodetect_header": False,
                "autodetect_encoding": False,
            },
        ),
        "from_s3_rdu": (
            mock.ANY,
            {
                "encoding": "utf-8",
                "autodetect_dialect": False,
                "autodetect_encoding": False,
                "autodetect_header": False,
                "delimiter": ";",
                "header": True,
                "quotechar": '"',
                "s3_object": "some_prefix/rdu-weather-history.csv",
            },
        ),
        "from_s3_encoding": (
            mock.ANY,
            {
                "s3_object": "some_prefix/encoding-win-1252.csv",
                "quotechar": '"',
                "header": True,
                "encoding": "Windows-1252",
                "autodetect_dialect": False,
                "delimiter": ";",
                "autodetect_header": False,
                "autodetect_encoding": False,
            },
        ),
        "from_url_broken":
        MountError(
            table_name="from_url_broken",
            error="requests.exceptions.MissingSchema",
            error_text=
            "Invalid URL 'invalid_url': No schema supplied. Perhaps you meant http://invalid_url?",
        ),
        "from_s3_broken":
        MountError(
            table_name="from_s3_broken",
            error="minio.error.S3Error",
            error_text=mock.ANY,
        ),
    }

    # Mount the datasets with this introspected schema.
    schema = unwrap(schema)[0]
    try:
        source.mount("temp_data", tables=schema)
        rows = local_engine_empty.run_sql(
            "SELECT * FROM temp_data.from_s3_encoding")
        assert len(rows) == 3
        assert len(rows[0]) == 3
    finally:
        local_engine_empty.delete_schema("temp_data")

    # Override the delimiter and blank out the schema for a single table
    schema["from_s3_encoding"] = (
        [],
        {
            "s3_object": "some_prefix/encoding-win-1252.csv",
            "quotechar": '"',
            "header": True,
            "encoding": "Windows-1252",
            "autodetect_dialect": False,
            # We force a delimiter "," here which will make the CSV a single-column one
            # (to test we can actually override these)
            "delimiter": ",",
            "autodetect_header": False,
            "autodetect_encoding": False,
        },
    )

    # Reintrospect the source with the new table parameters
    source = CSVDataSource(local_engine_empty, credentials, params, schema)
    new_schema = source.introspect()
    assert len(new_schema) == 3
    # Check other tables are unchanged
    assert new_schema["from_url"] == schema["from_url"]
    assert new_schema["from_s3_rdu"] == schema["from_s3_rdu"]

    # The table with the changed delimiter now has a single column (we use ","
    # as the delimiter instead of ";")
    assert new_schema["from_s3_encoding"][0] == [
        TableColumn(ordinal=1,
                    name=";DATE;TEXT",
                    pg_type="character varying",
                    is_pk=False,
                    comment=None)
    ]

    try:
        source.mount("temp_data", tables=new_schema)
        rows = local_engine_empty.run_sql(
            "SELECT * FROM temp_data.from_s3_encoding")
        assert len(rows) == 3
        # Check we get one column now
        assert rows[0] == ("1;01/07/2021;Pañamao", )
    finally:
        local_engine_empty.delete_schema("temp_data")
Code example #29
File: test_socrata.py  Project: Trase/splitgraph
def test_socrata_mounting(local_engine_empty):
    with open(os.path.join(INGESTION_RESOURCES, "socrata/find_datasets.json"),
              "r") as f:
        socrata_meta = json.load(f)

    socrata = MagicMock(spec=Socrata)
    socrata.datasets.return_value = socrata_meta
    with mock.patch("sodapy.Socrata", return_value=socrata):
        mount(
            "test/pg_mount",
            "socrata",
            {
                "domain": "example.com",
                "tables": {
                    "some_table": "xzkq-xp2w"
                },
                "app_token": "some_token",
            },
        )

    assert local_engine_empty.get_full_table_schema(
        "test/pg_mount", "some_table"
    ) == [
        TableColumn(ordinal=1,
                    name=":id",
                    pg_type="text",
                    is_pk=False,
                    comment="Socrata column ID"),
        TableColumn(
            ordinal=2,
            name="full_or_part_time",
            pg_type="text",
            is_pk=False,
            comment=
            "Whether the employee was employed full- (F) or part-time (P).",
        ),
        TableColumn(ordinal=3,
                    name="hourly_rate",
                    pg_type="numeric",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(ordinal=4,
                    name="salary_or_hourly",
                    pg_type="text",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(
            ordinal=5,
            name="job_titles",
            pg_type="text",
            is_pk=False,
            comment="Title of employee at the time when the data was updated.",
        ),
        TableColumn(ordinal=6,
                    name="typical_hours",
                    pg_type="numeric",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(ordinal=7,
                    name="annual_salary",
                    pg_type="numeric",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(ordinal=8,
                    name=_long_name_col_sg,
                    pg_type="text",
                    is_pk=False,
                    comment=mock.ANY),
        TableColumn(
            ordinal=9,
            name="department",
            pg_type="text",
            is_pk=False,
            comment="Department where employee worked.",
        ),
    ]

    assert local_engine_empty.run_sql(
        "SELECT option_value FROM information_schema.foreign_table_options "
        "WHERE foreign_table_name = 'some_table' "
        "AND foreign_table_schema = 'test/pg_mount' "
        "AND option_name = 'column_map'") == [
            (f'{{"{_long_name_col_sg}": "{_long_name_col}"}}', )
        ]
Code example #30
File: mount.py  Project: splitgraph/splitgraph
from typing import Dict, Optional

from splitgraph.core.types import (
    TableColumn,
    TableInfo,
    IntrospectionResult,
)

# Define the schema of the foreign table we wish to create
# We're only going to be fetching stories, so limit the columns to the ones that
# show up for stories. See https://github.com/HackerNews/API for reference.
from splitgraph.hooks.data_source.fdw import ForeignDataWrapperDataSource

_story_schema_spec = [
    TableColumn(1, "id", "integer", True),
    TableColumn(2, "by", "text", False),
    TableColumn(3, "time", "integer", False),
    TableColumn(4, "title", "text", False),
    TableColumn(5, "url", "text", False),
    TableColumn(6, "text", "text", False),
    TableColumn(7, "score", "integer", False),
    TableColumn(8, "kids", "integer[]", False),
    TableColumn(9, "descendants", "integer", False),
]

_all_endpoints = [
    "topstories",
    "newstories",
    "beststories",
    "askstories",
    "showstories",