def test_csv_mac_newlines():
    # Test a CSV file with old Mac-style newlines (\r)
    with open(os.path.join(INGESTION_RESOURCES_CSV, "mac_newlines.csv"), "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "utf-8"
        assert options.header is True

        data = list(reader)
        assert len(data) == 5
        assert data[0] == ["fruit_id", "timestamp", "name"]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
            TableColumn(ordinal=2, name="timestamp", pg_type="timestamp", is_pk=False, comment=None),
            TableColumn(ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None),
        ]
def test_table_schema_params_to_dict():
    assert table_schema_params_to_dict(
        {
            "fruits": (
                [
                    TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
                    TableColumn(ordinal=2, name="name", pg_type="character varying", is_pk=False, comment=None),
                ],
                {"key": "value"},
            ),
            "vegetables": (
                [
                    TableColumn(ordinal=1, name="vegetable_id", pg_type="integer", is_pk=False, comment=None),
                    TableColumn(ordinal=2, name="name", pg_type="character varying", is_pk=False, comment=None),
                ],
                {"key": "value"},
            ),
        }
    ) == {
        "fruits": {
            "schema": {"fruit_id": "integer", "name": "character varying"},
            "options": {"key": "value"},
        },
        "vegetables": {
            "schema": {"name": "character varying", "vegetable_id": "integer"},
            "options": {"key": "value"},
        },
    }
def test_socrata_column_deduplication():
    assert dedupe_sg_schema(
        [
            TableColumn(1, "normal_col", "some_type", True),
            TableColumn(
                2,
                "long_col_but_not_unique_until_the_59th_char_somewhere_there_yep_this_is_different",
                "some_type",
                False,
            ),
            TableColumn(3, "long_col_but_still_unique" * 3, "some_type", False),
            TableColumn(
                4,
                "long_col_but_not_unique_until_the_59th_char_somewhere_there_and_this_is_even_more_so",
                "some_type",
                False,
            ),
            TableColumn(
                5,
                "long_col_but_not_unique_until_the_59th_char_somewhere_there_and_wow_yep_were_done",
                "some_type",
                False,
            ),
        ]
    ) == [
        TableColumn(ordinal=1, name="normal_col", pg_type="some_type", is_pk=True, comment=None),
        TableColumn(
            ordinal=2,
            name="long_col_but_not_unique_until_the_59th_char_somewhere_there_000",
            pg_type="some_type",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=3,
            name="long_col_but_still_uniquelong_col_but_still_uniquelong_col_but_",
            pg_type="some_type",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=4,
            name="long_col_but_not_unique_until_the_59th_char_somewhere_there_001",
            pg_type="some_type",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=5,
            name="long_col_but_not_unique_until_the_59th_char_somewhere_there_002",
            pg_type="some_type",
            is_pk=False,
            comment=None,
        ),
    ]
def socrata_to_sg_schema(metadata: Dict[str, Any]) -> Tuple[TableSchema, Dict[str, str]]:
    try:
        col_names = metadata["resource"]["columns_field_name"]
        col_types = metadata["resource"]["columns_datatype"]
    except KeyError:
        raise ValueError("Invalid Socrata metadata!")

    col_desc = metadata["resource"].get("columns_description") or [None] * len(col_names)

    # Prepend the Socrata :id column that we can order on and use as PK.
    col_names = [":id"] + col_names
    col_types = ["text"] + col_types
    col_desc = ["Socrata column ID"] + col_desc

    result = [
        TableColumn(i, n, _socrata_to_pg_type(t), False, d)
        for i, (n, t, d) in enumerate(zip(col_names, col_types, col_desc))
    ]

    # Truncate Socrata column names to 63 characters and calculate
    # a map of Splitgraph columns to Socrata columns.
    result_deduped = dedupe_sg_schema(result)
    sg_to_socrata_cols = {
        d.name: r.name for r, d in zip(result, result_deduped) if d.name != r.name
    }

    return result_deduped, sg_to_socrata_cols
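# A minimal sketch of the metadata shape socrata_to_sg_schema() expects; the field
# values below are illustrative, not from a real Socrata discovery response.
_example_metadata = {
    "resource": {
        "columns_field_name": ["employee_name", "annual_salary"],
        "columns_datatype": ["text", "number"],
        "columns_description": ["Employee name", "Salary in USD"],
    }
}

# Yields three columns (":id" prepended, then the two dataset columns) and a map of
# any truncated/deduplicated Splitgraph column names back to the Socrata originals
# (empty here, since both names fit into PostgreSQL's 63-character limit).
_example_schema, _sg_to_socrata_cols = socrata_to_sg_schema(_example_metadata)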
def infer_sg_schema(
    sample: List[Tuple[str, ...]],
    override_types: Optional[Dict[str, str]] = None,
    primary_keys: Optional[List[str]] = None,
):
    override_types = override_types or {}
    primary_keys = primary_keys or []
    result: TableSchema = []

    header = sample[0]
    columns = list(zip(*sample[1:]))
    if len(columns) != len(header):
        raise ValueError(
            "Malformed CSV: header has %d columns, rows have %d columns"
            % (len(header), len(columns))
        )

    for i, (c_name, c_sample) in enumerate(zip(header, columns)):
        pg_type = override_types.get(c_name, _infer_column_schema(c_sample))
        result.append(
            TableColumn(
                ordinal=i + 1,
                name=c_name,
                pg_type=pg_type,
                is_pk=(c_name in primary_keys),
            )
        )

    return result
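# A short usage sketch for infer_sg_schema(); the sample values are illustrative.
# The first row is the header and every other row is a tuple of raw strings.
_sample = [
    ("fruit_id", "timestamp", "name"),
    ("1", "2018-01-01 00:11:11", "apple"),
    ("2", "2018-01-02 00:22:22", "orange"),
]

# "fruit_id" gets an integer-like type from _infer_column_schema and is marked as the PK.
_inferred = infer_sg_schema(_sample, primary_keys=["fruit_id"])

# override_types pins a column to an explicit PostgreSQL type instead of inferring it.
_overridden = infer_sg_schema(_sample, override_types={"timestamp": "text"})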
def get_full_table_schema(self, schema: str, table_name: str) -> "TableSchema":
    """
    Generates a list of (column ordinal, name, data type, is_pk, column comment),
    used to detect schema changes like columns being dropped/added/renamed or type changes.

    NB this doesn't work for temporary tables (pg_temp) and returns an empty schema.
    """
    assert schema != "pg_temp"
    results = self.run_sql(
        SQL(
            "SELECT c.attnum, c.attname, "
            "pg_catalog.format_type(c.atttypid, c.atttypmod), "
            "col_description('{}.{}'::regclass, c.attnum) "
            "FROM pg_attribute c JOIN pg_class t ON c.attrelid = t.oid "
            "JOIN pg_namespace n ON t.relnamespace = n.oid "
            "WHERE n.nspname = %s AND t.relname = %s AND NOT c.attisdropped "
            "AND c.attnum >= 0 ORDER BY c.attnum"
        ).format(Identifier(schema), Identifier(table_name)),
        (schema, table_name),
    )

    def _convert_type(ctype):
        # We don't keep a lot of type information, so e.g. char(5) gets turned into char
        # which defaults into char(1).
        return ctype if ctype != "character" else "character varying"

    # Do we need to make sure the PK has the same type + ordinal position here?
    pks = [pk for pk, _ in self.get_primary_keys(schema, table_name)]
    return [
        TableColumn(o, n, _convert_type(dt), (n in pks), c) for o, n, dt, c in results
    ]
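# Usage sketch (the schema and table names below are illustrative): against a mounted
# or checked-out table, this returns one TableColumn per attribute, e.g.
#
#   engine.get_full_table_schema("test/pg_mount", "fruits")
#   -> [TableColumn(1, "fruit_id", "integer", True, None),
#       TableColumn(2, "name", "character varying", False, None)]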
def test_schema_changes(pg_repo_local, test_case):
    action, expected_new_schema = test_case

    assert (
        pg_repo_local.engine.get_full_table_schema(pg_repo_local.to_schema(), "fruits")
        == OLD_SCHEMA
    )

    pg_repo_local.run_sql(action)
    pg_repo_local.commit_engines()
    assert (
        pg_repo_local.engine.get_full_table_schema(pg_repo_local.to_schema(), "fruits")
        == expected_new_schema
    )

    head = pg_repo_local.head
    new_head = pg_repo_local.commit()

    # Test that the new image was stored as a new object with the new schema.
    assert len(new_head.get_table("fruits").objects) == 1
    new_snap = new_head.get_table("fruits").objects[0]
    assert pg_repo_local.engine.get_object_schema(new_snap) == _drop_comments(
        expected_new_schema
        + [TableColumn(expected_new_schema[-1].ordinal + 1, SG_UD_FLAG, "boolean", False)]
    )

    head.checkout()
    assert (
        pg_repo_local.engine.get_full_table_schema(pg_repo_local.to_schema(), "fruits")
        == OLD_SCHEMA
    )

    new_head.checkout()
    assert pg_repo_local.engine.get_full_table_schema(
        pg_repo_local.to_schema(), "fruits"
    ) == _reassign_ordinals(expected_new_schema)
def test_mount_rename_table(local_engine_empty):
    tables = {
        "fruits_renamed": (
            [
                TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
                TableColumn(ordinal=2, name="name", pg_type="character varying", is_pk=False, comment=None),
            ],
            {"table_name": "fruits"},
        )
    }
    handler = PostgreSQLDataSource(
        engine=local_engine_empty,
        credentials={"username": "******", "password": "******"},
        params={"host": "pgorigin", "port": 5432, "dbname": "origindb", "remote_schema": "public"},
        tables=tables,
    )

    preview = handler.preview(tables)
    assert preview == {
        "fruits_renamed": [
            {"fruit_id": 1, "name": "apple"},
            {"fruit_id": 2, "name": "orange"},
        ],
    }
def get_table_schema(self, table_name, table_schema):
    # Add the "_id" column to the schema if it's not already there.
    if any(c.name == "_id" for c in table_schema):
        return table_schema

    return table_schema + [
        TableColumn(table_schema[-1].ordinal + 1, "_id", "NAME", False)
    ]
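# Behavior sketch for get_table_schema() with a hypothetical one-column input schema:
#
#   [TableColumn(1, "value", "text", False)]
#   -> [TableColumn(1, "value", "text", False),
#       TableColumn(2, "_id", "NAME", False)]
#
# A schema that already contains an "_id" column is returned unchanged.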
def test_csv_dialect_encoding_inference():
    # Test CSV dialect inference with:
    #   - win-1252 encoding (will autodetect with chardet)
    #   - Windows line endings
    #   - different separator
    #   - first column name missing
    with open(os.path.join(INGESTION_RESOURCES_CSV, "encoding-win-1252.csv"), "rb") as f:
        options = CSVOptions()
        options, reader = make_csv_reader(f, options)

        assert options.encoding == "Windows-1252"
        assert options.header is True
        # NB we don't extract everything from the sniffed dialect, just the delimiter and the
        # quotechar. The sniffer also returns doublequote and skipinitialspace.
        assert options.delimiter == ";"

        data = list(reader)
        assert data == [
            ["", "DATE", "TEXT"],
            ["1", "01/07/2021", "Pañamao"],
            ["2", "06/11/2018", "–"],
            ["3", "28/05/2018", "División"],
        ]

        schema = generate_column_names(infer_sg_schema(data))
        assert schema == [
            TableColumn(ordinal=1, name="col_1", pg_type="integer", is_pk=False, comment=None),
            TableColumn(ordinal=2, name="DATE", pg_type="character varying", is_pk=False, comment=None),
            TableColumn(ordinal=3, name="TEXT", pg_type="character varying", is_pk=False, comment=None),
        ]
def test_mount_elasticsearch(local_engine_empty):
    # No ES running in this stack: this is just a test that we can instantiate the FDW.
    repo = Repository("test", "es_mount")
    try:
        mount(
            repo.to_schema(),
            "elasticsearch",
            dict(
                username=None,
                password=None,
                server="elasticsearch",
                port=9200,
                table_spec={
                    "table_1": {
                        "schema": {
                            "id": "text",
                            "@timestamp": "timestamp",
                            "query": "text",
                            "col_1": "text",
                            "col_2": "boolean",
                        },
                        "index": "index-pattern*",
                        "rowid_column": "id",
                        "query_column": "query",
                    }
                },
            ),
        )

        assert get_engine().get_full_table_schema(repo.to_schema(), "table_1") == [
            TableColumn(ordinal=1, name="id", pg_type="text", is_pk=False, comment=None),
            TableColumn(
                ordinal=2,
                name="@timestamp",
                pg_type="timestamp without time zone",
                is_pk=False,
                comment=None,
            ),
            TableColumn(ordinal=3, name="query", pg_type="text", is_pk=False, comment=None),
            TableColumn(ordinal=4, name="col_1", pg_type="text", is_pk=False, comment=None),
            TableColumn(ordinal=5, name="col_2", pg_type="boolean", is_pk=False, comment=None),
        ]
    finally:
        repo.delete()
def test_pandas_basic_insert(ingestion_test_repo):
    df_to_table(base_df, ingestion_test_repo, "test_table", if_exists="patch")
    ingestion_test_repo.commit()

    assert ingestion_test_repo.head.get_table("test_table").table_schema == [
        TableColumn(1, "fruit_id", "bigint", True),
        TableColumn(2, "timestamp", "timestamp without time zone", False),
        TableColumn(3, "name", "text", False),
    ]

    assert ingestion_test_repo.run_sql(
        "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id"
    ) == [
        (1, dt(2018, 1, 1, 0, 11, 11), "apple"),
        (2, dt(2018, 1, 2, 0, 22, 22), "orange"),
        (3, dt(2018, 1, 3, 0, 33, 33), "mayonnaise"),
        (4, dt(2018, 1, 4, 0, 44, 44), "mustard"),
    ]
def dedupe_sg_schema(schema_spec: TableSchema, prefix_len: int = 59) -> TableSchema:
    """
    Some Socrata schemas have columns that are longer than 63 characters
    where the first 63 characters are the same between several columns
    (e.g. odn.data.socrata.com). This routine renames columns in a schema
    to make sure this can't happen (by giving duplicates a number suffix).
    """
    # We truncate the column name to 59 characters to leave space for the underscore
    # and 3 digits (max PG identifier is 63 chars).
    prefix_counts: Dict[str, int] = {}
    columns_nums: List[Tuple[str, int]] = []

    for column in schema_spec:
        column_short = column.name[:prefix_len]
        count = prefix_counts.get(column_short, 0)
        columns_nums.append((column_short, count))
        prefix_counts[column_short] = count + 1

    result = []
    for (_, position), column in zip(columns_nums, schema_spec):
        column_short = column.name[:prefix_len]
        count = prefix_counts[column_short]
        if count > 1:
            result.append(
                TableColumn(
                    column.ordinal,
                    f"{column_short}_{position:03d}",
                    column.pg_type,
                    column.is_pk,
                    column.comment,
                )
            )
        else:
            result.append(
                TableColumn(
                    column.ordinal,
                    column.name[:POSTGRES_MAX_IDENTIFIER],
                    column.pg_type,
                    column.is_pk,
                    column.comment,
                )
            )
    return result
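# A quick demonstration of the deduplication behavior (column names are synthetic):
# both names below share the same first 59 characters, so each gets a "_NNN" suffix,
# keeping the results unique within PostgreSQL's 63-character identifier limit.
_demo = dedupe_sg_schema(
    [
        TableColumn(1, "x" * 70, "text", False),
        TableColumn(2, "x" * 70 + "_different_only_past_the_prefix", "text", False),
    ]
)
assert [c.name for c in _demo] == ["x" * 59 + "_000", "x" * 59 + "_001"]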
def test_pandas_no_processing_insert(ingestion_test_repo):
    # Make sure everything still works when we don't have a PK.
    df = pd.read_csv(os.path.join(INGESTION_RESOURCES_CSV, "base_df.csv"))

    df_to_table(df, ingestion_test_repo, "test_table")
    ingestion_test_repo.commit()

    assert ingestion_test_repo.head.get_table("test_table").table_schema == [
        TableColumn(1, "fruit_id", "bigint", False),
        TableColumn(2, "timestamp", "text", False),
        TableColumn(3, "name", "text", False),
    ]

    assert ingestion_test_repo.run_sql(
        "SELECT fruit_id, timestamp, name FROM test_table ORDER BY fruit_id"
    ) == [
        (1, "2018-01-01 00:11:11", "apple"),
        (2, "2018-01-02 00:22:22", "orange"),
        (3, "2018-01-03 00:33:33", "mayonnaise"),
        (4, "2018-01-04 00:44:44", "mustard"),
    ]
def _mount_mysql(repository):
    # We don't use this one in tests beyond basic mounting, so no point importing it.
    mount(
        repository.to_schema(),
        "mysql_fdw",
        dict(
            host="mysqlorigin",
            port=3306,
            username="******",
            password="******",
            dbname="mysqlschema",
        ),
        tables={
            "mushrooms": (
                [
                    TableColumn(1, "mushroom_id", "integer", False),
                    TableColumn(2, "name", "character varying (20)", False),
                    TableColumn(3, "discovery", "timestamp", False),
                    TableColumn(4, "friendly", "boolean", False),
                    TableColumn(5, "binary_data", "bytea", False),
                    TableColumn(6, "varbinary_data", "bytea", False),
                ],
                {},
            )
        },
    )
def test_dict_to_table_schema_params():
    assert dict_to_table_schema_params(
        {
            k: ExternalTableRequest.parse_obj(v)
            for k, v in {
                "fruits": {
                    "schema": {"fruit_id": "integer", "name": "character varying"},
                    "options": {"key": "value"},
                },
                "vegetables": {
                    "schema": {"name": "character varying", "vegetable_id": "integer"},
                    "options": {"key": "value"},
                },
            }.items()
        }
    ) == {
        "fruits": (
            [
                TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
                TableColumn(ordinal=2, name="name", pg_type="character varying", is_pk=False, comment=None),
            ],
            {"key": "value"},
        ),
        "vegetables": (
            [
                TableColumn(ordinal=1, name="name", pg_type="character varying", is_pk=False, comment=None),
                TableColumn(ordinal=2, name="vegetable_id", pg_type="integer", is_pk=False, comment=None),
            ],
            {"key": "value"},
        ),
    }
def test_mount_force_schema(local_engine_empty):
    _mount_postgres(
        PG_MNT, tables={"fruits": {"schema": {"fruit_id": "character varying"}}}
    )
    assert get_engine().table_exists(PG_MNT.to_schema(), "fruits")
    assert get_engine().get_full_table_schema(PG_MNT.to_schema(), "fruits") == [
        TableColumn(1, "fruit_id", "character varying", False, None)
    ]
def generate_table(repository: "Repository", table_name: str, size: int) -> None:
    """
    Creates a table with an integer primary key and a string value.

    :param repository: Checked-out Repository to create the table in.
    :param table_name: Name of the table to generate.
    :param size: Number of rows in the table.
    """
    from psycopg2.sql import SQL, Identifier

    repository.engine.create_table(
        repository.to_schema(),
        table_name,
        [
            TableColumn(1, "key", "integer", True, "Some key"),
            TableColumn(2, "value", "varchar", False, "Some value"),
        ],
    )
    repository.engine.run_sql_batch(
        SQL("INSERT INTO {} VALUES (%s, %s)").format(Identifier(table_name)),
        [(i, _hash(i)) for i in range(size)],
        schema=repository.to_schema(),
    )
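# Hypothetical usage sketch: initialize a repository, then fill a table with 1,000
# deterministic rows (the repository and table names here are illustrative).
from splitgraph.core.repository import Repository

repo = Repository("test", "generated")
repo.init()
generate_table(repo, "values", size=1000)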
def socrata_to_sg_schema(metadata: Dict[str, Any]) -> TableSchema:
    try:
        col_names = metadata["resource"]["columns_field_name"]
        col_types = metadata["resource"]["columns_datatype"]
    except KeyError:
        raise ValueError("Invalid Socrata metadata!")

    col_desc = metadata["resource"].get("columns_description") or [None] * len(col_names)

    # Prepend the Socrata :id column that we can order on and use as PK.
    col_names = [":id"] + col_names
    col_types = ["text"] + col_types
    col_desc = ["Socrata column ID"] + col_desc

    result = [
        TableColumn(i, n, _socrata_to_pg_type(t), False, d)
        for i, (n, t, d) in enumerate(zip(col_names, col_types, col_desc))
    ]

    return dedupe_sg_schema(result)
def test_singer_ingestion_schema_change(local_engine_empty):
    # Run the initial ingestion and then another one where we've changed the user_id in
    # stargazers to be a string.
    runner = CliRunner(mix_stderr=False)

    with open(os.path.join(INGESTION_RESOURCES, "singer/initial.json"), "r") as f:
        result = runner.invoke(
            singer_target, [TEST_REPO + ":latest"], input=f, catch_exceptions=False
        )
    assert result.exit_code == 0

    with open(os.path.join(INGESTION_RESOURCES, "singer/schema_change.json"), "r") as f:
        result = runner.invoke(
            singer_target, [TEST_REPO + ":latest"], input=f, catch_exceptions=False
        )
    assert result.exit_code == 0
    assert json.loads(result.stdout) == {
        "bookmarks": {
            "splitgraph/splitgraph": {
                "stargazers": {"since": "2020-10-14T11:06:42.565793Z"},
            }
        }
    }

    repo = Repository.from_schema(TEST_REPO)
    assert len(repo.images()) == 2
    image = repo.images["latest"]
    assert sorted(image.get_tables()) == ["releases", "stargazers"]
    image.checkout()

    assert repo.run_sql("SELECT COUNT(1) FROM releases", return_shape=ResultShape.ONE_ONE) == 6
    assert repo.run_sql("SELECT COUNT(1) FROM stargazers", return_shape=ResultShape.ONE_ONE) == 6
    assert repo.run_sql("SELECT user_id, starred_at FROM stargazers ORDER BY user_id") == [
        ("100001", datetime(2018, 10, 17, 22, 14, 12)),
        ("100002", datetime(2018, 11, 6, 11, 26, 16)),
        ("100003", datetime(2018, 12, 11, 16, 0, 42)),
        ("100004", datetime(2020, 10, 11, 21, 9, 30)),
        ("100005", datetime(2019, 4, 18, 2, 40, 47)),
        ("string_user_id", datetime(2019, 4, 18, 2, 40, 47)),
    ]

    # Releases are unchanged: same table schema, same object.
    assert image.get_table("releases").table_schema == _RELEASES_SCHEMA
    assert image.get_table("releases").objects == [
        "o160e0b0db4ad7e7eb7c4db26bf8183461f65968be64b8594c7cc71fbf5ff2a"
    ]

    assert image.get_table("stargazers").table_schema == [
        TableColumn(
            ordinal=0,
            name="_sdc_repository",
            pg_type="character varying",
            is_pk=False,
            comment=None,
        ),
        TableColumn(
            ordinal=1,
            name="starred_at",
            pg_type="timestamp without time zone",
            is_pk=False,
            comment=None,
        ),
        TableColumn(ordinal=2, name="user", pg_type="jsonb", is_pk=False, comment=None),
        TableColumn(
            ordinal=3, name="user_id", pg_type="character varying", is_pk=True, comment=None
        ),
    ]

    # Stargazers had a schema migration, so they point to a new object.
    assert image.get_table("stargazers").objects == [
        "o9e54958076c86d854ad21da17239daecaec839e84daee8ff9ca5dcecd84cdd"
    ]
def test_singer_tap_mysql_introspection(local_engine_empty):
    source = _source(local_engine_empty)
    assert source.introspect() == {
        "mushrooms": (
            [
                TableColumn(
                    ordinal=0,
                    name="discovery",
                    pg_type="timestamp without time zone",
                    is_pk=False,
                    comment=None,
                ),
                TableColumn(ordinal=1, name="friendly", pg_type="boolean", is_pk=False, comment=None),
                TableColumn(ordinal=2, name="mushroom_id", pg_type="integer", is_pk=True, comment=None),
                TableColumn(ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None),
            ],
            {},
        )
    }

    singer_config = source.get_singer_config()
    assert singer_config == {
        "host": "localhost",
        "password": "******",
        "port": 3306,
        "replication_method": "INCREMENTAL",
        "user": "******",
    }

    # Binary datatypes aren't supported by the Singer tap, but we make sure it's aware
    # of them (and reports them as unsupported).
    singer_catalog = source._run_singer_discovery(singer_config)
    assert singer_catalog == {
        "streams": [
            {
                "metadata": mock.ANY,
                "schema": {
                    "properties": {
                        "discovery": {
                            "format": "date-time",
                            "inclusion": "available",
                            "type": ["null", "string"],
                        },
                        "friendly": {"inclusion": "available", "type": ["null", "boolean"]},
                        "mushroom_id": {
                            "inclusion": "automatic",
                            "maximum": 2147483647,
                            "minimum": -2147483648,
                            "type": ["null", "integer"],
                        },
                        "name": {
                            "inclusion": "available",
                            "maxLength": 20,
                            "type": ["null", "string"],
                        },
                        "binary_data": {
                            "description": "Unsupported column type binary(7)",
                            "inclusion": "unsupported",
                        },
                        "varbinary_data": {
                            "description": "Unsupported column type varbinary(16)",
                            "inclusion": "unsupported",
                        },
                    },
                    "type": "object",
                },
                "stream": "mushrooms",
                "table_name": "mushrooms",
                "tap_stream_id": "mysqlschema-mushrooms",
            }
        ]
    }

    assert sorted(singer_catalog["streams"][0]["metadata"], key=lambda m: m["breadcrumb"]) == [
        {
            "breadcrumb": [],
            "metadata": {
                "database-name": "mysqlschema",
                "is-view": False,
                "row-count": 2,
                "selected-by-default": False,
                "table-key-properties": ["mushroom_id"],
            },
        },
        {
            "breadcrumb": ["properties", "binary_data"],
            "metadata": {"selected-by-default": False, "sql-datatype": "binary(7)"},
        },
        {
            "breadcrumb": ["properties", "discovery"],
            "metadata": {"selected-by-default": True, "sql-datatype": "datetime"},
        },
        {
            "breadcrumb": ["properties", "friendly"],
            "metadata": {"selected-by-default": True, "sql-datatype": "tinyint(1)"},
        },
        {
            "breadcrumb": ["properties", "mushroom_id"],
            "metadata": {"selected-by-default": True, "sql-datatype": "int(11)"},
        },
        {
            "breadcrumb": ["properties", "name"],
            "metadata": {"selected-by-default": True, "sql-datatype": "varchar(20)"},
        },
        {
            "breadcrumb": ["properties", "varbinary_data"],
            "metadata": {"selected-by-default": False, "sql-datatype": "varbinary(16)"},
        },
    ]

    selected_catalog = source.build_singer_catalog(singer_catalog, tables=None)
    assert select_breadcrumb(selected_catalog["streams"][0], []) == {
        "database-name": "mysqlschema",
        "is-view": False,
        "replication-key": "mushroom_id",
        "replication-method": "INCREMENTAL",
        "row-count": 2,
        "selected": True,
        "selected-by-default": False,
        "table-key-properties": ["mushroom_id"],
    }
def test_mount_introspection_preview(local_engine_empty):
    handler = PostgreSQLDataSource(
        engine=local_engine_empty,
        credentials={"username": "******", "password": "******"},
        params={"host": "pgorigin", "port": 5432, "dbname": "origindb", "remote_schema": "public"},
    )

    tables = handler.introspect()
    assert tables == {
        "fruits": (
            [
                TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
                TableColumn(ordinal=2, name="name", pg_type="character varying", is_pk=False, comment=None),
            ],
            {"schema_name": "public", "table_name": "fruits"},
        ),
        "vegetables": (
            [
                TableColumn(ordinal=1, name="vegetable_id", pg_type="integer", is_pk=False, comment=None),
                TableColumn(ordinal=2, name="name", pg_type="character varying", is_pk=False, comment=None),
            ],
            {"schema_name": "public", "table_name": "vegetables"},
        ),
    }

    preview = handler.preview(tables=tables)
    assert preview == {
        "fruits": [
            {"fruit_id": 1, "name": "apple"},
            {"fruit_id": 2, "name": "orange"},
        ],
        "vegetables": [
            {"name": "potato", "vegetable_id": 1},
            {"name": "carrot", "vegetable_id": 2},
        ],
    }
import os

from splitgraph.core.repository import Repository
from splitgraph.core.types import TableColumn
from splitgraph.engine import ResultShape
from splitgraph.ingestion.singer.commandline import singer_target
from splitgraph.ingestion.singer.data_source import GenericSingerDataSource, MySQLSingerDataSource
from splitgraph.ingestion.singer.db_sync import select_breadcrumb

TEST_REPO = "test/singer"
# INGESTION_RESOURCES is assumed to come from the test suite's shared fixtures.
TEST_TAP = os.path.join(INGESTION_RESOURCES, "singer/fake_tap.py")

_STARGAZERS_SCHEMA = [
    TableColumn(
        ordinal=0,
        name="_sdc_repository",
        pg_type="character varying",
        is_pk=False,
        comment=None,
    ),
    TableColumn(
        ordinal=1,
        name="starred_at",
        pg_type="timestamp without time zone",
        is_pk=False,
        comment=None,
    ),
    TableColumn(ordinal=2, name="user", pg_type="jsonb", is_pk=False, comment=None),
def _get_sg_schema(flattened_schema, primary_key) -> TableSchema:
    return [
        TableColumn(i, name, column_type(schema_property), name in primary_key, None)
        for i, (name, schema_property) in enumerate(flattened_schema.items())
    ]
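# An illustrative (not real) flattened Singer schema fragment: _get_sg_schema() pairs
# each property with an ordinal and uses column_type() (defined in this module) to map
# the JSON Schema definition to a PostgreSQL type, marking PK columns along the way.
_example_flattened = {
    "id": {"type": ["null", "integer"]},
    "name": {"type": ["null", "string"]},
}
_example_sg_schema = _get_sg_schema(_example_flattened, primary_key=["id"])
# -> ordinals 0 and 1, with "id" flagged as the primary key.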
def test_socrata_mounting(local_engine_empty):
    with open(os.path.join(INGESTION_RESOURCES, "socrata/find_datasets.json"), "r") as f:
        socrata_meta = json.load(f)

    socrata = MagicMock(spec=Socrata)
    socrata.datasets.return_value = socrata_meta

    with mock.patch("sodapy.Socrata", return_value=socrata):
        mount_socrata(
            "test/pg_mount",
            None,
            None,
            None,
            None,
            "example.com",
            {"some_table": "xzkq-xp2w"},
            "some_token",
        )

    assert local_engine_empty.get_full_table_schema("test/pg_mount", "some_table") == [
        TableColumn(ordinal=1, name=":id", pg_type="text", is_pk=False, comment="Socrata column ID"),
        TableColumn(
            ordinal=2,
            name="full_or_part_time",
            pg_type="text",
            is_pk=False,
            comment="Whether the employee was employed full- (F) or part-time (P).",
        ),
        TableColumn(ordinal=3, name="hourly_rate", pg_type="numeric", is_pk=False, comment=mock.ANY),
        TableColumn(ordinal=4, name="salary_or_hourly", pg_type="text", is_pk=False, comment=mock.ANY),
        TableColumn(
            ordinal=5,
            name="job_titles",
            pg_type="text",
            is_pk=False,
            comment="Title of employee at the time when the data was updated.",
        ),
        TableColumn(ordinal=6, name="typical_hours", pg_type="numeric", is_pk=False, comment=mock.ANY),
        TableColumn(ordinal=7, name="annual_salary", pg_type="numeric", is_pk=False, comment=mock.ANY),
        TableColumn(ordinal=8, name="name", pg_type="text", is_pk=False, comment=mock.ANY),
        TableColumn(
            ordinal=9,
            name="department",
            pg_type="text",
            is_pk=False,
            comment="Department where employee worked.",
        ),
    ]
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Dict

from splitgraph.core.types import (
    TableColumn,
    Credentials,
    Params,
    TableInfo,
    SyncState,
    MountError,
    IntrospectionResult,
)
from splitgraph.engine import ResultShape

if TYPE_CHECKING:
    from splitgraph.engine.postgres.engine import PostgresEngine
    from splitgraph.core.repository import Repository

INGESTION_STATE_TABLE = "_sg_ingestion_state"
INGESTION_STATE_SCHEMA = [
    TableColumn(1, "timestamp", "timestamp", True, None),
    TableColumn(2, "state", "json", False, None),
]


class DataSource(ABC):
    params_schema: Dict[str, Any]
    credentials_schema: Dict[str, Any]
    table_params_schema: Dict[str, Any]

    supports_mount = False
    supports_sync = False
    supports_load = False

    @classmethod
    @abstractmethod
def test_csv_data_source_s3(local_engine_empty):
    source = CSVDataSource(
        local_engine_empty,
        credentials={
            "s3_access_key": "minioclient",
            "s3_secret_key": "supersecure",
        },
        params={
            "s3_endpoint": "objectstorage:9000",
            "s3_secure": False,
            "s3_bucket": "test_csv",
            "s3_object_prefix": "some_prefix/",
        },
    )

    schema = source.introspect()
    assert len(schema.keys()) == 4
    assert schema["fruits.csv"] == (
        [
            TableColumn(ordinal=1, name="fruit_id", pg_type="integer", is_pk=False, comment=None),
            TableColumn(
                ordinal=2,
                name="timestamp",
                pg_type="timestamp without time zone",
                is_pk=False,
                comment=None,
            ),
            TableColumn(ordinal=3, name="name", pg_type="character varying", is_pk=False, comment=None),
            TableColumn(ordinal=4, name="number", pg_type="integer", is_pk=False, comment=None),
            TableColumn(ordinal=5, name="bignumber", pg_type="bigint", is_pk=False, comment=None),
            TableColumn(ordinal=6, name="vbignumber", pg_type="numeric", is_pk=False, comment=None),
        ],
        {
            "s3_object": "some_prefix/fruits.csv",
            "autodetect_dialect": False,
            "autodetect_encoding": False,
            "autodetect_header": False,
            "delimiter": ",",
            "encoding": "utf-8",
            "header": True,
            "quotechar": '"',
        },
    )
    assert schema["encoding-win-1252.csv"] == (
        [
            TableColumn(ordinal=1, name="col_1", pg_type="integer", is_pk=False, comment=None),
            TableColumn(ordinal=2, name="DATE", pg_type="character varying", is_pk=False, comment=None),
            TableColumn(ordinal=3, name="TEXT", pg_type="character varying", is_pk=False, comment=None),
        ],
        {
            "s3_object": "some_prefix/encoding-win-1252.csv",
            "autodetect_dialect": False,
            "autodetect_encoding": False,
            "autodetect_header": False,
            "delimiter": ";",
            "encoding": "Windows-1252",
            "header": True,
            "quotechar": '"',
        },
    )
    assert len(schema["rdu-weather-history.csv"][0]) == 28
    assert schema["not_a_csv.txt"] == MountError(
        table_name="not_a_csv.txt",
        error="ValueError",
        error_text="Malformed CSV: header has 7 columns, rows have 0 columns",
    )

    schema = unwrap(schema)[0]

    # Add a nonexistent file to the schema with malformed params to check preview error reporting.
    schema["doesnt_exist"] = ([], {"s3_object": "doesnt_exist"})
    schema["exists_but_broken"] = (
        # Force a schema that doesn't work for this CSV
        [TableColumn(1, "col_1", "date", False)],
        {"s3_object": "some_prefix/fruits.csv"},
    )

    preview = source.preview(schema)
    assert len(preview.keys()) == 5
    assert len(preview["fruits.csv"]) == 4
    assert len(preview["encoding-win-1252.csv"]) == 3
    assert len(preview["rdu-weather-history.csv"]) == 10
    assert preview["doesnt_exist"] == MountError(
        table_name="doesnt_exist", error="minio.error.S3Error", error_text=mock.ANY
    )
    assert preview["exists_but_broken"] == MountError(
        table_name="exists_but_broken",
        error="psycopg2.errors.InvalidDatetimeFormat",
        error_text='invalid input syntax for type date: "1"',
    )

    try:
        source.mount("temp_data")

        assert local_engine_empty.run_sql('SELECT COUNT(1) FROM temp_data."fruits.csv"') == [(4,)]

        # Test NULL "inference" for numbers
        assert local_engine_empty.run_sql(
            'SELECT number FROM temp_data."fruits.csv"',
            return_shape=ResultShape.MANY_ONE,
        ) == [1, 2, None, 4]

        assert local_engine_empty.run_sql(
            'SELECT COUNT(1) FROM temp_data."rdu-weather-history.csv"'
        ) == [(4633,)]

        assert local_engine_empty.run_sql(
            'SELECT "TEXT" FROM temp_data."encoding-win-1252.csv"'
        ) == [("Pañamao",), ("–",), ("División",)]
    finally:
        local_engine_empty.delete_schema("temp_data")
def test_csv_data_source_multiple(local_engine_empty):
    # End-to-end version of test_csv_introspection_multiple to check things like table params
    # getting serialized and deserialized properly.
    url = MINIO.presigned_get_object("test_csv", "some_prefix/rdu-weather-history.csv")

    credentials = {
        "s3_access_key": "minioclient",
        "s3_secret_key": "supersecure",
    }
    params = {
        "s3_endpoint": "objectstorage:9000",
        "s3_secure": False,
        "s3_bucket": "test_csv",
        # Put this delimiter in as a canary to make sure table params override server params.
        "delimiter": ",",
    }
    tables = {
        # Pass an empty table schema to denote we want to introspect it
        "from_url": ([], {"url": url}),
        "from_s3_rdu": ([], {"s3_object": "some_prefix/rdu-weather-history.csv"}),
        "from_s3_encoding": ([], {"s3_object": "some_prefix/encoding-win-1252.csv"}),
        "from_url_broken": ([], {"url": "invalid_url"}),
        "from_s3_broken": ([], {"s3_object": "invalid_object"}),
    }

    source = CSVDataSource(local_engine_empty, credentials, params, tables)

    schema = source.introspect()
    assert schema == {
        "from_url": (
            mock.ANY,
            {
                "autodetect_dialect": False,
                "url": url,
                "quotechar": '"',
                "header": True,
                "encoding": "utf-8",
                "delimiter": ";",
                "autodetect_header": False,
                "autodetect_encoding": False,
            },
        ),
        "from_s3_rdu": (
            mock.ANY,
            {
                "encoding": "utf-8",
                "autodetect_dialect": False,
                "autodetect_encoding": False,
                "autodetect_header": False,
                "delimiter": ";",
                "header": True,
                "quotechar": '"',
                "s3_object": "some_prefix/rdu-weather-history.csv",
            },
        ),
        "from_s3_encoding": (
            mock.ANY,
            {
                "s3_object": "some_prefix/encoding-win-1252.csv",
                "quotechar": '"',
                "header": True,
                "encoding": "Windows-1252",
                "autodetect_dialect": False,
                "delimiter": ";",
                "autodetect_header": False,
                "autodetect_encoding": False,
            },
        ),
        "from_url_broken": MountError(
            table_name="from_url_broken",
            error="requests.exceptions.MissingSchema",
            error_text="Invalid URL 'invalid_url': No schema supplied. "
            "Perhaps you meant http://invalid_url?",
        ),
        "from_s3_broken": MountError(
            table_name="from_s3_broken",
            error="minio.error.S3Error",
            error_text=mock.ANY,
        ),
    }

    # Mount the datasets with this introspected schema.
    schema = unwrap(schema)[0]
    try:
        source.mount("temp_data", tables=schema)
        rows = local_engine_empty.run_sql("SELECT * FROM temp_data.from_s3_encoding")
        assert len(rows) == 3
        assert len(rows[0]) == 3
    finally:
        local_engine_empty.delete_schema("temp_data")

    # Override the delimiter and blank out the schema for a single table
    schema["from_s3_encoding"] = (
        [],
        {
            "s3_object": "some_prefix/encoding-win-1252.csv",
            "quotechar": '"',
            "header": True,
            "encoding": "Windows-1252",
            "autodetect_dialect": False,
            # We force a delimiter "," here, which will make the CSV a single-column one
            # (to test that we can actually override these).
            "delimiter": ",",
            "autodetect_header": False,
            "autodetect_encoding": False,
        },
    )

    # Reintrospect the source with the new table parameters
    source = CSVDataSource(local_engine_empty, credentials, params, schema)
    new_schema = source.introspect()
    assert len(new_schema) == 3

    # Check other tables are unchanged
    assert new_schema["from_url"] == schema["from_url"]
    assert new_schema["from_s3_rdu"] == schema["from_s3_rdu"]

    # The table with the changed separator now only has one column (since we use ","
    # as the delimiter instead of ";").
    assert new_schema["from_s3_encoding"][0] == [
        TableColumn(ordinal=1, name=";DATE;TEXT", pg_type="character varying", is_pk=False, comment=None)
    ]

    try:
        source.mount("temp_data", tables=new_schema)
        rows = local_engine_empty.run_sql("SELECT * FROM temp_data.from_s3_encoding")
        assert len(rows) == 3
        # Check we now get a single column
        assert rows[0] == ("1;01/07/2021;Pañamao",)
    finally:
        local_engine_empty.delete_schema("temp_data")
def test_socrata_mounting(local_engine_empty):
    with open(os.path.join(INGESTION_RESOURCES, "socrata/find_datasets.json"), "r") as f:
        socrata_meta = json.load(f)

    socrata = MagicMock(spec=Socrata)
    socrata.datasets.return_value = socrata_meta

    with mock.patch("sodapy.Socrata", return_value=socrata):
        mount(
            "test/pg_mount",
            "socrata",
            {
                "domain": "example.com",
                "tables": {"some_table": "xzkq-xp2w"},
                "app_token": "some_token",
            },
        )

    assert local_engine_empty.get_full_table_schema("test/pg_mount", "some_table") == [
        TableColumn(ordinal=1, name=":id", pg_type="text", is_pk=False, comment="Socrata column ID"),
        TableColumn(
            ordinal=2,
            name="full_or_part_time",
            pg_type="text",
            is_pk=False,
            comment="Whether the employee was employed full- (F) or part-time (P).",
        ),
        TableColumn(ordinal=3, name="hourly_rate", pg_type="numeric", is_pk=False, comment=mock.ANY),
        TableColumn(ordinal=4, name="salary_or_hourly", pg_type="text", is_pk=False, comment=mock.ANY),
        TableColumn(
            ordinal=5,
            name="job_titles",
            pg_type="text",
            is_pk=False,
            comment="Title of employee at the time when the data was updated.",
        ),
        TableColumn(ordinal=6, name="typical_hours", pg_type="numeric", is_pk=False, comment=mock.ANY),
        TableColumn(ordinal=7, name="annual_salary", pg_type="numeric", is_pk=False, comment=mock.ANY),
        TableColumn(ordinal=8, name=_long_name_col_sg, pg_type="text", is_pk=False, comment=mock.ANY),
        TableColumn(
            ordinal=9,
            name="department",
            pg_type="text",
            is_pk=False,
            comment="Department where employee worked.",
        ),
    ]

    assert local_engine_empty.run_sql(
        "SELECT option_value FROM information_schema.foreign_table_options "
        "WHERE foreign_table_name = 'some_table' "
        "AND foreign_table_schema = 'test/pg_mount' "
        "AND option_name = 'column_map'"
    ) == [(f'{{"{_long_name_col_sg}": "{_long_name_col}"}}',)]
from typing import Dict, Optional

from splitgraph.core.types import (
    TableColumn,
    TableInfo,
    IntrospectionResult,
)
from splitgraph.hooks.data_source.fdw import ForeignDataWrapperDataSource

# Define the schema of the foreign table we wish to create.
# We're only going to be fetching stories, so limit the columns to the ones that
# show up for stories. See https://github.com/HackerNews/API for reference.
_story_schema_spec = [
    TableColumn(1, "id", "integer", True),
    TableColumn(2, "by", "text", False),
    TableColumn(3, "time", "integer", False),
    TableColumn(4, "title", "text", False),
    TableColumn(5, "url", "text", False),
    TableColumn(6, "text", "text", False),
    TableColumn(7, "score", "integer", False),
    TableColumn(8, "kids", "integer[]", False),
    TableColumn(9, "descendants", "integer", False),
]

_all_endpoints = [
    "topstories",
    "newstories",
    "beststories",
    "askstories",
    "showstories",