def test_column_name_change(self):
    """Tests correct renaming of Snowflake columns after source change"""
    tap_lines_before_column_name_change = test_utils.get_test_tap_lines('messages-with-three-streams.json')
    tap_lines_after_column_name_change = test_utils.get_test_tap_lines(
        'messages-with-three-streams-modified-column.json')

    # Load with default settings
    self.persist_lines_with_cache(tap_lines_before_column_name_change)
    self.persist_lines_with_cache(tap_lines_after_column_name_change)

    # Get loaded rows from tables
    snowflake = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table_one = snowflake.query("SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema))
    table_two = snowflake.query("SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema))
    table_three = snowflake.query("SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema))

    # Get the previous column name from information schema in test_table_two
    previous_column_name = snowflake.query("""
        SELECT column_name
          FROM information_schema.columns
         WHERE table_catalog = '{}'
           AND table_schema = '{}'
           AND table_name = 'TEST_TABLE_TWO'
           AND ordinal_position = 1
        """.format(
        self.config.get('dbname', '').upper(),
        target_schema.upper()))[0]["COLUMN_NAME"]

    # Table one should have no changes
    self.assertEqual(
        table_one,
        [{'C_INT': 1, 'C_PK': 1, 'C_VARCHAR': '1'}])

    # Table two should have the versioned column
    self.assertEqual(
        table_two,
        [
            {previous_column_name: datetime.datetime(2019, 2, 1, 15, 12, 45), 'C_INT': 1, 'C_PK': 1,
             'C_VARCHAR': '1', 'C_DATE': None},
            {previous_column_name: datetime.datetime(2019, 2, 10, 2), 'C_INT': 2, 'C_PK': 2,
             'C_VARCHAR': '2', 'C_DATE': '2019-02-12 02:00:00'},
            {previous_column_name: None, 'C_INT': 3, 'C_PK': 3,
             'C_VARCHAR': '2', 'C_DATE': '2019-02-15 02:00:00'}
        ]
    )

    # Table three should have renamed columns
    self.assertEqual(
        table_three,
        [
            {'C_INT': 1, 'C_PK': 1, 'C_TIME': datetime.time(4, 0), 'C_VARCHAR': '1', 'C_TIME_RENAMED': None},
            {'C_INT': 2, 'C_PK': 2, 'C_TIME': datetime.time(7, 15), 'C_VARCHAR': '2', 'C_TIME_RENAMED': None},
            {'C_INT': 3, 'C_PK': 3, 'C_TIME': datetime.time(23, 0, 3), 'C_VARCHAR': '3',
             'C_TIME_RENAMED': datetime.time(8, 15)},
            {'C_INT': 4, 'C_PK': 4, 'C_TIME': None, 'C_VARCHAR': '4',
             'C_TIME_RENAMED': datetime.time(23, 0, 3)}
        ])
def test_nested_schema_unflattening(self):
    """Loading nested JSON objects with no props without flattening"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

    # Load with default settings - Flattening disabled
    self.persist_lines(tap_lines)

    # Get loaded rows from tables - Transform JSON to string at query time
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    unflattened_table = query(bigquery, """
        SELECT c_pk
              , c_array c_array
              , c_object c_object
              , c_object c_object_with_props
              , c_nested_object c_nested_object
          FROM {}.test_table_nested_schema
         ORDER BY c_pk""".format(target_schema))

    # Should be valid nested JSON strings
    self.assertEqual(
        unflattened_table,
        [{
            'c_pk': 1,
            'c_array': '[1, 2, 3]',
            'c_object': '{"key_1": "value_1"}',
            'c_object_with_props': '{"key_1": "value_1"}',
            'c_nested_object': {'nested_prop_1': 'nested_value_1',
                                'nested_prop_2': 'nested_value_2',
                                'nested_prop_3': {'multi_nested_prop_1': 'multi_value_1',
                                                  'multi_nested_prop_2': 'multi_value_2'}},
        }])
def test_flush_streams_with_no_intermediate_flushes(self, mock_emit_state):
    """Test emitting states when no intermediate flush is required"""
    mock_emit_state.get.return_value = None
    tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')

    # Set batch size big enough to never have to flush in the middle
    self.config['hard_delete'] = True
    self.config['batch_size_rows'] = 1000
    self.persist_lines(tap_lines)

    # State should be emitted only once, with the latest received STATE message
    self.assertEqual(
        mock_emit_state.mock_calls,
        [
            mock.call({"currently_syncing": None, "bookmarks": {
                "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
                                               "version": 1570922723596, "xmin": None},
                "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
                                             "version": 1570922723618, "xmin": None},
                "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
                                             "version": 1570922723635, "xmin": None},
                "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872,
                                             "version": 1570922723651, "xmin": None},
                "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id",
                                "version": 1570922723667, "replication_key_value": 4079},
                "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456,
                                   "xmin": None},
                "public2-wearehere": {}}})
        ])

    # Every table should be loaded correctly
    self.assert_logical_streams_are_in_bigquery(True)
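# Note: the mock_emit_state argument above is typically injected by a mock.patch decorator
# on the test method, along the lines of the sketch below. The exact patch target (module
# path of emit_state) is an assumption, not taken from this suite, and may differ here:
#
#     @mock.patch('target_bigquery.emit_state')
#     def test_flush_streams_with_no_intermediate_flushes(self, mock_emit_state):
#         ...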
def test_nested_schema_unflattening(self):
    """Loading nested JSON objects into VARIANT columns without flattening"""
    tap_lines = test_utils.get_test_tap_lines(
        'messages-with-nested-schema.json')

    # Load with default settings - Flattening disabled
    self.persist_lines_with_cache(tap_lines)

    # Get loaded rows from tables - Transform JSON to string at query time
    snowflake = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    unflattened_table = snowflake.query("""
        SELECT c_pk
              ,TO_CHAR(c_array) c_array
              ,TO_CHAR(c_object) c_object
              ,TO_CHAR(c_object) c_object_with_props
              ,TO_CHAR(c_nested_object) c_nested_object
          FROM {}.test_table_nested_schema
         ORDER BY c_pk""".format(target_schema))

    # Should be valid nested JSON strings
    self.assertEqual(unflattened_table, [{
        'C_PK': 1,
        'C_ARRAY': '[1,2,3]',
        'C_OBJECT': '{"key_1":"value_1"}',
        'C_OBJECT_WITH_PROPS': '{"key_1":"value_1"}',
        'C_NESTED_OBJECT': '{"nested_prop_1":"nested_value_1","nested_prop_2":"nested_value_2","nested_prop_3":{"multi_nested_prop_1":"multi_value_1","multi_nested_prop_2":"multi_value_2"}}'
    }])
def test_nested_schema_flattening(self):
    """Loading nested JSON objects with flattening enabled"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

    # Turning on data flattening
    self.config['data_flattening_max_level'] = 10

    # Load with flattening enabled
    self.persist_lines_with_cache(tap_lines)

    # Get loaded rows from tables
    snowflake = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    flattened_table = snowflake.query(
        "SELECT * FROM {}.test_table_nested_schema ORDER BY c_pk".format(target_schema))

    # Should be flattened columns
    self.assertEqual(
        flattened_table,
        [{
            'C_PK': 1,
            'C_ARRAY': '[\n 1,\n 2,\n 3\n]',
            # Cannot map RECORD to SCHEMA: the schema has no properties, which flattening requires
            'C_OBJECT': None,
            'C_OBJECT_WITH_PROPS__KEY_1': 'value_1',
            'C_NESTED_OBJECT__NESTED_PROP_1': 'nested_value_1',
            'C_NESTED_OBJECT__NESTED_PROP_2': 'nested_value_2',
            'C_NESTED_OBJECT__NESTED_PROP_3__MULTI_NESTED_PROP_1': 'multi_value_1',
            'C_NESTED_OBJECT__NESTED_PROP_3__MULTI_NESTED_PROP_2': 'multi_value_2',
        }])
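# Illustrative helper (a minimal sketch, not the target's actual implementation): the
# flattened column names asserted above follow a convention where nested object keys are
# joined with double underscores, up to data_flattening_max_level levels deep.
def _flatten_record_sketch(record, parent_key='', sep='__', max_level=10, level=0):
    """Flatten nested dicts into single-level keys joined by `sep` (assumed behaviour)."""
    items = {}
    for key, value in record.items():
        new_key = '{}{}{}'.format(parent_key, sep, key) if parent_key else key
        if isinstance(value, dict) and level < max_level:
            # Recurse into nested objects until the maximum flattening level is reached
            items.update(_flatten_record_sketch(value, new_key, sep, max_level, level + 1))
        else:
            items[new_key] = value
    return items

# Example: {'c_nested_object': {'nested_prop_3': {'multi_nested_prop_1': 'x'}}}
# flattens to {'c_nested_object__nested_prop_3__multi_nested_prop_1': 'x'}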
def test_loading_csv_files(self):
    """Loading multiple tables from the same input tap with various column types"""
    tap_lines = test_utils.get_test_tap_lines(
        'messages-with-three-streams.json')
    self.persist_messages(tap_lines)

    self.assert_three_streams_are_in_s3_bucket()
def test_information_schema_cache_outdated(self):
    """If the information schema cache is not up to date then loading should fail"""
    tap_lines_with_multi_streams = test_utils.get_test_tap_lines(
        "messages-with-three-streams.json")

    # 1) Simulate an out of date cache:
    #    Table is in the cache but does not exist in the database
    snowflake = DbSync(self.config)
    target_schema = self.config.get("default_target_schema", "").upper()
    snowflake.query("""
        CREATE TABLE IF NOT EXISTS {}.columns (table_schema VARCHAR, table_name VARCHAR, column_name VARCHAR, data_type VARCHAR)
    """.format(snowflake.pipelinewise_schema))
    snowflake.query("""
        INSERT INTO {0}.columns (table_schema, table_name, column_name, data_type)
        SELECT '{1}', 'TEST_TABLE_ONE', 'DUMMY_COLUMN_1', 'TEXT' UNION
        SELECT '{1}', 'TEST_TABLE_ONE', 'DUMMY_COLUMN_2', 'TEXT' UNION
        SELECT '{1}', 'TEST_TABLE_TWO', 'DUMMY_COLUMN_3', 'TEXT'
    """.format(snowflake.pipelinewise_schema, target_schema))

    # Loading into an outdated information_schema cache should fail because the table does not exist
    with self.assertRaises(Exception):
        self.persist_lines_with_cache(tap_lines_with_multi_streams)

    # 2) Simulate an out of date cache:
    #    Table is in the cache but its structure is not in sync with the actual table in the database
    snowflake.query("CREATE SCHEMA IF NOT EXISTS {}".format(target_schema))
    snowflake.query(
        "CREATE OR REPLACE TABLE {}.test_table_one (C_PK NUMBER, C_INT NUMBER, C_VARCHAR TEXT)"
        .format(target_schema))

    # Loading into an outdated information_schema cache should fail because a column already exists:
    # it tries to add the new column based on the values in the cache, but the column is already in the table
    with self.assertRaises(Exception):
        self.persist_lines_with_cache(tap_lines_with_multi_streams)
def test_nested_schema_flattening(self):
    """Loading nested JSON objects with flattening enabled"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

    # Turning on data flattening
    self.config['data_flattening_max_level'] = 10

    # Load with flattening enabled
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    flattened_table = query(bigquery,
                            "SELECT * FROM {}.test_table_nested_schema ORDER BY c_pk".format(target_schema))

    # Should be flattened columns
    self.assertEqual(
        flattened_table,
        [{
            'c_pk': 1,
            'c_array': '[1, 2, 3]',
            'c_object': None,
            'c_object_with_props__key_1': 'value_1',
            'c_nested_object__nested_prop_1': 'nested_value_1',
            'c_nested_object__nested_prop_2': 'nested_value_2',
            'c_nested_object__nested_prop_3__multi_nested_prop_1': 'multi_value_1',
            'c_nested_object__nested_prop_3__multi_nested_prop_2': 'multi_value_2',
        }])
def test_naming_convention(self):
    """Loading multiple tables using a custom S3 key naming convention"""
    tap_lines = test_utils.get_test_tap_lines(
        'messages-with-three-streams.json')

    self.config['naming_convention'] = "tester/{stream}/{timestamp}.csv"
    self.persist_messages(tap_lines)

    self.assert_three_streams_are_in_s3_bucket()
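# Hedged sketch of how a naming_convention template like the one above can be expanded into
# a concrete S3 key. The helper name and the timestamp format are assumptions for illustration
# only; the target's real key formatting may differ.
def _format_s3_key_sketch(naming_convention, stream, now=None):
    """Expand the {stream} and {timestamp} placeholders of a naming convention template."""
    now = now or datetime.datetime.utcnow()
    return naming_convention.format(
        stream=stream,
        timestamp=now.strftime('%Y%m%dT%H%M%S'))

# _format_s3_key_sketch("tester/{stream}/{timestamp}.csv", "test_table_one")
# -> e.g. "tester/test_table_one/20190212T020000.csv"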
def test_table_with_pk_multi_column_removed(self):
    """Test that a table with a multi-column primary key gets clustered by those columns
    and that removing the primary key doesn't cause errors"""
    tap_lines = test_utils.get_test_tap_lines('table_with_multi_pk_cluster.json')
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table = query(bigquery, "SELECT * FROM {}.test_table_cluster_multi ORDER BY c_pk".format(target_schema))
    cluster_columns = query(bigquery,
                            "SELECT clustering_ordinal_position, column_name "
                            "FROM {}.INFORMATION_SCHEMA.COLUMNS "
                            "WHERE table_name = 'test_table_cluster_multi' "
                            "AND clustering_ordinal_position > 0 "
                            "ORDER BY 1".format(target_schema))

    # ----------------------------------------------------------------------
    # Check rows in table
    # ----------------------------------------------------------------------
    expected_table = [
        {'c_pk': 2, 'c_int': 2, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 3, 'c_int': 3, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
    ]

    expected_cluster_columns = [
        {'clustering_ordinal_position': 1, 'column_name': 'c_pk'},
        {'clustering_ordinal_position': 2, 'column_name': 'c_varchar'}
    ]

    self.assertEqual(self.remove_metadata_columns_from_rows(table), expected_table)
    self.assertEqual(cluster_columns, expected_cluster_columns)

    # ----------------------------------------------------------------------
    # Remove the primary key and check if clustering stayed unchanged
    # ----------------------------------------------------------------------
    self.config['primary_key_required'] = False
    tap_lines = test_utils.get_test_tap_lines('table_with_multi_pk_cluster_changed.json')
    self.persist_lines(tap_lines)

    table_changed = query(bigquery,
                          "SELECT * FROM {}.test_table_cluster_multi ORDER BY c_pk".format(target_schema))
    cluster_columns_changed = query(bigquery,
                                    "SELECT clustering_ordinal_position, column_name "
                                    "FROM {}.INFORMATION_SCHEMA.COLUMNS "
                                    "WHERE table_name = 'test_table_cluster_multi' "
                                    "AND clustering_ordinal_position > 0 "
                                    "ORDER BY 1".format(target_schema))

    expected_table_changed = [
        {'c_pk': 2, 'c_int': 2, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 2, 'c_int': 2, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 3, 'c_int': 3, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
    ]

    expected_cluster_columns_changed = []

    self.assertEqual(self.remove_metadata_columns_from_rows(table_changed), expected_table_changed)
    self.assertEqual(cluster_columns_changed, expected_cluster_columns)
def test_loading_csv_files_with_gzip_compression(self):
    """Loading multiple tables from the same input tap with gzip compression"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

    # Turning on gzip compression
    self.config['compression'] = 'gzip'
    self.persist_messages(tap_lines)

    self.assert_three_streams_are_in_s3_bucket(compression='gzip')
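# Hedged sketch of how one could verify a gzip-compressed object written by the test above.
# The bucket/key arguments and a configured boto3 client are assumptions for illustration;
# the suite's own assert helper may verify this differently.
import gzip

import boto3

def _read_gzipped_s3_object_sketch(bucket, key):
    """Download an S3 object and return its gunzipped text content."""
    s3_client = boto3.client('s3')
    compressed_body = s3_client.get_object(Bucket=bucket, Key=key)['Body'].read()
    return gzip.decompress(compressed_body).decode('utf-8')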
def test_loading_tables_with_client_side_encryption_and_wrong_master_key(self):
    """Loading multiple tables from the same input tap with various column types"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

    # Turning on client-side encryption with a well-formatted but wrong master key
    self.config['client_side_encryption_master_key'] = "Wr0n6m45t3rKeY0123456789a0123456789a0123456="

    with assert_raises(ProgrammingError):
        self.persist_lines_with_cache(tap_lines)
def test_loading_tables_with_no_encryption(self):
    """Loading multiple tables from the same input tap with various column types"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

    # Turn off client-side encryption and load
    self.config['client_side_encryption_master_key'] = ''
    self.persist_lines(tap_lines)

    self.assert_three_streams_are_into_bigquery()
def test_logical_streams_from_pg_with_hard_delete_and_default_batch_size_should_pass(self):
    """Tests logical streams from pg with inserts, updates and deletes"""
    tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')

    # Turning on hard delete mode
    self.config['hard_delete'] = True
    self.persist_lines(tap_lines)

    self.assert_logical_streams_are_in_bigquery(True)
def test_loading_tables_with_custom_temp_dir(self):
    """Loading multiple tables from the same input tap using custom temp directory"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

    # Use custom temp_dir
    self.config['temp_dir'] = '~/.pipelinewise/tmp'
    self.persist_messages(tap_lines)

    self.assert_three_streams_are_in_s3_bucket()
def test_loading_unicode_characters(self):
    """Loading unicode encoded characters"""
    tap_lines = test_utils.get_test_tap_lines(
        'messages-with-unicode-characters.json')

    # Load with default settings
    target_snowflake.persist_lines(self.config, tap_lines)

    # Get loaded rows from tables
    snowflake = DbSync(self.config)
    target_schema = self.config.get('schema', '')
    table_unicode = snowflake.query(
        "SELECT * FROM {}.test_table_unicode".format(target_schema))

    self.assertEqual(table_unicode, [{
        'C_INT': 1,
        'C_PK': 1,
        'C_VARCHAR': 'Hello world, Καλημέρα κόσμε, コンニチハ'
    }, {
        'C_INT': 2,
        'C_PK': 2,
        'C_VARCHAR': 'Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年.'
    }, {
        'C_INT': 3,
        'C_PK': 3,
        'C_VARCHAR': 'Russian: Зарегистрируйтесь сейчас на Десятую Международную Конференцию по'
    }, {
        'C_INT': 4,
        'C_PK': 4,
        'C_VARCHAR': 'Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช'
    }, {
        'C_INT': 5,
        'C_PK': 5,
        'C_VARCHAR': 'Arabic: لقد لعبت أنت وأصدقاؤك لمدة وحصلتم علي من إجمالي النقاط'
    }, {
        'C_INT': 6,
        'C_PK': 6,
        'C_VARCHAR': 'Special Characters: [",\'!@£$%^&*()]'
    }])
def test_loading_tables_with_client_side_encryption(self):
    """Loading multiple tables from the same input tap with various column types"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

    # Turning on client-side encryption and load
    self.config['client_side_encryption_master_key'] = os.environ.get('CLIENT_SIDE_ENCRYPTION_MASTER_KEY')
    self.persist_lines(tap_lines)

    self.assert_three_streams_are_into_bigquery()
def test_column_name_change(self):
    """Tests correct renaming of bigquery columns after source change"""
    tap_lines_before_column_name_change = test_utils.get_test_tap_lines('messages-with-three-streams.json')
    tap_lines_after_column_name_change = test_utils.get_test_tap_lines(
        'messages-with-three-streams-modified-column.json')

    # Load with default settings
    self.persist_lines(tap_lines_before_column_name_change)
    self.persist_lines(tap_lines_after_column_name_change)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table_one = query(bigquery, "SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema))
    table_two = query(bigquery, "SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema))
    table_three = query(bigquery, "SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema))

    # Table one should have no changes
    self.assertEqual(
        table_one,
        [{'c_int': 1, 'c_pk': 1, 'c_varchar': '1'}])

    # Table two should have the versioned column
    self.assertEqual(
        table_two,
        [
            {'c_int': 1, 'c_pk': 1, 'c_varchar': '1',
             'c_date': datetime.datetime(2019, 2, 1, 15, 12, 45, tzinfo=timezone.utc), 'c_date__st': None},
            {'c_int': 2, 'c_pk': 2, 'c_varchar': '2',
             'c_date': datetime.datetime(2019, 2, 10, 2, tzinfo=timezone.utc), 'c_date__st': '2019-02-12 02:00:00'},
            {'c_int': 3, 'c_pk': 3, 'c_varchar': '2', 'c_date': None, 'c_date__st': '2019-02-15 02:00:00'}
        ]
    )

    # Table three should have renamed columns
    self.assertEqual(
        table_three,
        [
            {'c_int': 1, 'c_pk': 1, 'c_time': datetime.time(4, 0), 'c_varchar': '1', 'c_time_renamed': None},
            {'c_int': 2, 'c_pk': 2, 'c_time': datetime.time(7, 15), 'c_varchar': '2', 'c_time_renamed': None},
            {'c_int': 3, 'c_pk': 3, 'c_time': datetime.time(23, 0, 3), 'c_varchar': '3',
             'c_time_renamed': datetime.time(8, 15)},
            {'c_int': 4, 'c_pk': 4, 'c_time': None, 'c_varchar': '4',
             'c_time_renamed': datetime.time(23, 0, 3)}
        ])
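# Note on the expected rows above (an interpretation of the test data, not a statement from
# this file): when a source column changes type, the original column is kept and a
# type-suffixed sibling is added for the new type - here c_date__st appears to hold the
# values that arrive as strings after the change, while c_date keeps the timestamp values.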
def test_loading_unicode_characters(self):
    """Loading unicode encoded characters"""
    tap_lines = test_utils.get_test_tap_lines(
        "messages-with-unicode-characters.json")

    # Load with default settings
    self.persist_lines_with_cache(tap_lines)

    # Get loaded rows from tables
    snowflake = DbSync(self.config)
    target_schema = self.config.get("default_target_schema", "")
    table_unicode = snowflake.query(
        "SELECT * FROM {}.test_table_unicode ORDER BY C_INT".format(
            target_schema))

    self.assertEqual(
        table_unicode,
        [
            {
                "C_INT": 1,
                "C_PK": 1,
                "C_VARCHAR": "Hello world, Καλημέρα κόσμε, コンニチハ"
            },
            {
                "C_INT": 2,
                "C_PK": 2,
                "C_VARCHAR": "Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年."
            },
            {
                "C_INT": 3,
                "C_PK": 3,
                "C_VARCHAR": "Russian: Зарегистрируйтесь сейчас на Десятую Международную Конференцию по",
            },
            {
                "C_INT": 4,
                "C_PK": 4,
                "C_VARCHAR": "Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช"
            },
            {
                "C_INT": 5,
                "C_PK": 5,
                "C_VARCHAR": "Arabic: لقد لعبت أنت وأصدقاؤك لمدة وحصلتم علي من إجمالي النقاط",
            },
            {
                "C_INT": 6,
                "C_PK": 6,
                "C_VARCHAR": "Special Characters: [\",'!@£$%^&*()]"
            },
        ],
    )
def test_loading_tables_with_metadata_columns(self):
    """Loading multiple tables from the same input tap with various column types"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

    # Turning on adding metadata columns
    self.config['add_metadata_columns'] = True
    self.persist_lines(tap_lines)

    # Check if data loaded correctly and metadata columns exist
    self.assert_three_streams_are_into_bigquery(should_metadata_columns_exist=True)
def test_loading_csv_files_with_invalid_compression(self):
    """Loading multiple tables from the same input tap with invalid compression"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

    # Turning on an unsupported compression method
    self.config['compression'] = 'INVALID_COMPRESSION_METHOD'

    # Invalid compression method should raise an exception
    with assert_raises(NotImplementedError):
        self.persist_messages(tap_lines)
def test_logical_streams_from_pg_with_hard_delete_and_batch_size_of_5_and_no_records_should_pass(self):
    """Tests logical streams from pg with inserts, updates and deletes"""
    tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams-no-records.json')

    # Turning on hard delete mode
    self.config['hard_delete'] = True
    self.config['batch_size_rows'] = 5
    self.persist_lines(tap_lines)

    self.assert_logical_streams_are_in_bigquery_and_are_empty()
def test_loading_tables_with_no_encryption(self):
    """Loading multiple tables from the same input tap with various column types"""
    tap_lines = test_utils.get_test_tap_lines(
        "messages-with-three-streams.json")

    # Turn off client-side encryption and load
    self.config["client_side_encryption_master_key"] = ""
    self.persist_lines_with_cache(tap_lines)

    self.assert_three_streams_are_into_snowflake()
def test_logical_streams_from_pg_with_hard_delete_and_batch_size_of_5(self):
    """Tests logical streams from pg with inserts, updates and deletes"""
    tap_lines = test_utils.get_test_tap_lines('messages-logical-streams.json')

    # Turning on hard delete mode
    self.config['hard_delete'] = True
    self.config['batch_size_rows'] = 5
    self.persist_lines_with_cache(tap_lines)

    self.assert_logical_streams_are_in_snowflake(True)
def test_loading_tables_with_defined_parallelism(self):
    """Loading multiple tables from the same input tap with various column types"""
    tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

    # Using fixed 1 thread parallelism
    self.config['parallelism'] = 1
    self.persist_lines(tap_lines)

    # Check if data loaded correctly
    self.assert_three_streams_are_into_bigquery()
def test_table_with_pk_adds_clustering(self):
    """Tests that a table with a primary key gets clustered on those fields"""
    tap_lines = test_utils.get_test_tap_lines('table_with_pk_cluster.json')
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table = query(bigquery, "SELECT * FROM {}.test_table_cluster ORDER BY c_pk".format(target_schema))
    cluster_columns = query(bigquery,
                            "SELECT clustering_ordinal_position, column_name "
                            "FROM {}.INFORMATION_SCHEMA.COLUMNS "
                            "WHERE table_name = 'test_table_cluster' "
                            "AND clustering_ordinal_position > 0 "
                            "ORDER BY 1".format(target_schema))

    # ----------------------------------------------------------------------
    # Check rows in table
    # ----------------------------------------------------------------------
    expected_table = [
        {'c_pk': 2, 'c_int': 2, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 3, 'c_int': 3, 'c_varchar': '2',
         'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
    ]

    expected_cluster_columns = [
        {'clustering_ordinal_position': 1, 'column_name': 'c_pk'},
    ]

    self.assertEqual(self.remove_metadata_columns_from_rows(table), expected_table)
    self.assertEqual(cluster_columns, expected_cluster_columns)

    # ----------------------------------------------------------------------
    # Change the primary key and check if clustering stayed unchanged
    # ----------------------------------------------------------------------
    tap_lines = test_utils.get_test_tap_lines('table_with_pk_cluster_changed.json')
    self.persist_lines(tap_lines)

    table_changed = query(bigquery, "SELECT * FROM {}.test_table_cluster ORDER BY c_pk".format(target_schema))
    cluster_columns_changed = query(bigquery,
                                    "SELECT clustering_ordinal_position, column_name "
                                    "FROM {}.INFORMATION_SCHEMA.COLUMNS "
                                    "WHERE table_name = 'test_table_cluster' "
                                    "AND clustering_ordinal_position > 0 "
                                    "ORDER BY 1".format(target_schema))

    expected_table_changed = [
        {'c_pk': 2, 'c_int': 2, 'c_varchar': 'c',
         'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
        {'c_pk': 3, 'c_int': 3, 'c_varchar': 'c',
         'c_date': datetime.datetime(2022, 5, 15, 5, 0, 0, tzinfo=timezone.utc)}
    ]

    self.assertEqual(self.remove_metadata_columns_from_rows(table_changed), expected_table_changed)
    self.assertEqual(cluster_columns_changed, expected_cluster_columns)
def test_table_with_no_pk(self):
    """Tests that a table with no primary key loads correctly"""
    tap_lines = test_utils.get_test_tap_lines('table_with_no_pk.json')
    self.config['primary_key_required'] = False
    self.persist_lines(tap_lines)

    # Get loaded rows from tables
    bigquery = DbSync(self.config)
    target_schema = self.config.get('default_target_schema', '')
    table = query(bigquery, "SELECT * FROM {}.test_table_no_pk ORDER BY c_id".format(target_schema))

    self.assertEqual(len(table), 2)
def test_loading_tables_with_hard_delete(self):
    """Loading multiple tables from the same input tap with deleted rows"""
    tap_lines = test_utils.get_test_tap_lines(
        "messages-with-three-streams.json")

    # Turning on hard delete mode
    self.config["hard_delete"] = True
    self.persist_lines_with_cache(tap_lines)

    # Check if data loaded correctly and metadata columns exist
    self.assert_three_streams_are_into_snowflake(
        should_metadata_columns_exist=True,
        should_hard_deleted_rows=True)
def test_loading_with_multiple_schema(self):
    """Loading table with multiple SCHEMA messages"""
    tap_lines = test_utils.get_test_tap_lines(
        "messages-with-multi-schemas.json")

    # Load with default settings
    self.persist_lines_with_cache(tap_lines)

    # Check if data loaded correctly
    self.assert_three_streams_are_into_snowflake(
        should_metadata_columns_exist=False,
        should_hard_deleted_rows=False)
def test_non_db_friendly_columns(self):
    """Loading non-DB-friendly column names (camelCase, minus signs, etc.)"""
    tap_lines = test_utils.get_test_tap_lines(
        "messages-with-non-db-friendly-columns.json")

    # Load with default settings
    self.persist_lines_with_cache(tap_lines)

    # Get loaded rows from tables
    snowflake = DbSync(self.config)
    target_schema = self.config.get("default_target_schema", "")
    table_non_db_friendly_columns = snowflake.query(
        "SELECT * FROM {}.test_table_non_db_friendly_columns ORDER BY c_pk"
        .format(target_schema))

    self.assertEqual(
        table_non_db_friendly_columns,
        [
            {"C_PK": 1, "CAMELCASECOLUMN": "Dummy row 1", "MINUS-COLUMN": "Dummy row 1"},
            {"C_PK": 2, "CAMELCASECOLUMN": "Dummy row 2", "MINUS-COLUMN": "Dummy row 2"},
            {"C_PK": 3, "CAMELCASECOLUMN": "Dummy row 3", "MINUS-COLUMN": "Dummy row 3"},
            {"C_PK": 4, "CAMELCASECOLUMN": "Dummy row 4", "MINUS-COLUMN": "Dummy row 4"},
            {"C_PK": 5, "CAMELCASECOLUMN": "Dummy row 5", "MINUS-COLUMN": "Dummy row 5"},
        ],
    )