def test_run(self): """ Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } # self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part if records_by_stream[stream]['messages'][-1].get("data"): last_row_data = True else: last_row_data = False self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-2]['action'], 'activate_version') if last_row_data: self.assertEqual( records_by_stream[stream]['messages'][-3]['action'], 'activate_version') else: self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertEqual( len([ m for m in records_by_stream[stream]['messages'][1:] if m["action"] == "activate_version" ]), 2, msg= "Expect 2 more activate version messages for end of full table and beginning of log based" ) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # Verify all data is correct for the full table part if last_row_data: final_row = -3 else: final_row = -2 for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream] ['messages'][1:final_row])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds 
self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, time): # sql server time has second resolution only self.assertEqual( expected_value.replace( microsecond=0).isoformat().replace( '+00:00', ''), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, date): # sql server time has second resolution only self.assertEqual( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value, actual_row["data"][column_name])) # Verify all data is correct for the log replication part if sent if records_by_stream[stream]['messages'][-1].get("data"): for column_name, expected_value in expected_messages[-1][ "data"].items(): if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, time): # sql server time has second resolution only self.assertEqual( expected_value.replace( microsecond=0).isoformat().replace( '+00:00', ''), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, date): # sql server time has second resolution only self.assertEqual( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value, actual_row["data"][column_name])) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") inital_log_version = bookmark['current_log_version'] self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # 
---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "dates_and_times" column_name = [ "pk", "just_a_date", "date_and_time", "bigger_range_and_precision_datetime", "datetime_with_timezones", "datetime_no_seconds", "its_time" ] new_date_value = datetime(2019, 7, 22, 21, 11, 40, 573000) insert_value = [ (6, new_date_value.date(), new_date_value, datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc), datetime(5749, 4, 3, 1, 47, 47, 110809, tzinfo=timezone(timedelta(hours=10, minutes=5))).isoformat(), datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc), time(21, 9, 56, 0, tzinfo=timezone.utc)) ] update_value = [ (2, new_date_value.date(), new_date_value, datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc), datetime(5749, 4, 3, 1, 47, 47, 110809, tzinfo=timezone(timedelta(hours=10, minutes=5))).isoformat(), datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc), time(21, 9, 56, 0, tzinfo=timezone.utc)) ] delete_value = [(3, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [ (6, new_date_value.date(), new_date_value, datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc), datetime(5749, 4, 3, 1, 47, 47, 110809, tzinfo=timezone(timedelta( hours=10, minutes=5))).astimezone(timezone.utc), datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc), time(21, 9, 56, 0, tzinfo=timezone.utc)) ] update_value = [ (2, new_date_value.date(), new_date_value, datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc), datetime(5749, 4, 3, 1, 47, 47, 110809, tzinfo=timezone(timedelta( hours=10, minutes=5))).astimezone(timezone.utc), datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc), time(21, 9, 56, 0, tzinfo=timezone.utc)) ] insert_value = [insert_value[0] + (None, )] update_value = [update_value[0] + (None, )] delete_value = [(3, None, None, None, None, None, None, datetime.utcnow())] self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"]["values"] = \ [self.expected_metadata()["data_types_database_dbo_dates_and_times"]["values"][-1]] + \ insert_value + delete_value + update_value self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][ "fields"].append({ "_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic' } }) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertTrue( all([ 
message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:] ])) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertGreaterEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if column_name != "_sdc_deleted_at": if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace( '+00:00', 'Z').replace('000', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace( '+00:00', 'Z').replace('000', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, time): # sql server time has second resolution only self.assertEqual( expected_value.replace( microsecond=0).isoformat().replace( '+00:00', ''), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, date): # sql server time has second resolution only self.assertEqual( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value, actual_row["data"][column_name])) elif expected_value: # we have an expected value for a deleted row try: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%S.%fZ") except ValueError: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%SZ") self.assertGreaterEqual( actual_value, expected_value - timedelta(seconds=15)) self.assertLessEqual( actual_value, expected_value + timedelta(seconds=15)) else: # the row wasn't deleted so we can either not pass the column or it can be None self.assertIsNone( actual_row["data"].get(column_name)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") new_log_version = bookmark['current_log_version'] self.assertGreater(new_log_version, inital_log_version, msg='expected log version to increase') self.assertEqual( 
bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
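# ----------------------------------------------------------------------
# Sketch of a hypothetical helper (the name and placement are illustrative
# assumptions, not something this suite defines): the datetime / time / date
# branches in the assertions above each normalize an expected Python value
# into the string the target is expected to emit. This restates those inline
# replace() rules in one place -- datetimes keep millisecond precision and
# end in 'Z', times are compared at second resolution with any offset
# dropped, and plain dates are rendered as midnight with a '+00:00' offset.
from datetime import date, datetime, time


def expected_value_to_target_string(value):
    """Render an expected value the way the assertions above expect it."""
    if isinstance(value, datetime):
        # sql server only keeps milliseconds, not microseconds
        return value.isoformat().replace('000+00:00', 'Z').replace('+00:00', 'Z')
    if isinstance(value, time):
        # time columns are asserted at second resolution, offset stripped
        return value.replace(microsecond=0).isoformat().replace('+00:00', '')
    if isinstance(value, date):
        # date columns come back as midnight with an explicit UTC offset
        return value.isoformat() + 'T00:00:00+00:00'
    return value
# ----------------------------------------------------------------------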
def test_run(self): """ Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'LOG_BASED'}}] # Don't select unsupported data types non_selected_properties = ["nvarchar_text", "varchar_text", "varbinary_data", "geospacial", "geospacial_map", "markup", "tree", "variant", "SpecialPurposeColumns", "started_at", "ended_at"] BaseTapTest.select_all_streams_and_fields( conn_id, found_catalogs, additional_md=additional_md, non_selected_properties=non_selected_properties) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()} # self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream]['table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part if records_by_stream[stream]['messages'][-1].get("data"): last_row_data = True else: last_row_data = False self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-2]['action'], 'activate_version') if last_row_data: self.assertEqual( records_by_stream[stream]['messages'][-3]['action'], 'activate_version') else: self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertEqual( len([m for m in records_by_stream[stream]['messages'][1:] if m["action"] == "activate_version"]), 2, msg="Expect 2 more activate version messages for end of full table and beginning of log based") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [ { "action": "upsert", "data": { column: value for column, value in list(zip(column_names, stream_expected_data[self.VALUES][row])) if column not in non_selected_properties } } for row in range(len(stream_expected_data[self.VALUES])) ] # Verify all data is correct for the full table part if last_row_data: final_row = -3 else: final_row = -2 for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:final_row])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], 
"upsert") self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row["data"].items(): self.assertEqual(expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) # Verify all data is correct for the log replication part if sent if records_by_stream[stream]['messages'][-1].get("data"): for column_name, expected_value in expected_messages[-1]["data"].items(): self.assertEqual(expected_value, records_by_stream[stream]['messages'][-1]["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg="expected bookmark to have current_log_version because we are using log replication") self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") inital_log_version = bookmark['current_log_version'] self.assertEqual(bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format(expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "text_and_image_deprecated_soon" column_name = ["pk", "nvarchar_text", "varchar_text", "varbinary_data", "rowversion_synonym_timestamp"] insert_value = [(2, "JKL", "MNO", "PQR".encode('utf-8'))] update_value = [(1, "JKL", "MNO", "PQR".encode('utf-8'))] delete_value = [(0, )] query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1])) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) values = insert_value + update_value rows = mssql_cursor_context_manager(*[ "select rowversion_synonym_timestamp from data_types_database.dbo.text_and_image_deprecated_soon " "where pk in (0,1,2) order by pk desc"]) rows = [tuple(row) for row in rows] rows = [("0x{}".format(value.hex().upper()), ) for value, in rows] row_with_version = [x[0] + x[1] + (None, ) for x in zip(values, rows)] row_with_version.append((0, None, None, None, None, datetime.utcnow())) row_with_version[1], row_with_version[2] = row_with_version[2], row_with_version[1] self.EXPECTED_METADATA['data_types_database_dbo_text_and_image_deprecated_soon']['values'] = row_with_version self.EXPECTED_METADATA["data_types_database_dbo_text_and_image_deprecated_soon"]["fields"].append( {"_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}} ) database_name = "data_types_database" schema_name = "dbo" table_name = "weirdos" column_name = [ "pk", "geospacial", "geospacial_map", "markup", "guid", "tree", "variant", 
"SpecialPurposeColumns", "version"] insert_value = [(3, None, None, None, str(uuid.uuid1()).upper(), None, None, None)] update_value = [(1, None, None, None, str(uuid.uuid1()).upper(), None, None, None)] delete_value = [(0,)] query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1])) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) values = insert_value + update_value rows = mssql_cursor_context_manager(*[ "select version from data_types_database.dbo.weirdos " "where pk in (0,1,3) order by pk desc"]) rows = [tuple(row) for row in rows] rows = [("0x{}".format(value.hex().upper()), ) for value, in rows] row_with_version = [x[0] + x[1] + (None, ) for x in zip(values, rows)] row_with_version.append((0, None, None, None, None, None, None, None, None, datetime.utcnow())) row_with_version[1], row_with_version[2] = row_with_version[2], row_with_version[1] self.EXPECTED_METADATA['data_types_database_dbo_weirdos']['values'] = row_with_version self.EXPECTED_METADATA["data_types_database_dbo_weirdos"]["fields"].append( {"_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}} ) database_name = "data_types_database" schema_name = "dbo" table_name = "computed_columns" column_name = ["pk", "started_at", "ended_at", "durations_days"] insert_value = [(2, datetime(1980, 5, 30, 16), datetime.now())] update_value = [(1, datetime(1942, 11, 30), datetime(2017, 2, 12))] delete_value = [(0,)] query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1])) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) values = insert_value + update_value # + [delete_value[0] + (None, None)] rows = mssql_cursor_context_manager( *["select durations_days from data_types_database.dbo.computed_columns " "where pk in (0,1,2) order by pk desc"]) rows = [tuple(row) for row in rows] row_with_duration = [x[0] + x[1] + (None, ) for x in zip(values, rows)] row_with_duration.append((0, None, None, None, datetime.utcnow())) row_with_duration[1], row_with_duration[2] = row_with_duration[2], row_with_duration[1] self.EXPECTED_METADATA['data_types_database_dbo_computed_columns']['values'] = row_with_duration self.EXPECTED_METADATA["data_types_database_dbo_computed_columns"]["fields"].append( {"_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}} ) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()} self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a 
subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertTrue(all( [message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:]] )) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [ { "action": "upsert", "data": { column: value for column, value in list(zip(column_names, stream_expected_data[self.VALUES][row])) if column not in non_selected_properties } } for row in range(len(stream_expected_data[self.VALUES])) ] # remove sequences from actual values for comparison [message.pop("sequence") for message in records_by_stream[stream]['messages'][1:]] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertGreaterEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row["data"].items(): if column_name != "_sdc_deleted_at": self.assertEqual(expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) elif expected_value: # we have an expected value for a deleted row try: actual_value = datetime.strptime(actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%S.%fZ") except ValueError: actual_value = datetime.strptime(actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%SZ") self.assertGreaterEqual(actual_value, expected_value - timedelta(seconds=15)) self.assertLessEqual(actual_value, expected_value + timedelta(seconds=15)) else: # the row wasn't deleted so we can either not pass the column or it can be None self.assertIsNone(actual_row["data"].get(column_name)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg="expected bookmark to have current_log_version because we are using log replication") self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") new_log_version = bookmark['current_log_version'] self.assertGreater(new_log_version, inital_log_version, msg='expected log version to increase') self.assertEqual(bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual(bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format(expected_schemas, records_by_stream[stream]['schema']))
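# ----------------------------------------------------------------------
# Standalone sketch of the rowversion handling used above (the byte value
# here is made up for illustration): the "rowversion_synonym_timestamp" and
# "version" columns selected back out of SQL Server arrive as raw bytes
# (rowversion is an 8-byte value), and the expected records render them
# exactly as the queries above do -- an uppercase hex string with a "0x"
# prefix.
raw_rowversion = b'\x00\x00\x00\x00\x00\x00\x07\xd1'     # hypothetical value
formatted = "0x{}".format(raw_rowversion.hex().upper())  # same expression as above
assert formatted == "0x00000000000007D1"
# ----------------------------------------------------------------------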
def test_run(self): """stream_expected_data[self.VALUES] Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'INCREMENTAL', 'replication-key': 'replication_key_column'}}] non_selected_properties = [] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()} self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream]['table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue( all([m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1]]), msg="Expect all but the first message to be upserts") self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]), len(stream_expected_data[self.VALUES]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] replication_column = column_names.index("replication_key_column") expected_messages = [ { "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) if column not in non_selected_properties } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[replication_column] is not None, row[replication_column])) ] # Verify all data is correct for incremental for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row["data"].items(): self.assertEqual(expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are 
correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key self.assertEqual(bookmark['replication_key_value'], max([row[replication_column] for row in stream_expected_data[self.VALUES] if row[replication_column] is not None])) # self.assertEqual(bookmark['replication_key'], 'replication_key_column') self.assertEqual(bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format(expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "constraints_database" schema_name = "dbo" table_name = "no_constraints" column_name = ["replication_key_column"] insert_value = [(49, )] update_value = [(3, )] delete_value = [(0, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name)) query_list.extend([ "UPDATE constraints_database.dbo.no_constraints " "SET replication_key_column = 3 " "WHERE replication_key_column = 1"]) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_no_constraints"]["values"] = \ [(2, )] + insert_value + update_value database_name = "constraints_database" schema_name = "dbo" table_name = "multiple_column_pk" column_name = ["first_name", "last_name", "replication_key_column"] insert_value = [("Brian", "Lampkin", 72)] update_value = [("Sergey", "Brin", 65)] delete_value = [("Larry", "Page")] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:2])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_multiple_column_pk"]["values"] = \ [("Tim", "Berners-Lee", 64)] + insert_value + update_value # duplicative of other testing # table_name = "single_column_pk" # column_name = ["pk", "replication_key_column"] # insert_value = [(3, 49)] # update_value = [(1, 65)] # delete_value = [(0,)] # query_list = (insert(database_name, schema_name, table_name, insert_value)) # query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) # query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) # mssql_cursor_context_manager(*query_list) # insert_value = [insert_value[0] + (None,)] # update_value = [update_value[0] + (None,)] # delete_value = [delete_value[0] + (None, datetime.utcnow())] # self.EXPECTED_METADATA["constraints_database_dbo_single_column_pk"]["values"] = \ # insert_value + delete_value + update_value table_name = "pk_with_fk" column_name = ["pk", 
"replication_key_column"] insert_value = [(5, 2), (6, None)] delete_value = [(1,), (2,)] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_pk_with_fk"]["values"] = \ [(0, 1), (3, 1)] + insert_value[:-1] table_name = "pk_with_unique_not_null" column_name = ["pk", "replication_key_column"] insert_value = [(3, 49)] update_value = [(1, 65)] delete_value = [(0,)] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_pk_with_unique_not_null"]["values"] = \ [(2, 5)] + insert_value + update_value # update expected datafor VIEW_WITH_JOIN view self.EXPECTED_METADATA["constraints_database_dbo_view_with_join"]["values"] = \ [(None, None, 4), (2, 5, 5), (None, None, 6)] table_name = "default_column" column_name = ["pk", "replication_key_column"] insert_value = [(3, 49), (4, None), (5, )] update_value = [(1, 65)] query_list = (insert(database_name, schema_name, table_name, insert_value[:2])) query_list.extend(insert(database_name, schema_name, table_name, insert_value[-1:], column_names=column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_default_column"]["values"] = [ (0, -1)] + [(3, 49), (5, -1)] + update_value table_name = "check_constraint" column_name = ["pk", "replication_key_column"] insert_value = [(3, 49)] update_value = [(1, 65)] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_check_constraint"]["values"] = \ [(0, 37)] + insert_value + update_value table_name = "even_identity" column_name = ["pk", "replication_key_column"] insert_value = [(3,)] update_value = [(2,)] delete_value = [(1,)] query_list = (insert(database_name, schema_name, table_name, insert_value, column_names=column_name[:1])) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [insert_value[0] + (6, )] update_value = [update_value[0] + (4, )] self.EXPECTED_METADATA["constraints_database_dbo_even_identity"]["values"] = \ insert_value + update_value sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()} self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in 
self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual(records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual(records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue(all( [message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1]] )) self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]), len(stream_expected_data[self.VALUES]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] replication_column = column_names.index("replication_key_column") expected_messages = [ { "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) if column not in non_selected_properties } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[replication_column] is not None, row[replication_column])) ] # remove sequences from actual values for comparison [message.pop("sequence") for message in records_by_stream[stream]['messages'][1:-1]] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row["data"].items(): self.assertEqual(expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key self.assertEqual(bookmark['replication_key_value'], max([row[replication_column] for row in stream_expected_data[self.VALUES] if row[replication_column] is not None])) # self.assertEqual(bookmark['replication_key'], 'replication_key_column') self.assertEqual(bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual(bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format(expected_schemas, records_by_stream[stream]['schema']))
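# ----------------------------------------------------------------------
# Standalone sketch (toy rows shaped like the pk/replication_key_column
# tables above) of the ordering the INCREMENTAL assertions rely on: expected
# rows are sorted so that rows with a NULL replication key come first (False
# sorts before True), the remaining rows ascend by key, and the bookmark is
# asserted against the max non-NULL key.
rows = [(6, None), (5, 2), (3, 1)]          # (pk, replication_key_column)
replication_column = 1
ordered = sorted(rows, key=lambda row: (row[replication_column] is not None,
                                        row[replication_column]))
assert ordered == [(6, None), (3, 1), (5, 2)]
bookmark_value = max(row[replication_column] for row in rows
                     if row[replication_column] is not None)
assert bookmark_value == 2
# ----------------------------------------------------------------------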
def test_run(self): """stream_expected_data[self.VALUES] Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) # TODO - change the replication key back to replication_key_column when rowversion is supported additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'INCREMENTAL', 'replication-key': 'temp_replication_key_column' } }] non_selected_properties = [ "nvarchar_text", "varchar_text", "varbinary_data", "geospacial", "geospacial_map", "markup", "tree", "variant", "SpecialPurposeColumns", "started_at", "ended_at" ] BaseTapTest.select_all_streams_and_fields( conn_id, found_catalogs, non_selected_properties=non_selected_properties, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue( all([ m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1] ]), msg="Expect all but the first message to be upserts") self.assertEqual(len( records_by_stream[stream]['messages'][1:-1]), len(stream_expected_data[self.VALUES]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [ { "action": "upsert", "data": { column: value for column, value in list( zip(column_names, row_values)) if column not in non_selected_properties } # TODO - change to -1 for using rowversion for replication key } for row_values in sorted( stream_expected_data[self.VALUES], key=lambda row: (row[1] is not None, row[1])) ] # Verify all data is correct for incremental for expected_row, actual_row in zip( expected_messages, records_by_stream[stream]['messages'][1:-1]): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), 
len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key TODO - change to -1 for using rowversion for replication key self.assertEqual( bookmark['replication_key_value'], re.sub( r'\d{3}Z', "Z", max([ row[1] for row in stream_expected_data[self.VALUES] ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ"))) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "text_and_image_deprecated_soon" column_name = [ "pk", "temp_replication_key_column", "nvarchar_text", "varchar_text", "varbinary_data", "replication_key_column" ] insert_value = [(3, datetime(2018, 12, 31, 23, 59, 59, 993000, tzinfo=timezone.utc), "JKL", "MNO", "PQR".encode('utf-8'))] update_value = [(0, datetime(2018, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), "JKL", "MNO", "PQR".encode('utf-8'))] query_list = (insert(database_name, schema_name, table_name, insert_value, column_names=column_name[:-1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) values = insert_value + [( 1, datetime(2018, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc), "abc", "def", "ghi".encode('utf-8'))] + update_value rows = mssql_cursor_context_manager(*[ "select replication_key_column from data_types_database.dbo.text_and_image_deprecated_soon " "where pk in (0, 1,3) order by pk desc" ]) rows = [tuple(row) for row in rows] rows = [("0x{}".format(value.hex().upper()), ) for value, in rows] row_with_version = [x[0] + x[1] for x in zip(values, rows)] self.EXPECTED_METADATA[ 'data_types_database_dbo_text_and_image_deprecated_soon'][ 'values'] = row_with_version database_name = "data_types_database" schema_name = "dbo" table_name = "weirdos" column_name = [ "pk", "temp_replication_key_column", "geospacial", "geospacial_map", "markup", "guid", "tree", 
"variant", "SpecialPurposeColumns", "replication_key_column" ] insert_value = [(3, datetime(9999, 12, 31, 23, 59, 59, 993000, tzinfo=timezone.utc), None, None, None, str(uuid.uuid1()).upper(), None, None, None)] update_value = [(1, datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), None, None, None, str(uuid.uuid1()).upper(), None, None, None)] delete_value = [(0, )] query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1])) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) values = insert_value + [ (2, datetime(9999, 12, 31, 23, 59, 59, 990000, tzinfo=timezone.utc), None, None, None, "B792681C-AEF4-11E9-8002-0800276BC1DF", None, None, None) ] + update_value rows = mssql_cursor_context_manager(*[ "select replication_key_column from data_types_database.dbo.weirdos " "where pk in (1, 2, 3) order by pk desc" ]) rows = [tuple(row) for row in rows] rows = [("0x{}".format(value.hex().upper()), ) for value, in rows] row_with_version = [x[0] + x[1] for x in zip(values, rows)] self.EXPECTED_METADATA['data_types_database_dbo_weirdos'][ 'values'] = row_with_version database_name = "data_types_database" schema_name = "dbo" table_name = "computed_columns" column_name = [ "pk", "temp_replication_key_column", "started_at", "ended_at", "replication_key_column" ] insert_value = [(2, datetime(9998, 12, 31, 23, 59, 59, 990000, tzinfo=timezone.utc), datetime(1980, 5, 30, 16), datetime.now())] update_value = [(0, datetime(9998, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(1942, 11, 30), datetime(2017, 2, 12))] query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) values = insert_value + [( 1, datetime(9998, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc), datetime(1970, 1, 1, 0), datetime.now())] + update_value rows = mssql_cursor_context_manager(*[ "select replication_key_column from data_types_database.dbo.computed_columns " "where pk in (0, 1, 2) order by pk desc" ]) rows = [tuple(row) for row in rows] row_with_duration = [x[0] + x[1] for x in zip(values, rows)] self.EXPECTED_METADATA['data_types_database_dbo_computed_columns'][ 'values'] = row_with_duration sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') 
self.assertTrue( all([ message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1] ])) self.assertEqual(len( records_by_stream[stream]['messages'][1:-1]), len(stream_expected_data[self.VALUES]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) if column not in non_selected_properties } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[1] is not None, row[1]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:-1] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print( "records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key self.assertEqual( bookmark['replication_key_value'], re.sub( r'\d{3}Z', "Z", max([ row[1] for row in stream_expected_data[self.VALUES] ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ"))) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual( bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema']))
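# ----------------------------------------------------------------------
# Standalone sketch of the bookmark formatting asserted above (the key value
# is one of the datetimes used in this test's rows): the max replication-key
# datetime is rendered with microsecond precision and the trailing three
# digits are stripped, leaving the millisecond-precision string the bookmark
# is expected to hold.
import re
from datetime import datetime, timezone

max_key = datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc)
bookmark_value = re.sub(r'\d{3}Z', "Z",
                        max_key.strftime("%Y-%m-%dT%H:%M:%S.%fZ"))
assert bookmark_value == "9999-12-31T23:59:59.997Z"
# ----------------------------------------------------------------------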
def test_run(self): """stream_expected_data[self.VALUES] Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'INCREMENTAL', 'replication-key': 'replication_key_column' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() non_selected_properties = [] table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue( all([ m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1] ]), msg="Expect all but the first message to be upserts") self.assertEqual(len( records_by_stream[stream]['messages'][1:-1]), len(stream_expected_data[self.VALUES]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) if column not in non_selected_properties } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[1] is not None, row[1]))] # Verify all data is correct for incremental for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name], msg="expected: {} 
!= actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, time): # sql server time has second resolution only self.assertEqual( expected_value.replace( microsecond=0).isoformat().replace( '+00:00', ''), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, date): # sql server time has second resolution only self.assertEqual( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value, actual_row["data"][column_name])) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key expected_bookmark = max([ row[1] for row in stream_expected_data[self.VALUES] if row[1] is not None ]) self.assertEqual(bookmark['replication_key_value'], expected_bookmark.isoformat()) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "dates_and_times" column_name = [ "pk", "replication_key_column", "date_and_time", "bigger_range_and_precision_datetime", "datetime_with_timezones", "datetime_no_seconds", "its_time" ] insert_value = [ (5, date(9999, 12, 30), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime(9999, 12, 31, 10, 14, tzinfo=timezone(timedelta(hours=14))).isoformat(), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)), (6, date(2018, 12, 29), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime(9999, 12, 31, 10, 14, tzinfo=timezone(timedelta(hours=14))).isoformat(), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)) ] update_value = [ (3, date(9999, 12, 31), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime(9999, 12, 31, 10, 14, tzinfo=timezone(timedelta(hours=10))).isoformat(), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)), (4, date(2018, 12, 
30), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime(9999, 12, 31, 10, 14, tzinfo=timezone(timedelta(hours=6))).isoformat(), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)) ] delete_value = [(2, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [ (5, date(9999, 12, 30), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime(9999, 12, 31, 10, 14, tzinfo=timezone(timedelta(hours=14))).astimezone( timezone.utc), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)), (6, date(2018, 12, 29), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime(9999, 12, 31, 10, 14, tzinfo=timezone(timedelta(hours=14))).astimezone( timezone.utc), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)) ] update_value = [ (3, date(9999, 12, 31), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime(9999, 12, 31, 10, 14, tzinfo=timezone(timedelta(hours=10))).astimezone( timezone.utc), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)), (4, date(2018, 12, 30), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime(9999, 12, 31, 10, 14, tzinfo=timezone(timedelta(hours=6))).astimezone( timezone.utc), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)) ] insert_value = insert_value[:-1] # only repl_key >= gets included update_value = update_value[:-1] self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][ "values"] = [( 1, date(9999, 12, 29), datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc), datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc), datetime( 9999, 12, 31, 10, 14, tzinfo=timezone( timedelta(hours=14))).astimezone(timezone.utc), datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc), time(23, 59, 59, tzinfo=timezone.utc)) ] + update_value + insert_value sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( 
records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue( all([ message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1] ])) self.assertEqual(len( records_by_stream[stream]['messages'][1:-1]), len(stream_expected_data[self.VALUES]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) if column not in non_selected_properties } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[1] is not None, row[1]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:-1] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, time): # sql server time has second resolution only self.assertEqual( expected_value.replace( microsecond=0).isoformat().replace( '+00:00', ''), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, date): # sql server time has second resolution only self.assertEqual( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value, actual_row["data"][column_name])) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key expected_bookmark = max([ row[1] for row in stream_expected_data[self.VALUES] if row[1] is not None ]) self.assertEqual(bookmark['replication_key_value'], expected_bookmark.isoformat()) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual( bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") state = menagerie.get_state(conn_id) bookmark = 
state['bookmarks'][stream]
                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
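# --- Illustrative sketch, not part of the tap or this test suite; the helper name is
# hypothetical. --- The date/time assertions above repeat the same normalization three
# times: SQL Server datetimes keep millisecond (not microsecond) precision and are
# emitted with a trailing 'Z', times are truncated to whole seconds with the offset
# dropped, and plain dates are expanded to midnight with an explicit offset. This
# helper simply centralizes that logic.
from datetime import date, datetime, time, timezone


def normalize_expected(value):
    """Render a Python date/time/datetime the way the target output represents it."""
    if isinstance(value, datetime):
        # milliseconds only, '+00:00' collapsed to 'Z'
        return value.isoformat().replace('000+00:00', 'Z').replace('+00:00', 'Z')
    if isinstance(value, time):
        # second resolution, offset dropped
        return value.replace(microsecond=0).isoformat().replace('+00:00', '')
    if isinstance(value, date):
        # dates become midnight timestamps with an explicit offset
        return value.isoformat() + 'T00:00:00+00:00'
    return value


# example: microsecond padding collapses to a millisecond 'Z' string
assert normalize_expected(
    datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc)
) == '9999-12-31T23:59:59.997Z'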
def test_run(self): """stream_expected_data[self.VALUES] Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'INCREMENTAL', 'replication-key': 'replication_key_column' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue( all([ m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1] ]), msg="Expect all but the first message to be upserts") self.assertEqual( len(stream_expected_data[self.VALUES]), len(records_by_stream[stream]['messages'][1:-1]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[-1] is not None, row[-1]))] # Verify all data is correct for incremental for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, Decimal): self.assertEqual( type(actual_row["data"][column_name]), Decimal, msg= "decimal value is not represented as a number" ) self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) else: 
self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key expected_bookmark = max([ row[-1] for row in stream_expected_data[self.VALUES] if row[-1] is not None ]) # currently decimal replication keys aren't supported in the front end. If they are at a later point # this should be a decimal comparison. https://stitchdata.atlassian.net/browse/SRCE-1331 self.assertEqual(bookmark['replication_key_value'], float(expected_bookmark)) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual( records_by_stream[stream]['schema'], simplejson.loads(simplejson.dumps(expected_schemas), use_decimal=True), msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "numeric_precisions" precision_scale = NUMERIC_PRECISION_SCALE column_type = [ "numeric({},{})".format(precision, scale) for precision, scale in precision_scale ] column_name = ["pk"] + [ x.replace("(", "_").replace(",", "_").replace(")", "") for x in column_type[:-1] ] + ["replication_key_column"] insert_value = [ (8, Decimal(100), Decimal(100), Decimal(100), Decimal(100)), (7, Decimal('99999.9995'), Decimal('9999999.999999999999'), Decimal('999999.9999999999999999999999'), Decimal('99999999999999999999999999999999999.995')) ] update_value = [ (5, Decimal(100), Decimal(100), Decimal(100), Decimal(100)), (6, Decimal('99999.9999'), Decimal('9999999.999999999999'), Decimal('999999.9999999999999999999999'), Decimal('99999999999999999999999999999999999.999')) ] delete_value = [(4, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = insert_value[-1:] # only repl_key >= gets included update_value = update_value[-1:] self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"][ "values"] = [ (3, Decimal('99999.9993'), Decimal('9999999.999999999999'), Decimal('999999.9999999999999999999999'), Decimal('99999999999999999999999999999999999.993')), ] + update_value + insert_value database_name = "data_types_database" schema_name = "dbo" table_name = "decimal_precisions" precision_scale = DECIMAL_PRECISION_SCALE column_type = [ "decimal({},{})".format(precision, scale) for precision, scale in precision_scale ] column_name = ["pk"] + [ x.replace("(", "_").replace(",", 
"_").replace(")", "") for x in column_type[:-1] ] + ["replication_key_column"] insert_value = [ (8, Decimal(100), Decimal(100), Decimal(100), Decimal(100)), (7, Decimal('99999.9995'), Decimal('9999999999999.999999'), Decimal('9999999999999999999999.999999'), Decimal('9999999999999999999999999.9999999999995')) ] update_value = [ (5, Decimal(100), Decimal(100), Decimal(100), Decimal(100)), (6, Decimal('99999.9999'), Decimal('9999999999999.999999'), Decimal('9999999999999999999999.999999'), Decimal('9999999999999999999999999.9999999999999')) ] delete_value = [(4, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = insert_value[-1:] # only repl_key >= gets included update_value = update_value[-1:] self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"][ "values"] = [ (3, Decimal('99999.9993'), Decimal('9999999999999.999999'), Decimal('9999999999999999999999.999999'), Decimal('9999999999999999999999999.9999999999993')) ] + update_value + insert_value sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue( all([ message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1] ])) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[-1] is not None, row[-1]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:-1] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, Decimal): self.assertEqual( type(actual_row["data"][column_name]), Decimal, msg= "decimal value is not represented as a number" ) self.assertEqual( 
expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key expected_bookmark = max([ row[-1] for row in stream_expected_data[self.VALUES] if row[-1] is not None ]) # currently decimal replication keys aren't supported in the front end. If they are at a later point # this should be a decimal comparison. https://stitchdata.atlassian.net/browse/SRCE-1331 self.assertEqual(bookmark['replication_key_value'], float(expected_bookmark)) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual( bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual( records_by_stream[stream]['schema'], simplejson.loads(simplejson.dumps(expected_schemas), use_decimal=True), msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema']))
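# --- Illustrative sketch, not part of this test suite; the row data below is made up
# for the example. --- Two conventions the incremental assertions above rely on:
# expected rows are ordered by the replication key with NULL keys first (the
# (is not None, value) sort key avoids ever comparing None to a Decimal), and because
# decimal replication keys are currently persisted as floats (see the SRCE-1331 note
# above), the bookmark is compared after a float() cast.
from decimal import Decimal

rows = [
    (3, Decimal('99999.9993')),
    (5, None),                    # NULL replication key sorts first
    (6, Decimal('99999.9999')),
]

ordered = sorted(rows, key=lambda row: (row[-1] is not None, row[-1]))
assert [pk for pk, _ in ordered] == [5, 3, 6]

# the bookmark written to state is a float, so cast the expected Decimal before comparing
expected_bookmark = max(row[-1] for row in rows if row[-1] is not None)
assert float(expected_bookmark) == 99999.9999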
def test_run(self): """ Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } # self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part if records_by_stream[stream]['messages'][-1].get("data"): last_row_data = True else: last_row_data = False self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-2]['action'], 'activate_version') if last_row_data: self.assertEqual( records_by_stream[stream]['messages'][-3]['action'], 'activate_version') else: self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertEqual( len([ m for m in records_by_stream[stream]['messages'][1:] if m["action"] == "activate_version" ]), 2, msg= "Expect 2 more activate version messages for end of full table and beginning of log based" ) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # Verify all data is correct for the full table part if last_row_data: final_row = -3 else: final_row = -2 for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream] ['messages'][1:final_row])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, Decimal): self.assertEqual( type(actual_row["data"][column_name]), Decimal, 
msg= "decimal value is not represented as a number" ) self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) # Verify all data is correct for the log replication part if sent if records_by_stream[stream]['messages'][-1].get("data"): for column_name, expected_value in expected_messages[-1][ "data"].items(): self.assertEqual( expected_value, records_by_stream[stream]['messages'][-1]["data"] [column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") inital_log_version = bookmark['current_log_version'] self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual( records_by_stream[stream]['schema'], simplejson.loads(simplejson.dumps(expected_schemas), use_decimal=True), msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "decimal_precisions" precision_scale = DECIMAL_PRECISION_SCALE column_type = [ "decimal({},{})".format(precision, scale) for precision, scale in precision_scale ] column_name = ["pk"] + [ x.replace("(", "_").replace(",", "_").replace(")", "") for x in column_type ] insert_value = [ (7, Decimal('-92473.8401'), Decimal('-4182159664734.645653'), Decimal('6101329656084900380190.268036'), Decimal('4778017533841887320066645.9761464001349')), ] update_value = [ (3, Decimal('-92473.8401'), Decimal('-4182159664734.645653'), Decimal('6101329656084900380190.268036'), Decimal('4778017533841887320066645.9761464001349')), ] delete_value = [(4, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [insert_value[0] + (None, )] update_value = [update_value[0] + (None, )] delete_value = [ delete_value[0] + (None, None, None, None, datetime.utcnow()) ] self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"]["values"] = \ [self.expected_metadata()["data_types_database_dbo_decimal_precisions"]["values"][-1]] + \ insert_value + delete_value + update_value self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"][ "fields"].append({ "_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic' } }) database_name = "data_types_database" schema_name = "dbo" 
table_name = "numeric_precisions" precision_scale = NUMERIC_PRECISION_SCALE column_type = [ "numeric({},{})".format(precision, scale) for precision, scale in precision_scale ] column_name = ["pk"] + [ x.replace("(", "_").replace(",", "_").replace(")", "") for x in column_type ] insert_value = [ (7, Decimal('96701.9382'), Decimal('-4371716.186100650268'), Decimal('-367352.306093776232045517794'), Decimal('-81147872128956247517327931319278572.985')), ] update_value = [ (3, Decimal('96701.9382'), Decimal('-4371716.186100650268'), Decimal('-367352.306093776232045517794'), Decimal('-81147872128956247517327931319278572.985')), ] delete_value = [(4, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [insert_value[0] + (None, )] update_value = [update_value[0] + (None, )] delete_value = [ delete_value[0] + (None, None, None, None, datetime.utcnow()) ] self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"]["values"] = \ insert_value + delete_value + update_value self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"][ "fields"].append({ "_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic' } }) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertTrue( all([ message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:] ])) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertGreaterEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if column_name != "_sdc_deleted_at": if isinstance(expected_value, Decimal): self.assertEqual( 
type(actual_row["data"][column_name]), Decimal, msg= "decimal value is not represented as a number" ) self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) elif expected_value: # we have an expected value for a deleted row try: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%S.%fZ") except ValueError: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%SZ") self.assertGreaterEqual( actual_value, expected_value - timedelta(seconds=15)) self.assertLessEqual( actual_value, expected_value + timedelta(seconds=15)) else: # the row wasn't deleted so we can either not pass the column or it can be None self.assertIsNone( actual_row["data"].get(column_name)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") new_log_version = bookmark['current_log_version'] self.assertGreater(new_log_version, inital_log_version, msg='expected log version to increase') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual( bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual( records_by_stream[stream]['schema'], simplejson.loads(simplejson.dumps(expected_schemas), use_decimal=True), msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema']))
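# --- Illustrative sketch, not part of this test suite; the helper name is
# hypothetical. --- It spells out how the log-based assertions above treat
# _sdc_deleted_at: the target may emit the timestamp with or without fractional
# seconds, and the deletion time recorded by the test (datetime.utcnow() when the
# DELETE was issued) is only approximate, so the value is parsed with a fallback
# format and accepted inside a +/- 15 second window.
from datetime import datetime, timedelta


def assert_deleted_at_close(actual_string, expected_utc, window_seconds=15):
    try:
        actual = datetime.strptime(actual_string, "%Y-%m-%dT%H:%M:%S.%fZ")
    except ValueError:
        actual = datetime.strptime(actual_string, "%Y-%m-%dT%H:%M:%SZ")
    assert expected_utc - timedelta(seconds=window_seconds) <= actual
    assert actual <= expected_utc + timedelta(seconds=window_seconds)


# example: a record deleted "now" lands inside the window
assert_deleted_at_close(datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                        datetime.utcnow())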
def test_run(self): """stream_expected_data[self.VALUES] Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'INCREMENTAL', 'replication-key': 'replication_key_column' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertTrue( all([ m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1] ]), msg="Expect all but the first message to be upserts") self.assertEqual( len(stream_expected_data[self.VALUES]), len(records_by_stream[stream]['messages'][1:-1]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[1] is not None, row[1]))] # Verify all data is correct for incremental for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): column_index = [ list(key.keys())[0] for key in self.expected_metadata()[stream][self.FIELDS] ].index(column_name) if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \ in ("real", "float") \ and actual_row["data"][column_name] is not None: self.assertEqual( type(actual_row["data"][column_name]), Decimal, msg= "float value is not represented as a number" ) 
self.assertEqual( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))), msg= "single value of {} doesn't match actual {}" .format( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))))) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key self.assertEqual( bookmark['replication_key_value'], max([ row[1] for row in stream_expected_data[self.VALUES] if row[1] is not None ])) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "float_precisions" column_name = [ "pk", "replication_key_column", "float_53", "real_24_bits" ] insert_value = [(15, 100, 100, 100), (14, 3.4028235e+38, 1.7976931348623157e+308, 3.4028235e+38)] update_value = [(4, 101, 101, 101), (6, 3.4028233e+38, 1.7976931348623157e+308, 3.4028235e+38)] delete_value = [(5, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = insert_value[-1:] # only repl_key >= gets included update_value = update_value[-1:] self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"]["values"] = \ [(1, 3.4028230e+38, 1.7976931348623157e+308, 3.4028235e+38)] + update_value + insert_value sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( 
records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue( all([ message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1] ])) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[1] is not None, row[1]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:-1] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): column_index = [ list(key.keys())[0] for key in self.expected_metadata()[stream][self.FIELDS] ].index(column_name) if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \ in ("real", "float") \ and actual_row["data"][column_name] is not None: self.assertEqual( type(actual_row["data"][column_name]), Decimal, msg= "float value is not represented as a number" ) self.assertEqual( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))), msg= "single value of {} doesn't match actual {}" .format( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))))) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key self.assertEqual( bookmark['replication_key_value'], max([ row[1] for row in stream_expected_data[self.VALUES] if row[1] is not None ])) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual( bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema']))
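# --- Illustrative sketch, not part of this test suite. It assumes float32 is
# numpy.float32, which the float assertions above appear to use. --- A `real` column
# only carries 24 bits of mantissa, so both the expected Python float and the value
# the target emits are pushed through a float32 round trip before comparison; after
# that round trip the two sides compare equal.
from numpy import float32


def singles_match(expected, actual):
    return float(str(float32(expected))) == float(str(float32(actual)))


# 3.4028235e+38 is the largest finite real; the 64-bit float the target would emit
# for it differs from the literal, but the two match after the round trip.
assert singles_match(3.4028235e+38, float(float32(3.4028235e+38)))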