def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(
            conn_id, c['stream_id'])
        for k in self.expected_automatic_fields()[c['stream_name']]:
            mdata = next(
                (m for m in catalog_entry['metadata']
                 if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                None)
            print("Validating inclusion on {}: {}".format(
                c['stream_name'], mdata))
            self.assertTrue(
                mdata and mdata['metadata']['inclusion'] == 'automatic')
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_record_count_by_stream.values())
    self.assertGreater(replicated_row_count,
                       0,
                       msg="failed to replicate any data: {}".format(
                           first_record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Verify that automatic fields are all emitted with records
    synced_records = runner.get_records_from_target_output()
    for stream_name, data in synced_records.items():
        record_messages = [
            set(row['data'].keys()) for row in data['messages']
        ]
        for record_keys in record_messages:
            self.assertEqual(
                self.expected_automatic_fields().get(stream_name, set()) -
                record_keys,
                set())

    # Verify bookmarks were saved for all streams
    # TODO: No bookmarks for now, though `actions` at least will be
    state = menagerie.get_state(conn_id)

    # TODO: make more generic check for assertions in state
    self.assertTrue(
        state.get('bookmarks', {}).get('actions', {}).get('window_start'))
    # for stream in self.expected_sync_streams():
    #     self.assertTrue(stream_states.get(stream))
    print("Bookmarks are accurate, running second sync...")

    # Run another sync
    second_sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, second_sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, second_sync_job_name)

    second_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    for stream in self.expected_full_table_sync_streams():
        record_count = second_record_count_by_stream.get(stream, 0)

        # Assert we have enough data
        self.assertGreater(record_count, 0)

        # Assert that our bookmark works as expected
        self.assertEqual(record_count, first_record_count_by_stream[stream])
        # self.assertEqual(record_count, 1)

    for stream in self.expected_incremental_sync_streams():
        record_count = second_record_count_by_stream.get(stream, 0)

        # We aren't generating data between the two syncs, and the
        # bookmark should be a little behind 'now', so the second sync
        # should return no data
        self.assertEqual(record_count, 0)

    print("Second sync record count is OK.")
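# --- Editor's sketch (not part of the original suite) -------------------------------
# The metadata-inclusion check above (look up each expected automatic field's
# breadcrumb and assert inclusion == 'automatic') repeats in several of the tests
# below. A minimal helper it could be factored into, assuming the same menagerie
# calls and a unittest-style `self`; the name `_assert_automatic_fields` is
# hypothetical.
def _assert_automatic_fields(self, conn_id, catalog, expected_fields):
    """Return the annotated schema after asserting each expected field is automatic."""
    catalog_entry = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
    for field in expected_fields:
        mdata = next(
            (m for m in catalog_entry['metadata']
             if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == field),
            None)
        self.assertIsNotNone(
            mdata, msg="no metadata found for field {}".format(field))
        self.assertEqual('automatic', mdata['metadata']['inclusion'])
    return catalog_entry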
def test_run(self): print("\n\nRUNNING {}\n\n".format(self.name())) # Initialize start_date state to make assertions self.START_DATE = self.get_properties().get('start_date') start_date_1 = self.START_DATE # default start_date_2 = self.timedelta_formatted(self.START_DATE, 2) # default + 2 days # get expected records expected_records_1 = {x: [] for x in self.expected_streams() } # ids by stream for stream in self.testable_streams(): existing_objects = self.client.get_all(stream) assert existing_objects, "Test data is not properly set for {}, test will fail.".format( stream) print("Data exists for stream: {}".format(stream)) for obj in existing_objects: expected_records_1[stream].append(obj) # If no objects exist since the 2nd start_date, create one data_in_range = False for obj in expected_records_1.get(stream): created = obj.get('created_date').replace( ' ', 'T', 1) + 'Z' # 2016-06-02 19:57:10 -->> 2016-06-02T19:57:10Z assert created, "'created_date' is not an attribute of {}".format( obj.get('name')) if self.parse_date(created) > self.parse_date(start_date_2): data_in_range = True break if not data_in_range: if stream in self.testable_streams(): expected_records_1[stream].append( self.client.create(stream)) continue assert None, "Sufficient test data does not exist for {}, test will fail.".format( stream) ########################################################################## ### First Sync ########################################################################## conn_id = connections.ensure_connection(self) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are OK") # Select all available streams and their fields self.select_all_streams_and_fields(conn_id=conn_id, catalogs=found_catalogs) catalogs = menagerie.get_catalogs(conn_id) #clear state menagerie.set_state(conn_id, {}) # Run sync 1 sync_job_1 = runner.run_sync_mode(self, conn_id) # Verify tap exit codes exit_status_1 = menagerie.get_exit_status(conn_id, sync_job_1) menagerie.verify_sync_exit_status(self, exit_status_1, sync_job_1) # read target output record_count_by_stream_1 = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys()) replicated_row_count_1 = reduce(lambda accum, c: accum + c, record_count_by_stream_1.values()) self.assertGreater(replicated_row_count_1, 0, msg="failed to replicate any data: {}".format( record_count_by_stream_1)) print("total replicated row count: {}".format(replicated_row_count_1)) synced_records_1 = runner.get_records_from_target_output() state_1 = menagerie.get_state(conn_id) ########################################################################## ### Update START DATE Between Syncs ########################################################################## self.START_DATE = start_date_2 print("REPLICATION START DATE CHANGE: {} ===>>> {} ".format( start_date_1, start_date_2)) self.END_DATE = self.get_properties()['end_date'] 
########################################################################## ### Second Sync ########################################################################## # create a new connection with the new start_date conn_id = connections.ensure_connection(self, original_properties=False) #run in check mode check_job_name = runner.run_check_mode(self, conn_id) #verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are kosher") # Select all available streams and their fields self.select_all_streams_and_fields(conn_id=conn_id, catalogs=found_catalogs) catalogs = menagerie.get_catalogs(conn_id) # clear state menagerie.set_state(conn_id, {}) # Run sync 2 sync_job_2 = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status_2 = menagerie.get_exit_status(conn_id, sync_job_2) menagerie.verify_sync_exit_status(self, exit_status_2, sync_job_2) # This should be validating the the PKs are written in each record record_count_by_stream_2 = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys()) replicated_row_count_2 = reduce(lambda accum, c: accum + c, record_count_by_stream_2.values(), 0) self.assertGreater(replicated_row_count_2, 0, msg="failed to replicate any data") print("total replicated row count: {}".format(replicated_row_count_2)) synced_records_2 = runner.get_records_from_target_output() state_2 = menagerie.get_state(conn_id) for stream in self.testable_streams(): with self.subTest(stream=stream): replication_type = self.expected_replication_method().get( stream) record_count_1 = record_count_by_stream_1.get(stream, 0) record_count_2 = record_count_by_stream_2.get(stream, 0) # Testing how FULL TABLE streams handle start date if replication_type == self.FULL: # Verify that a bookmark doesn't exist for the stream. self.assertTrue( state_1.get(stream) is None, msg="There should not be bookmark value for {}\n{}". format(stream, state_1.get(stream))) self.assertTrue( state_2.get(stream) is None, msg="There should not be bookmark value for {}\n{}". format(stream, state_1.get(stream))) # Verify that the 2nd sync includes the same number of records as the 1st sync. # -> Currently full table does not obey start_date, which makes this assertion valid self.assertEqual( record_count_2, record_count_1, msg="\nStream '{}' is {}\n".format(stream, self.FULL) + "Record counts should be equal, but are not\n" + "Sync 1 start_date: {} ".format(start_date_1) + "Sync 1 record_count: {}\n".format(record_count_1) + "Sync 2 start_date: {} ".format(start_date_2) + "Sync 2 record_count: {}".format(record_count_2)) # Verify all records in the 1st sync are included in the 2nd sync since # 2nd sync has a later start date. 
                    records_from_sync_1 = set(
                        row.get('data').get('eid')
                        for row in synced_records_1.get(stream, []).get(
                            'messages', []))
                    records_from_sync_2 = set(
                        row.get('data').get('eid')
                        for row in synced_records_2.get(stream, []).get(
                            'messages', []))
                    self.assertEqual(
                        set(),
                        records_from_sync_1.difference(records_from_sync_2),
                        msg="Sync 1 record(s) missing from Sync 2:\n{}".format(
                            records_from_sync_1.difference(
                                records_from_sync_2)))

                else:
                    raise Exception(
                        "Expectations are set incorrectly. {} cannot have a "
                        "replication method of {}".format(
                            stream, replication_type))
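# --- Editor's sketch (not part of the original suite) -------------------------------
# The start-date test above leans on two base-class helpers, parse_date() and
# timedelta_formatted(). A minimal, self-contained version of what they are assumed
# to do, using an assumed '%Y-%m-%dT%H:%M:%SZ' start_date format; the real
# implementations may differ.
from datetime import datetime, timedelta

ASSUMED_START_DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

def parse_date(date_value, date_format=ASSUMED_START_DATE_FORMAT):
    """Turn a start_date-style string into a datetime for ordering comparisons."""
    return datetime.strptime(date_value, date_format)

def timedelta_formatted(date_value, days, date_format=ASSUMED_START_DATE_FORMAT):
    """Shift a formatted date by `days` and re-serialize it in the same format."""
    return (parse_date(date_value, date_format) +
            timedelta(days=days)).strftime(date_format)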
def test_run(self):
    (table_configs, conn_id, _) = self.pre_sync_test()

    # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
    found_catalogs = menagerie.get_catalogs(conn_id)
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # run full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()

    expected_pks = {}
    for config in table_configs:
        key = {config['HashKey']}
        if config.get('SortKey'):
            key |= {config.get('SortKey')}
        expected_pks[config['TableName']] = key

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, {x['TableName'] for x in table_configs},
        expected_pks)

    state = menagerie.get_state(conn_id)
    first_versions = {}

    # assert that we get the correct number of records for each stream
    for config in table_configs:
        table_name = config['TableName']

        self.assertEqual(config['num_rows'],
                         record_count_by_stream[table_name])

        # assert that an activate_version message is first and last message sent for each stream
        self.assertEqual(
            'activate_version',
            records_by_stream[table_name]['messages'][0]['action'])
        self.assertEqual(
            'activate_version',
            records_by_stream[table_name]['messages'][-1]['action'])

        # assert that the state has an initial_full_table_complete == True
        self.assertTrue(
            state['bookmarks'][table_name]['initial_full_table_complete'])
        # assert that there is a version bookmark in state
        first_versions[table_name] = state['bookmarks'][table_name][
            'version']
        self.assertIsNotNone(first_versions[table_name])
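# --- Editor's sketch (not part of the original suite) -------------------------------
# For reference, a hypothetical table_configs entry consistent with the keys the
# assertions above read (TableName, HashKey, optional SortKey, num_rows). The real
# configs come from pre_sync_test() and may carry additional fields; the key names
# and values below are illustrative only.
EXAMPLE_TABLE_CONFIG = {
    'TableName': 'simple_coll_1',  # assumed stream/table name
    'HashKey': 'int_id',           # partition key -> always part of the expected PK
    'SortKey': 'string_id',        # range key, folded into the PK only when present
    'num_rows': 50,                # rows seeded before the sync, compared to record counts
}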
def test_run(self): """ Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } # self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part if records_by_stream[stream]['messages'][-1].get("data"): last_row_data = True else: last_row_data = False self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-2]['action'], 'activate_version') if last_row_data: self.assertEqual( records_by_stream[stream]['messages'][-3]['action'], 'activate_version') else: self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertEqual( len([ m for m in records_by_stream[stream]['messages'][1:] if m["action"] == "activate_version" ]), 2, msg= "Expect 2 more activate version messages for end of full table and beginning of log based" ) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # Verify all data is correct for the full table part if last_row_data: final_row = -3 else: final_row = -2 for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream] ['messages'][1:final_row])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, Decimal): self.assertEqual( type(actual_row["data"][column_name]), Decimal, 
msg= "decimal value is not represented as a number" ) self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) # Verify all data is correct for the log replication part if sent if records_by_stream[stream]['messages'][-1].get("data"): for column_name, expected_value in expected_messages[-1][ "data"].items(): self.assertEqual( expected_value, records_by_stream[stream]['messages'][-1]["data"] [column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") inital_log_version = bookmark['current_log_version'] self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual( records_by_stream[stream]['schema'], simplejson.loads(simplejson.dumps(expected_schemas), use_decimal=True), msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "decimal_precisions" precision_scale = DECIMAL_PRECISION_SCALE column_type = [ "decimal({},{})".format(precision, scale) for precision, scale in precision_scale ] column_name = ["pk"] + [ x.replace("(", "_").replace(",", "_").replace(")", "") for x in column_type ] insert_value = [ (7, Decimal('-92473.8401'), Decimal('-4182159664734.645653'), Decimal('6101329656084900380190.268036'), Decimal('4778017533841887320066645.9761464001349')), ] update_value = [ (3, Decimal('-92473.8401'), Decimal('-4182159664734.645653'), Decimal('6101329656084900380190.268036'), Decimal('4778017533841887320066645.9761464001349')), ] delete_value = [(4, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [insert_value[0] + (None, )] update_value = [update_value[0] + (None, )] delete_value = [ delete_value[0] + (None, None, None, None, datetime.utcnow()) ] self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"]["values"] = \ [self.expected_metadata()["data_types_database_dbo_decimal_precisions"]["values"][-1]] + \ insert_value + delete_value + update_value self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"][ "fields"].append({ "_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic' } }) database_name = "data_types_database" schema_name = "dbo" 
table_name = "numeric_precisions" precision_scale = NUMERIC_PRECISION_SCALE column_type = [ "numeric({},{})".format(precision, scale) for precision, scale in precision_scale ] column_name = ["pk"] + [ x.replace("(", "_").replace(",", "_").replace(")", "") for x in column_type ] insert_value = [ (7, Decimal('96701.9382'), Decimal('-4371716.186100650268'), Decimal('-367352.306093776232045517794'), Decimal('-81147872128956247517327931319278572.985')), ] update_value = [ (3, Decimal('96701.9382'), Decimal('-4371716.186100650268'), Decimal('-367352.306093776232045517794'), Decimal('-81147872128956247517327931319278572.985')), ] delete_value = [(4, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [insert_value[0] + (None, )] update_value = [update_value[0] + (None, )] delete_value = [ delete_value[0] + (None, None, None, None, datetime.utcnow()) ] self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"]["values"] = \ insert_value + delete_value + update_value self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"][ "fields"].append({ "_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic' } }) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertTrue( all([ message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:] ])) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertGreaterEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if column_name != "_sdc_deleted_at": if isinstance(expected_value, Decimal): self.assertEqual( 
type(actual_row["data"][column_name]), Decimal, msg= "decimal value is not represented as a number" ) self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) elif expected_value: # we have an expected value for a deleted row try: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%S.%fZ") except ValueError: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%SZ") self.assertGreaterEqual( actual_value, expected_value - timedelta(seconds=15)) self.assertLessEqual( actual_value, expected_value + timedelta(seconds=15)) else: # the row wasn't deleted so we can either not pass the column or it can be None self.assertIsNone( actual_row["data"].get(column_name)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") new_log_version = bookmark['current_log_version'] self.assertGreater(new_log_version, inital_log_version, msg='expected log version to increase') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual( bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual( records_by_stream[stream]['schema'], simplejson.loads(simplejson.dumps(expected_schemas), use_decimal=True), msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema']))
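# --- Editor's sketch (not part of the original suite) -------------------------------
# The deleted-row check above parses _sdc_deleted_at with a strptime fallback because
# the emitted timestamp may or may not carry fractional seconds. The same logic as a
# small helper (hypothetical name), assuming `datetime` is imported as in the test.
def parse_sdc_deleted_at(value):
    """Parse an _sdc_deleted_at string, with or without fractional seconds."""
    for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError("unrecognized _sdc_deleted_at value: {!r}".format(value))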
def test_run(self): """stream_expected_data[self.VALUES] Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'INCREMENTAL', 'replication-key': 'replication_key_column' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertTrue( all([ m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1] ]), msg="Expect all but the first message to be upserts") self.assertEqual( len(stream_expected_data[self.VALUES]), len(records_by_stream[stream]['messages'][1:-1]), msg="incorrect number of upserts") column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[1] is not None, row[1]))] # Verify all data is correct for incremental for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): column_index = [ list(key.keys())[0] for key in self.expected_metadata()[stream][self.FIELDS] ].index(column_name) if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \ in ("real", "float") \ and actual_row["data"][column_name] is not None: self.assertEqual( type(actual_row["data"][column_name]), Decimal, msg= "float value is not represented as a number" ) 
self.assertEqual( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))), msg= "single value of {} doesn't match actual {}" .format( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))))) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key self.assertEqual( bookmark['replication_key_value'], max([ row[1] for row in stream_expected_data[self.VALUES] if row[1] is not None ])) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "float_precisions" column_name = [ "pk", "replication_key_column", "float_53", "real_24_bits" ] insert_value = [(15, 100, 100, 100), (14, 3.4028235e+38, 1.7976931348623157e+308, 3.4028235e+38)] update_value = [(4, 101, 101, 101), (6, 3.4028233e+38, 1.7976931348623157e+308, 3.4028235e+38)] delete_value = [(5, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = insert_value[-1:] # only repl_key >= gets included update_value = update_value[-1:] self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"]["values"] = \ [(1, 3.4028230e+38, 1.7976931348623157e+308, 3.4028235e+38)] + update_value + insert_value sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( 
records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertTrue( all([ message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1] ])) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list(zip(column_names, row_values)) } } for row_values in sorted(stream_expected_data[self.VALUES], key=lambda row: (row[1] is not None, row[1]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:-1] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:-1])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): column_index = [ list(key.keys())[0] for key in self.expected_metadata()[stream][self.FIELDS] ].index(column_name) if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \ in ("real", "float") \ and actual_row["data"][column_name] is not None: self.assertEqual( type(actual_row["data"][column_name]), Decimal, msg= "float value is not represented as a number" ) self.assertEqual( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))), msg= "single value of {} doesn't match actual {}" .format( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))))) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental") self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental") # find the max value of the replication key self.assertEqual( bookmark['replication_key_value'], max([ row[1] for row in stream_expected_data[self.VALUES] if row[1] is not None ])) # self.assertEqual(bookmark['replication_key'], 'replication_key_value') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual( bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema']))
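# --- Editor's sketch (not part of the original suite) -------------------------------
# Both syncs above compare 'real'/'float' columns by pushing the expected and the
# emitted values through float32 before comparing, since SQL Server 'real' columns
# only keep single precision. The round-trip as a tiny helper (hypothetical name),
# assuming the same `float32` import the test already uses.
def as_single_precision(value):
    """Collapse a value to its float32 representation for comparison."""
    return float(str(float32(value)))

# e.g. self.assertEqual(as_single_precision(expected_value),
#                       as_single_precision(actual_row["data"][column_name]))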
def test_run(self): """ Verify that for each stream you can get multiple pages of data when no fields are selected and only the automatic fields are replicated. PREREQUISITE For EACH stream add enough data that you surpass the limit of a single fetch of data. For instance if you have a limit of 250 records ensure that 251 (or more) records have been posted for that stream. """ print("\n\nRUNNING {}\n\n".format(self.name())) # Resetting tracked parent objects prior to test utils.reset_tracked_parent_objects() # ensure data exists for sync streams and set expectations expected_records = {x: [] for x in self.expected_sync_streams() } # ids by stream for stream in self.testable_streams(): since = None if stream in self.expected_incremental_streams(): since = dt.strptime(self.get_properties()['start_date'], self.START_DATE_FORMAT).strftime( self.TEST_TIME_FORMAT) _, existing_objects = utils.get_total_record_count_and_objects( stream, since=since) if existing_objects: logging.info("Data exists for stream: {}".format(stream)) for obj in existing_objects: expected_records[stream].append({ field: obj.get(field) for field in self.expected_automatic_fields().get( stream) }) continue logging.info("Data does not exist for stream: {}".format(stream)) new_object = utils.create_object(stream) logging.info("Data generated for stream: {}".format(stream)) expected_records[stream].append({ field: new_object.get(field) for field in self.expected_automatic_fields().get(stream) }) conn_id = connections.ensure_connection(self) #run in check mode check_job_name = runner.run_check_mode(self, conn_id) #verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are OK") # Select all streams but only automtic fields self.select_all_streams_and_fields(conn_id, found_catalogs, select_all_fields=False) for cat in found_catalogs: catalog_entry = menagerie.get_annotated_schema( conn_id, cat['stream_id']) for k in self.expected_automatic_fields()[cat['stream_name']]: mdata = next( (m for m in catalog_entry['metadata'] if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None) print("Validating inclusion on {}: {}".format( cat['stream_name'], mdata)) self.assertTrue( mdata and mdata['metadata']['inclusion'] == 'automatic') catalogs = menagerie.get_catalogs(conn_id) #clear state menagerie.set_state(conn_id, {}) # run sync sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # read target output first_record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, first_record_count_by_stream.values()) synced_records = runner.get_records_from_target_output() # Verify target has records for all synced streams for stream, count in first_record_count_by_stream.items(): assert stream in self.expected_sync_streams() self.assertGreater( 
            count,
            0,
            msg="failed to replicate any data for: {}".format(stream))
    print("total replicated row count: {}".format(replicated_row_count))

    for stream in self.testable_streams():
        with self.subTest(stream=stream):
            data = synced_records.get(stream)
            record_messages_keys = [
                set(row['data'].keys()) for row in data['messages']
            ]
            expected_keys = self.expected_automatic_fields().get(stream)

            # Verify that ONLY automatic fields are emitted
            for actual_keys in record_messages_keys:
                self.assertEqual(
                    actual_keys.symmetric_difference(expected_keys),
                    set(),
                    msg="Expected automatic fields and nothing else.")

            actual_records = [row['data'] for row in data['messages']]

            # Verify the number of records matches expectations
            self.assertEqual(len(expected_records.get(stream)),
                             len(actual_records),
                             msg="Number of actual records does not match expectations. " +
                             "We probably have duplicate records.")

            # verify by values, that we replicated the expected records
            for actual_record in actual_records:
                self.assertTrue(
                    actual_record in expected_records.get(stream),
                    msg="Actual record missing from expectations")
            for expected_record in expected_records.get(stream):
                self.assertTrue(expected_record in actual_records,
                                msg="Expected record missing from target.")

    # CLEAN UP
    stream_to_delete = 'boards'
    boards_remaining = 5
    print("Deleting all but {} records for stream {}.".format(
        boards_remaining, stream_to_delete))
    board_count = len(expected_records.get(stream_to_delete, []))
    for obj_to_delete in expected_records.get(stream_to_delete, []):
        # Delete boards between syncs until only `boards_remaining` remain
        if board_count > boards_remaining:
            utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
            board_count -= 1
        else:
            break

    # Reset the parent objects that we have been tracking
    utils.reset_tracked_parent_objects()
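# --- Editor's sketch (not part of the original suite) -------------------------------
# The docstring above requires each stream to hold more records than one API page so
# that pagination is actually exercised. A guard that could sit next to the per-stream
# count assertions; API_PAGE_LIMITS and its values are hypothetical.
API_PAGE_LIMITS = {'boards': 250}  # hypothetical per-stream page sizes

def assert_pagination_exercised(record_count_by_stream):
    for stream, count in record_count_by_stream.items():
        limit = API_PAGE_LIMITS.get(stream)
        if limit is not None:
            assert count > limit, (
                "{} replicated only {} records; need more than {} to cover "
                "a second page".format(stream, count, limit))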
def test_run(self): conn_id = connections.ensure_connection(self) #run in check mode check_job_name = runner.run_check_mode(self, conn_id) #verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are kosher") #select all catalogs for catalog in found_catalogs: connections.select_catalog_and_fields_via_metadata( conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id'])) future_time = "2050-01-01T00:00:00.000000Z" #clear state future_bookmarks = { "currently_syncing": None, "bookmarks": { "contacts": { "offset": {}, "versionTimestamp": future_time }, "subscription_changes": { "startTimestamp": future_time, "offset": {} }, "campaigns": { "offset": {} }, "forms": { "updatedAt": future_time }, "deals": { "offset": {}, "hs_lastmodifieddate": future_time }, "workflows": { "updatedAt": future_time }, "owners": { "updatedAt": future_time }, "contact_lists": { "updatedAt": future_time, "offset": {} }, "email_events": { "startTimestamp": future_time, "offset": {} }, "companies": { "offset": {}, "hs_lastmodifieddate": future_time }, "engagements": { "lastUpdated": future_time, "offset": {} } } } menagerie.set_state(conn_id, future_bookmarks) sync_job_name = runner.run_sync_mode(self, conn_id) #verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) #because the bookmarks were set into the future, we should NOT actually replicate any data. #minus campaigns, and deal_pipelines because those endpoints do NOT suppport bookmarks streams_with_bookmarks = self.expected_sync_streams() streams_with_bookmarks.remove('campaigns') streams_with_bookmarks.remove('deal_pipelines') bad_streams = streams_with_bookmarks.intersection( record_count_by_stream.keys()) self.assertEqual( len(bad_streams), 0, msg="still pulled down records from {} despite future bookmarks". format(bad_streams)) state = menagerie.get_state(conn_id) # NB: Companies and engagements won't set a bookmark in the future. state["bookmarks"].pop("companies") state["bookmarks"].pop("engagements") future_bookmarks["bookmarks"].pop("companies") future_bookmarks["bookmarks"].pop("engagements") self.assertEqual( state, future_bookmarks, msg= "state should not have been modified because we didn't replicate any data" ) bookmarks = state.get('bookmarks') bookmark_streams = set(state.get('bookmarks').keys())
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify discovery produced (at least) 1 expected catalog
    found_catalogs = [
        found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
        if found_catalog['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(len(found_catalogs), 1)

    # verify the tap discovered the expected streams
    found_catalog_names = {
        catalog['tap_stream_id']
        for catalog in found_catalogs
    }
    self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(test_table_name, test_catalog['stream_name'])
    print("discovered streams are correct")

    # perform table selection
    print('selecting {} and all fields within the table'.format(
        test_table_name))
    schema_and_metadata = menagerie.get_annotated_schema(
        conn_id, test_catalog['stream_id'])
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'FULL_TABLE'
        }
    }]
    _ = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog, schema_and_metadata, additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync job 1 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_1 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(5, len(messages))
    self.assertEqual('activate_version', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the persisted schema matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records match expectations
    self.assertDictEqual(self.expected_records[0], messages[1]['data'])
    self.assertDictEqual(self.expected_records[1], messages[2]['data'])
    self.assertDictEqual(self.expected_records[2], messages[3]['data'])
    print("records are correct")

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_1, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN and get the same 3 records
    #----------------------------------------------------------------------

    # run sync job 2 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_2 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(4, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('activate_version', messages[3]['action'])

    # verify the new table version increased on the second sync
    self.assertGreater(table_version_2, table_version_1)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[0], messages[0]['data'])
    self.assertDictEqual(self.expected_records[1], messages[1]['data'])
    self.assertDictEqual(self.expected_records[2], messages[2]['data'])

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_2, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN following various manipulations to the data
    #----------------------------------------------------------------------

    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

            # NB | We will perform the following actions prior to the next sync:
            #      [Action (EXPECTED RESULT)]

            #      Insert a record
            #      Insert a record to be updated prior to sync
            #      Insert a record to be deleted prior to sync (NOT REPLICATED)

            #      Update an existing record
            #      Update a newly inserted record

            #      Delete an existing record
            #      Delete a newly inserted record

            # inserting...
# a new record nyc_tz = pytz.timezone('America/New_York') our_time_offset = "-04:00" our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(6, 6, 6) our_time_tz = our_time.isoformat() + our_time_offset our_date = datetime.date(1970, 7, 1) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '1', 'our_json': json.dumps({'nymn': 77}), 'our_jsonb': json.dumps({'burgers': 'good++'}), 'our_uuid': my_uuid, 'our_citext': 'cyclops 2', 'our_store': 'dances=>"floor",name=>"betty"', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': '$0.98789' }) self.expected_records.append({ 'id': 4, 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'OUR DATE': '1970-07-01T00:00:00+00:00', 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': True, 'our_json': '{"nymn": 77}', 'our_jsonb': '{"burgers": "good++"}', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_citext': self.inserted_records[-1]['our_citext'], 'our_store': { "name": "betty", "dances": "floor" }, 'our_cidr': self.inserted_records[-1]['our_cidr'], 'our_inet': self.inserted_records[-1]['our_inet'], 'our_mac': self.inserted_records[-1]['our_mac'], 'our_money': '$0.99', 'our_alignment_enum': None, }) # a new record which we will then update prior to sync our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 5, 'our_json': 
self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, 'our_alignment_enum': None, }) # a new record to be deleted prior to sync our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 6, 'our_json': self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, 'our_alignment_enum': None, }) db_utils.insert_record(cur, test_table_name, self.inserted_records[3]) db_utils.insert_record(cur, test_table_name, self.inserted_records[4]) db_utils.insert_record(cur, test_table_name, self.inserted_records[5]) # updating ... 
# an existing record canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 1 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[0]["our_double"] = decimal.Decimal("6.6") self.expected_records[0]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # a newly inserted record canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 5 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[4]["our_double"] = decimal.Decimal("6.6") self.expected_records[4]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # deleting # an existing record record_pk = 2 db_utils.delete_record(cur, canon_table_name, record_pk) # a newly inserted record record_pk = 6 db_utils.delete_record(cur, canon_table_name, record_pk) #---------------------------------------------------------------------- # invoke the sync job AGAIN after various manipulations #---------------------------------------------------------------------- # run sync job 3 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) records_by_stream = runner.get_records_from_target_output() table_version_3 = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the expected number of records were replicated self.assertEqual(4, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(5, len(messages)) self.assertEqual('upsert', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) self.assertEqual('activate_version', messages[4]['action']) # verify the new table version increased on the third sync self.assertGreater(table_version_3, table_version_2) # verify the persisted schema still matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # NB | This is a little tough to track mentally so here's a breakdown of # the order of operations by expected records indexes: # Prior to Sync 1 # insert 0, 1, 2 # Prior to Sync 2 # No db changes # Prior to Sync 3 # insert 3, 4, 5 # update 0, 4 # delete 1, 5 # Resulting Synced Records: 2, 3, 0, 4 # verify replicated records still match expectations self.assertDictEqual(self.expected_records[2], messages[0]['data']) # existing insert self.assertDictEqual(self.expected_records[3], messages[1]['data']) # new insert self.assertDictEqual(self.expected_records[0], messages[2]['data']) # existing update self.assertDictEqual(self.expected_records[4], messages[3]['data']) # new insert / update # grab bookmarked
state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_full_table_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn')) self.assertIsNone(bookmark.get('replication_key')) self.assertIsNone(bookmark.get('replication_key_value')) self.assertEqual(table_version_3, bookmark['version'])
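# NB (illustrative sketch, not part of the test above): the expected_ts /
# expected_ts_tz helpers referenced throughout this test are not shown in this
# excerpt. Assuming the target receives timestamps as UTC ISO-8601 strings, a
# minimal version of the timezone-aware conversion could look like the
# following; the helper name and exact output format are assumptions, not the
# tap's confirmed behavior.
import datetime

import pytz


def expected_ts_tz_sketch(localized_dt):
    # Normalize a timezone-aware datetime to its UTC ISO-8601 string form.
    return localized_dt.astimezone(pytz.utc).isoformat()


# Example: 2021-04-04 04:04:04.733184 America/New_York is UTC-4 (EDT), so this
# prints 2021-04-04T08:04:04.733184+00:00
nyc = pytz.timezone('America/New_York')
print(expected_ts_tz_sketch(nyc.localize(datetime.datetime(2021, 4, 4, 4, 4, 4, 733184))))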
def test_run(self): conn_id = connections.ensure_connection(self, payload_hook=None) # Run the tap in check mode check_job_name = runner.run_check_mode(self, conn_id) # Verify the check's exit status exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Verify that there are catalogs found found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_check_streams().issubset(found_catalog_names) self.assertTrue( subset, msg= "Expected check streams are not subset of discovered catalog, extra streams={}" .format( self.expected_check_streams().difference(found_catalog_names))) # # # Select some catalogs our_catalogs = [ c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for catalog in our_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) connections.select_catalog_and_fields_via_metadata( conn_id, catalog, schema, [], []) # # Verify that all streams sync at least one row for initial sync # # This test is also verifying access token expiration handling. If test fails with # # authentication error, refresh token was not replaced after expiring. menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # Verify that all streams sync at least one row for initial sync for stream in self.expected_sync_streams().difference({ 'feature_events', 'events', 'page_events', 'guide_events', 'poll_events', 'track_events', 'track_types', }): with self.subTest(stream=stream): self.assertLess(0, record_count_by_stream[stream]) # TODO run the remaining assertions against all incremental streams # Verify that bookmark values are correct after incremental sync start_date = self.get_properties()['start_date'] current_state = menagerie.get_state(conn_id) test_bookmark = current_state['bookmarks']['accounts'] # Verify a bookmark is present for accounts self.assertIn('bookmarks', current_state.keys()) self.assertIn('accounts', current_state['bookmarks'].keys())
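# NB (illustrative sketch, not part of the test above): the test stops at
# asserting that an 'accounts' bookmark exists even though start_date is
# fetched. A natural follow-up is comparing the bookmarked timestamp against
# the configured start_date; the bookmark field holding that timestamp is not
# shown here, so the value passed in below is an assumption. singer-python's
# utils.strptime_to_utc handles the ISO-8601 parsing.
from singer import utils


def bookmark_at_or_after_start_date(bookmark_value, start_date):
    # Compare two ISO-8601 strings as UTC datetimes.
    return utils.strptime_to_utc(bookmark_value) >= utils.strptime_to_utc(start_date)


# Example usage with literal values:
print(bookmark_at_or_after_start_date('2021-06-01T00:00:00Z', '2021-01-01T00:00:00Z'))  # True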
def do_test(self, conn_id): # Select our catalogs our_catalogs = [ c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for c in our_catalogs: c_annotated = menagerie.get_annotated_schema( conn_id, c['stream_id']) c_metadata = metadata.to_map(c_annotated['metadata']) connections.select_catalog_and_fields_via_metadata( conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify actual rows were synced record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Ensure all records have a value for PK(s) records = runner.get_records_from_target_output() for stream in self.expected_sync_streams(): messages = records.get(stream, {}).get('messages', []) for m in messages: pk_set = self.expected_pks()[stream] for pk in pk_set: self.assertIsNotNone(m.get('data', {}).get(pk), msg="oh no! {}".format(m)) satisfaction_ratings_bookmark = "2020-03-05T14:14:42Z" state = menagerie.get_state(conn_id) state['bookmarks']['satisfaction_ratings'][ 'updated_at'] = satisfaction_ratings_bookmark menagerie.set_state(conn_id, state) # Create a new record creds = { "email": "*****@*****.**", "subdomain": self.get_properties()['subdomain'], "token": os.getenv('TAP_ZENDESK_API_TOKEN') } self.client = Zenpy(**creds) # Create some new objects group_name = str(uuid.uuid4()) group = Group(name=group_name) self.created_group = self.client.groups.create(group) org_name = str(uuid.uuid4()) org = Organization(name=org_name) self.created_org = self.client.organizations.create(org) user = User(name="John Doe", email="{}@mailinator.com".format(uuid.uuid4())) self.created_user = self.client.users.create(user) # Sleeping 1 minute to validate lookback behavior needed in tap # We've observed a delay between when users are created and when # they're available through the API print("sleeping for 60 seconds") time.sleep(60) # Run another Sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Check both sets of records and make sure we have our new rows records = runner.get_records_from_target_output() messages = records.get('groups', {}).get('messages', []) new_record = [ r for r in messages if r['data']['id'] == self.created_group.id ] self.assertTrue(any(new_record)) self.assertEqual(len(messages), 2, msg="Sync'd incorrect count of messages: {}".format( len(messages))) messages = records.get('organizations', {}).get('messages', []) new_record = [ r for r in messages if r['data']['id'] == self.created_org.id ] self.assertTrue(any(new_record)) self.assertEqual(len(messages), 2, msg="Sync'd incorrect count of messages: {}".format( len(messages))) messages = records.get('users', {}).get('messages', []) new_record = [ r for r in messages if r['data']['id'] == self.created_user.id ] self.assertTrue(any(new_record)) # NB: GreaterEqual 
because we suspect Zendesk updates users in the backend # >= 1 because we're no longer inclusive of the last replicated user record. The lookback will control this going forward. # If we get the user we wanted and then some, this assertion should succeed self.assertGreaterEqual( len(messages), 1, msg="Sync'd incorrect count of messages: {}".format(len(messages))) messages = records.get('satisfaction_ratings', {}).get('messages', []) new_record = [ r for r in messages if r['data']['id'] in [364471784994, 364465631433, 364465212373] ] self.assertTrue(any(new_record)) self.assertGreaterEqual( len(messages), 3, msg="Sync'd incorrect count of messages: {}".format(len(messages))) for message in messages: self.assertGreaterEqual( utils.strptime_to_utc( message.get('data', {}).get('updated_at', '')), utils.strptime_to_utc(satisfaction_ratings_bookmark))
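# NB (illustrative sketch, not part of the test above): the closing loop checks
# that every replicated satisfaction_ratings record is at or after the rewound
# bookmark. The same condition stated as a standalone predicate, assuming the
# message shape used above ({'data': {'updated_at': ...}}):
from singer import utils


def records_respect_bookmark(messages, bookmark):
    bookmark_dt = utils.strptime_to_utc(bookmark)
    return all(
        utils.strptime_to_utc(m['data']['updated_at']) >= bookmark_dt
        for m in messages)


# Example usage with a literal record and the bookmark value rewound above:
print(records_respect_bookmark(
    [{'data': {'updated_at': '2020-03-06T00:00:00Z'}}],
    '2020-03-05T14:14:42Z'))  # True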
def test_run(self): conn_id = connections.ensure_connection(self) # run in check mode and verify exit codes check_job_name = runner.run_check_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify basics of discovery are consistent with expectations... # verify discovery produced (at least) 1 expected catalog found_catalogs = [ found_catalog for found_catalog in menagerie.get_catalogs(conn_id) if found_catalog['tap_stream_id'] in self.expected_check_streams() ] self.assertGreaterEqual(len(found_catalogs), 1) # verify the tap discovered the expected streams found_catalog_names = { catalog['tap_stream_id'] for catalog in found_catalogs } self.assertSetEqual(self.expected_check_streams(), found_catalog_names) # verify that persisted streams have the correct properties test_catalog = found_catalogs[0] self.assertEqual(test_table_name, test_catalog['stream_name']) print("discovered streams are correct") # perform table selection print('selecting {} and all fields within the table'.format( test_table_name)) schema_and_metadata = menagerie.get_annotated_schema( conn_id, test_catalog['stream_id']) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'INCREMENTAL', 'replication-key': 'OUR TS TZ' } }] _ = connections.select_catalog_and_fields_via_metadata( conn_id, test_catalog, schema_and_metadata, additional_md) # clear state menagerie.set_state(conn_id, {}) # run sync job 1 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_primary_keys()) records_by_stream = runner.get_records_from_target_output() table_version = records_by_stream[test_table_name]['table_version'] messages = records_by_stream[test_table_name]['messages'] # verify the expected number of records were replicated self.assertEqual(3, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual(4, len(messages)) self.assertEqual('activate_version', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) # verify the persisted schema matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # verify replicated records match expectations self.assertDictEqual(self.expected_records[0], messages[1]['data']) self.assertDictEqual(self.expected_records[1], messages[2]['data']) self.assertDictEqual(self.expected_records[2], messages[3]['data']) # verify records are in ascending order by replication-key value expected_replication_key = list( self.expected_replication_keys()[test_table_name])[0] self.assertLess(messages[1]['data'][expected_replication_key], messages[2]['data'][expected_replication_key]) self.assertLess(messages[2]['data'][expected_replication_key], messages[3]['data'][expected_replication_key]) print("records are correct") # grab bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_incremental_replication_test'] # verify state and bookmarks meet expectations self.assertIsNone(state['currently_syncing']) self.assertIsNone(bookmark.get('lsn'))
self.assertEqual(table_version, bookmark['version']) self.assertEqual(expected_replication_key, bookmark['replication_key']) self.assertEqual(self.expected_records[2][expected_replication_key], bookmark['replication_key_value']) #---------------------------------------------------------------------- # invoke the sync job AGAIN following various manipulations to the data #---------------------------------------------------------------------- with db_utils.get_test_connection('dev') as conn: conn.autocommit = True with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: # NB | We will perform the following actions prior to the next sync: # [Action (EXPECTED RESULT)] # Insert a record with a lower replication-key value (NOT REPLICATED) # Insert a record with a higher replication-key value (REPLICATED) # Insert a record with a higher replication-key value and... # Delete it (NOT REPLICATED) # Update a record with a higher replication-key value (REPLICATED) # Update a record with a lower replication-key value (NOT REPLICATED) # inserting... # a record with a replication-key value that is lower than the previous bookmark nyc_tz = pytz.timezone('America/New_York') our_time_offset = "-04:00" our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(6, 6, 6) our_time_tz = our_time.isoformat() + our_time_offset our_date = datetime.date(1970, 7, 1) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '1', 'our_json': json.dumps({'nymn': 77}), 'our_jsonb': json.dumps({'burgers': 'good++'}), 'our_uuid': my_uuid, 'our_citext': 'cyclops 2', 'our_store': 'dances=>"floor",name=>"betty"', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': '$0.98789' }) self.expected_records.append({ 'id': 4, 'our_varchar': "our_varchar 2", 'our_varchar_10': "varchar_10", 'our_text': "some text 2", 'our_integer': 44101, 'our_smallint': 2, 'our_bigint': 1000001, 'our_decimal': decimal.Decimal('9876543210.02'), 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'OUR DATE': '1970-07-01T00:00:00+00:00', 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': True, 'our_json': '{"nymn": 77}', 'our_jsonb': '{"burgers": "good++"}', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_citext': self.inserted_records[-1]['our_citext'], 'our_store': { "name": "betty", "dances": "floor" }, 'our_cidr': self.inserted_records[-1]['our_cidr'], 'our_inet': self.inserted_records[-1]['our_inet'], 'our_mac': self.inserted_records[-1]['our_mac'], 'our_money': '$0.99' }) # a record with a replication-key value that is higher than the previous bookmark our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + 
"-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 5, 'our_json': self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": "betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None }) # a record with a replication-key value that is higher than the previous bookmark (to be deleted) our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111) nyc_tz = pytz.timezone('America/New_York') our_ts_tz = nyc_tz.localize(our_ts) our_time = datetime.time(12, 11, 10) our_time_tz = our_time.isoformat() + "-04:00" our_date = datetime.date(1999, 9, 9) my_uuid = str(uuid.uuid1()) self.inserted_records.append({ 'our_varchar': "our_varchar 4", 'our_varchar_10': "varchar_3", 'our_text': "some text 4", 'our_integer': 55200, 'our_smallint': 1, 'our_bigint': 100000, 'our_decimal': decimal.Decimal('1234567899.99'), quote_ident('OUR TS', cur): our_ts, quote_ident('OUR TS TZ', cur): our_ts_tz, quote_ident('OUR TIME', cur): our_time, quote_ident('OUR TIME TZ', cur): our_time_tz, quote_ident('OUR DATE', cur): our_date, 'our_double': decimal.Decimal('1.1'), 'our_real': decimal.Decimal('1.2'), 'our_boolean': True, 'our_bit': '0', 'our_json': json.dumps('some string'), 'our_jsonb': json.dumps(['burgers are good']), 'our_uuid': my_uuid, 'our_store': 'size=>"small",name=>"betty"', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None, }) self.expected_records.append({ 'our_decimal': decimal.Decimal('1234567899.99'), 'our_text': 'some text 4', 'our_bit': False, 'our_integer': 55200, 'our_double': decimal.Decimal('1.1'), 'id': 6, 'our_json': self.inserted_records[-1]['our_json'], 'our_boolean': True, 'our_jsonb': self.inserted_records[-1]['our_jsonb'], 'our_bigint': 100000, 'OUR TS': self.expected_ts(our_ts), 'OUR TS TZ': self.expected_ts_tz(our_ts_tz), 'OUR TIME': str(our_time), 'OUR TIME TZ': str(our_time_tz), 'our_store': { "name": 
"betty", "size": "small" }, 'our_smallint': 1, 'OUR DATE': '1999-09-09T00:00:00+00:00', 'our_varchar': 'our_varchar 4', 'our_uuid': self.inserted_records[-1]['our_uuid'], 'our_real': decimal.Decimal('1.2'), 'our_varchar_10': 'varchar_3', 'our_citext': 'cyclops 3', 'our_cidr': '192.168.101.128/25', 'our_inet': '192.168.101.128/24', 'our_mac': '08:00:2b:01:02:04', 'our_money': None }) db_utils.insert_record(cur, test_table_name, self.inserted_records[3]) db_utils.insert_record(cur, test_table_name, self.inserted_records[4]) db_utils.insert_record(cur, test_table_name, self.inserted_records[5]) # update a record with a replication-key value that is higher than the previous bookmark canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 1 our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[0]["our_double"] = decimal.Decimal("6.6") self.expected_records[0]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # update a record with a replication-key value that is lower than the previous bookmark canon_table_name = db_utils.canonicalized_table_name( cur, test_schema_name, test_table_name) record_pk = 2 our_ts = datetime.datetime(1990, 4, 4, 4, 4, 4, 733184) our_ts_tz = nyc_tz.localize(our_ts) updated_data = { "OUR TS TZ": our_ts_tz, "our_double": decimal.Decimal("6.6"), "our_money": "$0.00" } self.expected_records[1]["OUR TS TZ"] = self.expected_ts_tz( our_ts_tz) self.expected_records[1]["our_double"] = decimal.Decimal("6.6") self.expected_records[1]["our_money"] = "$0.00" db_utils.update_record(cur, canon_table_name, record_pk, updated_data) # delete a newly inserted record with a higher replication key than the previous bookmark record_pk = 5 db_utils.delete_record(cur, canon_table_name, record_pk) # run sync job 2 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # grab records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_primary_keys()) records_by_stream = runner.get_records_from_target_output() messages = records_by_stream[test_table_name]['messages'] # verify the expected number of records were synced self.assertEqual(3, record_count_by_stream[test_table_name]) # verify the message actions match expectations self.assertEqual('activate_version', messages[0]['action']) self.assertEqual('upsert', messages[1]['action']) self.assertEqual('upsert', messages[2]['action']) self.assertEqual('upsert', messages[3]['action']) # verify the persisted schema matches expectations self.assertEqual(expected_schemas[test_table_name], records_by_stream[test_table_name]['schema']) # verify replicated records meet our expectations... 
# verify the first record was the bookmarked record from the previous sync self.assertDictEqual(self.expected_records[2], messages[1]['data']) # verify the expected updated record with a higher replication-key value was replicated self.assertDictEqual(self.expected_records[0], messages[2]['data']) # verify the expected inserted record with a lower replication-key value was NOT replicated actual_record_ids = [message['data']['id'] for message in messages[1:]] expected_record_id = self.expected_records[3]['id'] self.assertNotIn(expected_record_id, actual_record_ids) # verify the deleted record with a lower replication-key value was NOT replicated expected_record_id = self.expected_records[4]['id'] self.assertNotIn(expected_record_id, actual_record_ids) # verify the expected updated record with a lower replication-key value was NOT replicated expected_record_id = self.expected_records[1]['id'] self.assertNotIn(expected_record_id, actual_record_ids) # verify the expected inserted record with a higher replication-key value was replicated self.assertDictEqual(self.expected_records[5], messages[3]['data']) # verify records are in ascending order by replication-key value self.assertLess(messages[1]['data'][expected_replication_key], messages[2]['data'][expected_replication_key]) self.assertLess(messages[2]['data'][expected_replication_key], messages[3]['data'][expected_replication_key]) print("records are correct") # get bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_incremental_replication_test'] # verify the bookmarked state matches our expectations self.assertIsNone(bookmark.get('lsn')) self.assertEqual(bookmark['version'], table_version) self.assertEqual(bookmark['replication_key'], expected_replication_key) self.assertEqual(bookmark['replication_key_value'], self.expected_records[5][expected_replication_key]) #--------------------------------------------------------------------- # run sync AGAIN after deleting a record and get 1 record (prev bookmark) #---------------------------------------------------------------------- # Delete a pre-existing record from the database with db_utils.get_test_connection('dev') as conn: conn.autocommit = True with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: # delete a record with a lower replication key than the previous sync record_pk = 1 db_utils.delete_record(cur, canon_table_name, record_pk) # run sync job 3 and verify exit codes sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # get records record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_primary_keys()) records_by_stream = runner.get_records_from_target_output() messages = records_by_stream[test_table_name]['messages'] # verify the expected number of records were replicated self.assertEqual(1, record_count_by_stream[test_table_name]) # verify messages match our expectations self.assertEqual(2, len(messages)) self.assertEqual(messages[0]['action'], 'activate_version') self.assertEqual(messages[1]['action'], 'upsert') self.assertEqual(records_by_stream[test_table_name]['table_version'], table_version) # verify replicated records meet our expectations... 
# verify we did not re-replicate the deleted record actual_record_ids = [message['data']['id'] for message in messages[1:]] expected_record_id = self.expected_records[0]['id'] self.assertNotIn(expected_record_id, actual_record_ids) # verify only the previously bookmarked record was synced self.assertDictEqual(self.expected_records[5], messages[1]['data']) print("records are correct") # get bookmarked state state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][ 'dev-public-postgres_incremental_replication_test'] # verify the bookmarked state matches our expectations self.assertIsNone(bookmark.get('lsn')) self.assertEqual(bookmark['version'], table_version) self.assertEqual(bookmark['replication_key'], expected_replication_key) self.assertEqual(bookmark['replication_key_value'], self.expected_records[5][expected_replication_key])
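# NB (illustrative sketch, not part of the test above): incremental replication
# only selects rows whose replication-key value is greater than or equal to the
# bookmarked value, which is why the lower-key insert and update and the deleted
# row never reappear in syncs 2 and 3. A simplified version of that selection,
# built with psycopg2.sql; tap-postgres' actual query construction is more
# involved, so treat this only as a model of the bookmark semantics.
from psycopg2 import sql


def incremental_query(table, replication_key):
    # Identifiers are safely quoted (the replication key here is "OUR TS TZ");
    # the bookmark value itself is passed as a bind parameter at execute time.
    return sql.SQL('SELECT * FROM {table} WHERE {rk} >= %s ORDER BY {rk} ASC').format(
        table=sql.Identifier(table),
        rk=sql.Identifier(replication_key))


# Example usage:
#   cur.execute(incremental_query(test_table_name, 'OUR TS TZ'),
#               (bookmark['replication_key_value'],))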
def test_run(self): conn_id = connections.ensure_connection(self) # ------------------------------- # ----------- Discovery ---------- # ------------------------------- # run in discovery mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # verify the tap discovered the right streams found_catalogs = menagerie.get_catalogs(conn_id) # assert we find the correct streams self.assertEqual(self.expected_check_streams(), {c['tap_stream_id'] for c in found_catalogs}) for tap_stream_id in self.expected_check_streams(): found_stream = [ c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id ][0] # assert that the pks are correct self.assertEqual( self.expected_pks()[found_stream['stream_name']], set( found_stream.get('metadata', {}).get('table-key-properties'))) # assert that the row counts are correct self.assertEqual( self.expected_row_counts()[found_stream['stream_name']], found_stream.get('metadata', {}).get('row-count')) # ----------------------------------- # ----------- Initial Full Table --------- # ----------------------------------- # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata for stream_catalog in found_catalogs: annotated_schema = menagerie.get_annotated_schema( conn_id, stream_catalog['stream_id']) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] selected_metadata = connections.select_catalog_and_fields_via_metadata( conn_id, stream_catalog, annotated_schema, additional_md) # Run sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct records_by_stream = runner.get_records_from_target_output() # assert that each of the streams that we synced are the ones that we expect to see record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # Verify that the full table was synced for tap_stream_id in self.expected_sync_streams(): self.assertGreaterEqual(record_count_by_stream[tap_stream_id], self.expected_row_counts()[tap_stream_id]) # Verify that we have 'initial_full_table_complete' bookmark state = menagerie.get_state(conn_id) first_versions = {} for tap_stream_id in self.expected_check_streams(): # assert that the state has an initial_full_table_complete == True self.assertTrue(state['bookmarks'][tap_stream_id] ['initial_full_table_complete']) # assert that there is a version bookmark in state first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id][ 'version'] self.assertIsNotNone(first_versions[tap_stream_id]) # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark self.assertIsNotNone( state['bookmarks'][tap_stream_id]['oplog_ts_time']) self.assertIsNotNone( state['bookmarks'][tap_stream_id]['oplog_ts_inc']) changed_ids = set() with get_test_connection() as client: # Delete two documents for each collection changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 0})[0]['_id']) client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0}) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 1})[0]['_id']) client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1}) changed_ids.add(client['simple_db']['simple_coll_2'].find( 
{'int_field': 0})[0]['_id']) client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 1})[0]['_id']) client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1}) # Update two documents for each collection changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 48})[0]['_id']) client["simple_db"]["simple_coll_1"].update_one( {'int_field': 48}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 49})[0]['_id']) client["simple_db"]["simple_coll_1"].update_one( {'int_field': 49}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 98})[0]['_id']) client["simple_db"]["simple_coll_2"].update_one( {'int_field': 98}, {'$set': { 'int_field': -1 }}) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 99})[0]['_id']) client["simple_db"]["simple_coll_2"].update_one( {'int_field': 99}, {'$set': { 'int_field': -1 }}) # Insert two documents for each collection client["simple_db"]["simple_coll_1"].insert_one({ "int_field": 50, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 50})[0]['_id']) client["simple_db"]["simple_coll_1"].insert_one({ "int_field": 51, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_1'].find( {'int_field': 51})[0]['_id']) client["simple_db"]["simple_coll_2"].insert_one({ "int_field": 100, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 100})[0]['_id']) client["simple_db"]["simple_coll_2"].insert_one({ "int_field": 101, "string_field": random_string_generator() }) changed_ids.add(client['simple_db']['simple_coll_2'].find( {'int_field': 101})[0]['_id']) # ----------------------------------- # ----------- Subsequent Oplog Sync --------- # ----------------------------------- # Run sync sync_job_name = runner.run_sync_mode(self, conn_id) exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify the persisted schema was correct messages_by_stream = runner.get_records_from_target_output() records_by_stream = {} for stream_name in self.expected_sync_streams(): records_by_stream[stream_name] = [ x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert' ] # assert that each of the streams that we synced are the ones that we expect to see record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) # Verify that we got at least 6 records due to changes # (could be more due to overlap in gte oplog clause) for k, v in record_count_by_stream.items(): self.assertGreaterEqual(v, 6) # Verify that we got 2 records with _SDC_DELETED_AT self.assertEqual( 2, len([ x['data'] for x in records_by_stream['simple_coll_1'] if x['data'].get('_sdc_deleted_at') ])) self.assertEqual( 2, len([ x['data'] for x in records_by_stream['simple_coll_2'] if x['data'].get('_sdc_deleted_at') ])) # Verify that the _id of the records sent are the same set as the # _ids of the documents changed actual = set([ ObjectId(x['data']['_id']) for x in records_by_stream['simple_coll_1'] ]).union( set([ ObjectId(x['data']['_id']) for x in records_by_stream['simple_coll_2'] ])) self.assertEqual(changed_ids, actual)
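# NB (illustrative sketch, not part of the test above): the oplog_ts_time and
# oplog_ts_inc bookmarks asserted earlier are what let a LOG_BASED sync resume
# after the initial full table. A simplified pymongo read of the oplog from
# such a bookmark; the tap's real resume logic layers projections and
# resumability checks on top of this, so this is only a model.
from bson.timestamp import Timestamp


def changes_since(client, ts_time, ts_inc):
    oplog = client.local.oplog.rs
    resume_ts = Timestamp(ts_time, ts_inc)
    # Only operations at or after the bookmarked timestamp, oldest first.
    return oplog.find({'ts': {'$gte': resume_ts}}).sort('$natural', 1)


# Example usage with the bookmarks checked above:
#   cursor = changes_since(client,
#                          state['bookmarks'][tap_stream_id]['oplog_ts_time'],
#                          state['bookmarks'][tap_stream_id]['oplog_ts_inc'])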
def test_run(self): """ Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } # self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part if records_by_stream[stream]['messages'][-1].get("data"): last_row_data = True else: last_row_data = False self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-2]['action'], 'activate_version') if last_row_data: self.assertEqual( records_by_stream[stream]['messages'][-3]['action'], 'activate_version') else: self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertEqual( len([ m for m in records_by_stream[stream]['messages'][1:] if m["action"] == "activate_version" ]), 2, msg= "Expect 2 more activate version messages for end of full table and beginning of log based" ) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # Verify all data is correct for the full table part if last_row_data: final_row = -3 else: final_row = -2 for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream] ['messages'][1:final_row])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): column_index = [ list(key.keys())[0] for key in self.expected_metadata()[stream][self.FIELDS] 
].index(column_name) if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \ in ("real", "float") \ and actual_row["data"][column_name] is not None: self.assertEqual( type(actual_row["data"][column_name]), Decimal, msg= "float value is not represented as a number" ) self.assertEqual( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))), msg= "single value of {} doesn't match actual {}" .format( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))))) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) # Verify all data is correct for the log replication part if sent if records_by_stream[stream]['messages'][-1].get("data"): for column_name, expected_value in expected_messages[-1][ "data"].items(): if isinstance(expected_value, float): self.assertEqual( type(records_by_stream[stream]['messages'][-1] ["data"][column_name]), Decimal, msg="float value is not represented as a number" ) self.assertEqual( float(str(float32(expected_value))), float( str( float32(records_by_stream[stream] ['messages'][-1]["data"] [column_name]))), msg="single value of {} doesn't match actual {}" .format( float(str(float32(expected_value))), float( str( float32(records_by_stream[stream] ['messages'][-1]["data"] [column_name]))))) else: self.assertEqual( expected_value, records_by_stream[stream]['messages'][-1] ["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") inital_log_version = bookmark['current_log_version'] self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # ---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "float_precisions" column_name = ["pk", "float_24", "float_53", "real_24_bits"] insert_value = [(14, 100.1, 100.1, 100.1)] update_value = [(1, 101.2, 101.2, 101.2)] delete_value = [(5, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [(14, 100.1, 100.1, 100.1, None)] update_value = [(1, 101.2, 101.2, 101.2, None)] delete_value = [(5, None, None, None, datetime.utcnow())] self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"]["values"] = \ 
[self.expected_metadata()["data_types_database_dbo_float_precisions"]["values"][-1]] + \ insert_value + delete_value + update_value self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"][ "fields"].append({ "_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic' } }) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertTrue( all([ message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:] ])) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertGreaterEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if column_name != "_sdc_deleted_at": column_index = [ list(key.keys())[0] for key in self.expected_metadata()[stream] [self.FIELDS] ].index(column_name) if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][ self.DATATYPE] \ in ("real", "float") \ and actual_row["data"][column_name] is not None: self.assertEqual( type(actual_row["data"][column_name]), Decimal, msg= "float value is not represented as a number" ) self.assertEqual( float(str(float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))), msg= "single value of {} doesn't match actual {}" .format( float(str( float32(expected_value))), float( str( float32(actual_row["data"] [column_name]))))) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_row, actual_row)) elif expected_value: # we have an expected value for a deleted row try: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%S.%fZ") except ValueError: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%SZ") self.assertGreaterEqual( actual_value, expected_value - timedelta(seconds=15)) self.assertLessEqual( actual_value, 
expected_value + timedelta(seconds=15)) else: # the row wasn't deleted so we can either not pass the column or it can be None self.assertIsNone( actual_row["data"].get(column_name)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") new_log_version = bookmark['current_log_version'] self.assertGreater(new_log_version, inital_log_version, msg='expected log version to increase') self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") self.assertEqual( bookmark['version'], new_table_version, msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema']))
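# NB (illustrative sketch, not part of the test above): the float assertions
# round-trip both sides through float32 because SQL Server real / float(24)
# columns are single precision, so comparing them against double-precision
# Python values directly would fail. A standalone version of that
# normalization; it assumes numpy, which the test's float32() calls imply.
from decimal import Decimal

from numpy import float32


def normalize_single_precision(value):
    # float() first so Decimal and float inputs are handled uniformly, then
    # collapse to the nearest single-precision value before comparing.
    return float(str(float32(float(value))))


# Example: both sides normalize to the same single-precision value.
print(normalize_single_precision(100.1) == normalize_single_precision(Decimal('100.1')))  # True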