def automatic_fields_test(self, conn_id):
    """Just testing we can sync with no fields selected. And that automatic fields still get synced."""
    # Discovery: run a check job and make sure it exited cleanly.
    check_job = runner.run_check_mode(self, conn_id)
    check_exit = menagerie.get_exit_status(conn_id, check_job)
    menagerie.verify_check_exit_status(self, check_exit, check_job)

    # Keep only the catalogs this test cares about; discovery must have
    # produced at least one of them.
    matching_catalogs = [
        catalog_entry
        for catalog_entry in menagerie.get_catalogs(conn_id)
        if catalog_entry['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(len(matching_catalogs), 1)

    # Discovered stream ids must match the expected set exactly.
    discovered_stream_ids = {
        entry['tap_stream_id'] for entry in matching_catalogs
    }
    self.assertSetEqual(self.expected_check_streams(), discovered_stream_ids)

    # The persisted catalog must carry the right stream name.
    catalog_under_test = matching_catalogs[0]
    self.assertEqual(test_table_name, catalog_under_test['stream_name'])
    print("discovered streams are correct")

    # Select the table but deliberately select NO fields.
    print('selecting {} and NO FIELDS within the table'.format(
        test_table_name))
    self.select_streams_and_fields(conn_id,
                                   catalog_under_test,
                                   select_all_fields=False)

    # Start from a clean bookmark.
    menagerie.set_state(conn_id, {})

    # Sync and verify tap/target exit codes.
    sync_job = runner.run_sync_mode(self, conn_id)
    sync_exit = menagerie.get_exit_status(conn_id, sync_job)
    menagerie.verify_sync_exit_status(self, sync_exit, sync_job)

    # Pull what landed in the target.
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(),
        self.expected_primary_keys())
    synced_records = runner.get_records_from_target_output()
    messages = synced_records[test_table_name]['messages']

    # Automatic fields = primary keys + replication keys.
    pk_fields = self.expected_primary_keys()[test_table_name]
    rk_fields = self.expected_replication_keys()[test_table_name]
    automatic_fields = pk_fields.union(rk_fields)

    # Field sets actually emitted per upsert (first/last messages are
    # activate_version envelopes, so they carry no data).
    emitted_field_sets = [
        set(msg['data'].keys()) for msg in messages[1:-1]
    ]

    # verify the message actions match expectations for all replication methods
    self.assertEqual(4, len(messages))
    self.assertEqual('activate_version', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('activate_version', messages[3]['action'])

    # Verify that you get some records for each stream
    self.assertGreater(record_count_by_stream[test_table_name], 0)

    # Verify that only the automatic fields are sent to the target
    for field_set in emitted_field_sets:
        self.assertSetEqual(automatic_fields, field_set)
def test_run(self):
    """End-to-end LOG_BASED (logical replication) sync test.

    Runs discovery, selects the test table (excluding the our_text_2
    column), then exercises a sequence of syncs: initial full load, an
    insert, three delete variants (plain WHERE, id IN (SELECT ...),
    id IN (<id>, <id>)), an update, and a final sync that should replay
    only the previous update.  After every sync the emitted messages and
    the saved lsn/version bookmarks are verified.
    """
    # The stream name and its bookmark key are used throughout; name them
    # once instead of repeating the literals dozens of times.
    stream = 'postgres_logical_replication_test'
    bookmark_key = 'logical_1-public-postgres_logical_replication_test'

    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [fc for fc in menagerie.get_catalogs(conn_id)
                      if fc['tap_stream_id'] in self.expected_check_streams()]
    self.assertGreaterEqual(
        len(found_catalogs), 1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(stream, test_catalog['stream_name'])
    print("discovered streams are correct")

    additional_md = [{"breadcrumb": [],
                      "metadata": {'replication-method': 'LOG_BASED'}}]
    # don't select our_text_2
    _ = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog,
        menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
        additional_md, ['our_text_2'])

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {stream: 4})
    records_by_stream = runner.get_records_from_target_output()

    table_version = records_by_stream[stream]['table_version']

    # Initial full-table load: activate, 4 upserts, activate.
    messages = records_by_stream[stream]['messages']
    self.assertEqual(messages[0]['action'], 'activate_version')
    self.assertEqual(messages[1]['action'], 'upsert')
    self.assertEqual(messages[2]['action'], 'upsert')
    self.assertEqual(messages[3]['action'], 'upsert')
    self.assertEqual(messages[4]['action'], 'upsert')
    self.assertEqual(messages[5]['action'], 'activate_version')

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][bookmark_key]
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertIsNotNone(bookmark['lsn'],
                         msg="expected bookmark for stream to have an lsn")
    lsn_1 = bookmark['lsn']
    self.assertEqual(bookmark['version'], table_version,
                     msg="expected bookmark for stream to match version")

    # ----------------------------------------------------------------------
    # invoke the sync job again after adding a record
    # ----------------------------------------------------------------------
    print("inserting a record 5")

    with db_utils.get_test_connection(test_db) as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            # insert fixture data 3
            our_ts = datetime.datetime(1993, 3, 3, 3, 3, 3, 333333)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(3, 4, 5)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1933, 3, 3)
            my_uuid = str(uuid.uuid1())

            # STRINGS:
            # OUR TS: '1993-03-03 03:03:03.333333'
            # OUR TS TZ: '1993-03-03 08:03:03.333333+00'
            # 'OUR TIME': '03:04:05'
            # 'OUR TIME TZ': '03:04:05+00'
            self.rec_5 = {
                'our_varchar': "our_varchar 5",                     # str
                'our_varchar_10': "varchar13",                      # str
                'our_text': "some text 3",                          # str
                'our_text_2': "NOT SELECTED",
                'our_integer': 96000,                               # int
                'our_smallint': 3,                                  # int
                'our_bigint': 3000000,                              # int
                'our_decimal': decimal.Decimal('1234567890.03'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': 3.3,
                'our_real': 6.6,
                'our_boolean': True,                                # boolean
                'our_bit': '1',                                     # string
                'our_json': json.dumps({'secret': 33}),             # string
                'our_jsonb': json.dumps(['burgers make me hungry']),
                'our_uuid': my_uuid,                                # string
                'our_store': 'jumps=>"high",name=>"betty"',         # string
                'our_citext': 'maGICKal 3',
                'our_cidr': '192.168.102.128/32',
                'our_inet': '192.168.102.128/32',
                'our_mac': '08:00:2b:01:02:05',
                'our_money': '$412.1234'
            }

            insert_record(cur, test_table_name, self.rec_5)

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {stream: 1})
    records_by_stream = runner.get_records_from_target_output()
    self.assertTrue(len(records_by_stream) > 0)

    for stream_name, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertDictEqual(recs['schema'], expected_schemas[stream_name])

    self.assertEqual(1, len(records_by_stream[stream]['messages']))
    actual_record_2 = records_by_stream[stream]['messages'][0]['data']
    # Remember the inserted row's lsn: it governs whether the next sync
    # replays the insert alongside the delete.
    actual_sdc_lsn_2 = int(actual_record_2['_sdc_lsn'])
    del actual_record_2['_sdc_lsn']

    expected_inserted_record = {
        'our_text': 'some text 3',
        'our_real': decimal.Decimal('6.6'),
        '_sdc_deleted_at': None,
        'our_store': {'name': 'betty', 'jumps': 'high'},
        'our_bigint': 3000000,
        'our_varchar': 'our_varchar 5',
        'our_double': decimal.Decimal('3.3'),
        'our_bit': True,
        'our_uuid': self.rec_5['our_uuid'],
        'OUR TS': '1993-03-03T03:03:03.333333+00:00',
        'OUR TS TZ': '1993-03-03T08:03:03.333333+00:00',
        'OUR TIME': '03:04:05',
        'OUR TIME TZ': '03:04:05-04:00',
        'OUR DATE': '1933-03-03T00:00:00+00:00',
        'our_decimal': decimal.Decimal('1234567890.03'),
        'id': 5,
        'our_varchar_10': 'varchar13',
        'our_json': '{"secret": 33}',
        'our_jsonb': self.rec_5['our_jsonb'],
        'our_smallint': 3,
        'our_integer': 96000,
        'our_boolean': True,
        'our_citext': 'maGICKal 3',
        'our_cidr': self.rec_5['our_cidr'],
        'our_inet': '192.168.102.128',
        'our_mac': self.rec_5['our_mac'],
        'our_alignment_enum': None,
        'our_money': '$412.12'
    }
    self.assertDictEqual(expected_inserted_record, actual_record_2)
    self.assertEqual(records_by_stream[stream]['messages'][0]['action'],
                     'upsert')
    print("inserted record is correct")

    state = menagerie.get_state(conn_id)
    chicken_bookmark = state['bookmarks'][bookmark_key]
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertIsNotNone(
        chicken_bookmark['lsn'],
        msg="expected bookmark for stream public-postgres_logical_replication_test to have an scn")
    lsn_2 = chicken_bookmark['lsn']
    self.assertTrue(lsn_2 >= lsn_1)

    # table_version does NOT change
    self.assertEqual(
        chicken_bookmark['version'], table_version,
        msg="expected bookmark for stream public-postgres_logical_replication_test to match version")

    # ----------------------------------------------------------------------
    # invoke the sync job again after deleting a record
    # ----------------------------------------------------------------------
    print("delete row from source db")
    with db_utils.get_test_connection(test_db) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute("DELETE FROM {} WHERE id = 3".format(
                canonicalized_table_name(test_schema_name, test_table_name, cur)))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # verify the inserted record's lsn is less than or equal to the bookmarked lsn
    self.assertGreaterEqual(lsn_2, actual_sdc_lsn_2)
    # If the bookmark advanced past the insert we only see the delete;
    # otherwise the insert is replayed along with the delete.
    expected_record_count = 1 if actual_sdc_lsn_2 < lsn_2 else 2
    self.assertEqual(record_count_by_stream, {stream: expected_record_count})

    records_by_stream = runner.get_records_from_target_output()
    for stream_name, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'], expected_schemas[stream_name],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream_name))

    # if there are 2 records...
    if expected_record_count == 2:
        # the 1st message will be the previous insert
        insert_message = records_by_stream[stream]['messages'][0]['data']
        del insert_message['_sdc_lsn']
        self.assertDictEqual(insert_message, expected_inserted_record)

    # the 2nd message will be the delete
    delete_message = records_by_stream[stream]['messages'][expected_record_count - 1]
    self.assertEqual(delete_message['action'], 'upsert')
    sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
    self.assertIsNotNone(sdc_deleted_at)
    self.assertEqual(delete_message['data']['id'], 3)
    print("deleted record is correct")

    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][bookmark_key]
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertIsNotNone(
        bookmark['lsn'],
        msg="expected bookmark for stream ROOT-CHICKEN to have an scn")
    lsn_3 = bookmark['lsn']
    self.assertTrue(lsn_3 >= lsn_2)

    # ----------------------------------------------------------------------
    # invoke the sync job again after deleting a record using the
    # 'id IN (SELECT ...)' format
    # ----------------------------------------------------------------------
    print("delete row from source db")
    with db_utils.get_test_connection(test_db) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute("DELETE FROM {} WHERE id IN (SELECT id FROM {} WHERE id=2)".format(
                canonicalized_table_name(test_schema_name, test_table_name, cur),
                canonicalized_table_name(test_schema_name, test_table_name, cur)))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {stream: 2})
    records_by_stream = runner.get_records_from_target_output()

    for stream_name, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'], expected_schemas[stream_name],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream_name))

    # first record will be the previous delete
    delete_message = records_by_stream[stream]['messages'][0]
    sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
    self.assertIsNotNone(sdc_deleted_at)
    self.assertEqual(delete_message['data']['id'], 3)

    # the 2nd message will be the more recent delete
    delete_message = records_by_stream[stream]['messages'][1]
    self.assertEqual(delete_message['action'], 'upsert')
    sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
    self.assertIsNotNone(sdc_deleted_at)
    self.assertEqual(delete_message['data']['id'], 2)
    print("deleted record is correct")

    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][bookmark_key]
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertIsNotNone(
        bookmark['lsn'],
        msg="expected bookmark for stream ROOT-CHICKEN to have an scn")
    lsn_4 = bookmark['lsn']
    self.assertTrue(lsn_4 >= lsn_3)

    # table_version does NOT change
    self.assertEqual(
        bookmark['version'], table_version,
        msg="expected bookmark for stream postgres_logical_replication_test to match version")

    # ----------------------------------------------------------------------
    # invoke the sync job again after deleting a record using the
    # 'id IN (<id>, <id>)' format
    # ----------------------------------------------------------------------
    print("delete row from source db")
    with db_utils.get_test_connection(test_db) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute("DELETE FROM {} WHERE id IN (4, 5)".format(
                canonicalized_table_name(test_schema_name, test_table_name, cur)))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {stream: 3})
    records_by_stream = runner.get_records_from_target_output()

    for stream_name, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'], expected_schemas[stream_name],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream_name))

    # first record will be the previous delete
    delete_message = records_by_stream[stream]['messages'][0]
    sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
    self.assertIsNotNone(sdc_deleted_at)
    self.assertEqual(delete_message['data']['id'], 2)

    # the 2nd message will be the more recent delete
    delete_message = records_by_stream[stream]['messages'][1]
    self.assertEqual(delete_message['action'], 'upsert')
    sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
    self.assertIsNotNone(sdc_deleted_at)
    self.assertEqual(delete_message['data']['id'], 4)
    print("deleted record is correct")

    # the 3rd message will be the more recent delete
    delete_message = records_by_stream[stream]['messages'][2]
    self.assertEqual(delete_message['action'], 'upsert')
    sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
    self.assertIsNotNone(sdc_deleted_at)
    self.assertEqual(delete_message['data']['id'], 5)
    print("deleted record is correct")

    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][bookmark_key]
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertIsNotNone(
        bookmark['lsn'],
        msg="expected bookmark for stream ROOT-CHICKEN to have an scn")
    lsn_5 = bookmark['lsn']
    self.assertTrue(lsn_5 >= lsn_4)

    # table_version does NOT change
    self.assertEqual(
        bookmark['version'], table_version,
        msg="expected bookmark for stream postgres_logical_replication_test to match version")

    # ----------------------------------------------------------------------
    # invoke the sync job again after updating a record
    # ----------------------------------------------------------------------
    print("updating row from source db")
    with db_utils.get_test_connection(test_db) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute("UPDATE {} SET our_varchar = 'THIS HAS BEEN UPDATED', our_money = '$56.811', our_decimal = 'NaN', our_real = '+Infinity', our_double = 'NaN' WHERE id = 1".format(
                canonicalized_table_name(test_schema_name, test_table_name, cur)))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {stream: 3})
    records_by_stream = runner.get_records_from_target_output()

    for stream_name, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'], expected_schemas[stream_name],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream_name))

    self.assertEqual(len(records_by_stream[stream]['messages']), 3)

    # first record will be the previous first delete
    delete_message = records_by_stream[stream]['messages'][0]
    sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
    self.assertIsNotNone(sdc_deleted_at)
    self.assertEqual(delete_message['data']['id'], 4)

    # second record will be the previous second delete
    delete_message = records_by_stream[stream]['messages'][1]
    sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
    self.assertIsNotNone(sdc_deleted_at)
    self.assertEqual(delete_message['data']['id'], 5)

    # third record will be the new update
    updated_message = records_by_stream[stream]['messages'][2]
    del updated_message['data']['_sdc_lsn']
    self.assertEqual(updated_message['action'], 'upsert')

    # 'NaN'/'+Infinity' numeric values surface as None in the target.
    expected_updated_rec = {
        'our_varchar': 'THIS HAS BEEN UPDATED',
        'id': 1,
        'our_varchar_10': "varchar_10",
        'our_text': "some text",
        'our_integer': 44100,
        'our_smallint': 1,
        'our_bigint': 1000000,
        'our_decimal': None,
        'OUR TS': '1997-02-02T02:02:02.722184+00:00',
        'OUR TS TZ': '1997-02-02T07:02:02.722184+00:00',
        'OUR TIME': '12:11:10',
        'OUR TIME TZ': '12:11:10-04:00',
        'OUR DATE': '1998-03-04T00:00:00+00:00',
        'our_double': None,
        'our_real': None,
        'our_boolean': True,
        'our_bit': False,
        'our_json': '{"secret": 55}',
        'our_jsonb': self.rec_1['our_jsonb'],
        'our_uuid': self.rec_1['our_uuid'],
        '_sdc_deleted_at': None,
        'our_store': {'name': 'betty', 'size': 'small'},
        'our_citext': 'maGICKal',
        'our_cidr': self.rec_1['our_cidr'],
        'our_inet': self.rec_1['our_inet'],
        'our_mac': self.rec_1['our_mac'],
        'our_alignment_enum': 'bad',
        'our_money': '$56.81'
    }
    self.assertDictEqual(expected_updated_rec, updated_message['data'])
    print("updated record is correct")

    # check state again
    state = menagerie.get_state(conn_id)
    chicken_bookmark = state['bookmarks'][bookmark_key]
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertIsNotNone(
        chicken_bookmark['lsn'],
        msg="expected bookmark for stream public-postgres_logical_replication_test to have an scn")
    lsn_6 = chicken_bookmark['lsn']
    self.assertTrue(lsn_6 >= lsn_5)

    # table_version does NOT change
    self.assertEqual(
        chicken_bookmark['version'], table_version,
        msg="expected bookmark for stream public-postgres_logical_replication_test to match version")

    # ----------------------------------------------------------------------
    # invoke the sync job one last time. should only get the PREVIOUS update
    # ----------------------------------------------------------------------
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # we will get the previous update record again
    self.assertEqual(record_count_by_stream, {stream: 1})

    # BUGFIX: read the record emitted by THIS sync. The old code re-used the
    # previous sync's messages[2], which never checked the final sync's
    # output and could hide regressions.
    records_by_stream = runner.get_records_from_target_output()
    update_message = records_by_stream[stream]['messages'][0]
    self.assertEqual(update_message['action'], 'upsert')
    # strip the per-sync lsn before comparing, as done for earlier syncs
    del update_message['data']['_sdc_lsn']

    self.assertEqual(
        set(update_message['data'].keys()),
        set(expected_updated_rec.keys()),
        msg="keys for expected_record_1 are wrong: {}".format(
            set(update_message['data'].keys()).symmetric_difference(
                set(expected_updated_rec.keys()))))

    for k, v in update_message['data'].items():
        self.assertEqual(v, expected_updated_rec[k],
                         msg="{} != {} for key {}".format(v, expected_updated_rec[k], k))

    # check state again
    state = menagerie.get_state(conn_id)
    chicken_bookmark = state['bookmarks'][bookmark_key]
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertIsNotNone(
        chicken_bookmark['lsn'],
        msg="expected bookmark for stream public-postgres_logical_replication_test to have an scn")
    lsn_7 = chicken_bookmark['lsn']
    self.assertTrue(lsn_7 >= lsn_6)

    # table_version does NOT change
    self.assertEqual(
        chicken_bookmark['version'], table_version,
        msg="expected bookmark for stream public-postgres_logical_replication_test to match version")
def test_run(self):
    """Full-table sync test.

    Verify that a full sync captures all data and sends it in the correct
    format for integer and boolean (bit) data.
    Verify that the first sync sends an activate_version immediately.
    Verify that the table version is incremented on a subsequent sync.
    """
    print("running test {}".format(self.name()))
    conn_id = self.create_connection()

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # get the catalog information of discovery
    found_catalogs = menagerie.get_catalogs(conn_id)
    additional_md = [{
        "breadcrumb": [],
        "metadata": {'replication-method': 'FULL_TABLE'}
    }]
    BaseTapTest.select_all_streams_and_fields(conn_id,
                                             found_catalogs,
                                             additional_md=additional_md)

    # clear state
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify record counts of streams
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values']) for k, v in self.expected_metadata().items()
    }
    self.assertEqual(record_count_by_stream, expected_count)

    # verify records match on the first sync
    records_by_stream = runner.get_records_from_target_output()

    # BUGFIX: track the table version PER STREAM. A plain scalar only kept
    # the LAST stream's version, so the "version incremented" check on the
    # second sync compared the wrong value whenever more than one stream is
    # synced. (Matches the dict approach used by the incremental test.)
    table_version = dict()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            # TODO - test schema matches expectations based on data type, nullable, not nullable, datetimes as string +, etc
            #   This needs to be consistent based on replication method so you can change replication methods
            table_version[stream] = records_by_stream[stream]['table_version']

            # verify on the first sync you get activate version message
            # before and after all data
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version')
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version')

            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            expected_messages = [{
                "action": "upsert",
                "data": {
                    column: value
                    for column, value in list(
                        zip(column_names,
                            stream_expected_data[self.VALUES][row]))
                }
            } for row in range(len(stream_expected_data[self.VALUES]))]

            # remove sequences from actual values for comparison
            # (plain loop: the original abused a list comprehension for
            # its side effect)
            for message in records_by_stream[stream]['messages'][1:-1]:
                message.pop("sequence")

            # Verify all data is correct
            for expected_row, actual_row in list(
                    zip(expected_messages,
                        records_by_stream[stream]['messages'][1:-1])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")
                    self.assertEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row[
                            "data"].items():
                        self.assertEqual(
                            expected_value,
                            actual_row["data"][column_name],
                            msg="expected: {} != actual {}".format(
                                expected_row, actual_row))
            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]
            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            # full-table replication keeps no lsn bookmark
            self.assertIsNone(
                bookmark.get('lsn'),
                msg="expected bookmark for stream to have NO lsn because we are using full-table replication")
            self.assertEqual(
                bookmark['version'], table_version[stream],
                msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas,
                                 records_by_stream[stream]['schema']))

    # ----------------------------------------------------------------------
    # invoke the sync job AGAIN and get the same records
    # NOTE: THIS IS ONLY DONE IN THIS TEST. It also tests we don't send
    # activate version before completion and the table version is incremented
    # ----------------------------------------------------------------------
    # TODO - update the table to add a column and ensure that discovery adds the new column
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values']) for k, v in self.expected_metadata().items()
    }
    self.assertEqual(record_count_by_stream, expected_count)
    records_by_stream = runner.get_records_from_target_output()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            new_table_version = records_by_stream[stream]['table_version']

            # verify on a subsequent sync you get activate version message
            # only after all data
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'upsert')
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version')

            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            expected_messages = [{
                "action": "upsert",
                "data": {
                    column: value
                    for column, value in list(
                        zip(column_names,
                            stream_expected_data[self.VALUES][row]))
                }
            } for row in range(len(stream_expected_data[self.VALUES]))]

            # remove sequences from actual values for comparison
            for message in records_by_stream[stream]['messages'][0:-1]:
                message.pop("sequence")

            # Verify all data is correct
            for expected_row, actual_row in list(
                    zip(expected_messages,
                        records_by_stream[stream]['messages'][0:-1])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")
                    self.assertEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row[
                            "data"].items():
                        self.assertEqual(
                            expected_value,
                            actual_row["data"][column_name],
                            msg="expected: {} != actual {}".format(
                                expected_row, actual_row))
            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]
            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            self.assertIsNone(
                bookmark.get('lsn'),
                msg="expected bookmark for stream to have NO lsn because we are using full-table replication")
            # full-table re-sync must bump the table version
            self.assertGreater(
                new_table_version, table_version[stream],
                msg="table version {} didn't increase from {} on the second run".format(
                    new_table_version, table_version[stream]))
            self.assertEqual(
                bookmark['version'], new_table_version,
                msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas,
                                 records_by_stream[stream]['schema']))
def test_run(self):
    """End-to-end incremental-replication test for the dates_and_times table.

    Sync 1: select all streams with INCREMENTAL replication on
    'replication_key_column', verify message shape, record data, and the
    replication-key bookmark.  Then insert/update/delete rows directly in
    SQL Server, adjust the expected values, and run sync 2 to verify only
    rows at/after the bookmark are re-synced and state is advanced.
    """
    print("running test {}".format(self.name()))
    conn_id = self.create_connection()

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # get the catalog information of discovery
    found_catalogs = menagerie.get_catalogs(conn_id)
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'INCREMENTAL',
            'replication-key': 'replication_key_column'
        }
    }]
    BaseTapTest.select_all_streams_and_fields(conn_id,
                                              found_catalogs,
                                              additional_md=additional_md)

    # clear state
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify record counts of streams
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values'])
        for k, v in self.expected_metadata().items()
    }
    self.assertEqual(record_count_by_stream, expected_count)

    # verify records match on the first sync
    records_by_stream = runner.get_records_from_target_output()

    # no fields are deselected in this test, so nothing is filtered out
    non_selected_properties = []

    table_version = dict()
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            table_version[stream] = records_by_stream[stream][
                'table_version']

            # verify on the first sync you get
            # activate version message before and after all data for the full table
            # and before the logical replication part
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version')
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version')
            # everything between the two activate_version messages is data
            self.assertTrue(
                all([
                    m["action"] == "upsert"
                    for m in records_by_stream[stream]['messages'][1:-1]
                ]),
                msg="Expect all but the first message to be upserts")
            self.assertEqual(len(
                records_by_stream[stream]['messages'][1:-1]),
                             len(stream_expected_data[self.VALUES]),
                             msg="incorrect number of upserts")

            # first key of each FIELDS entry is the column name
            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            # expected rows sorted by replication key (None values first,
            # matching incremental ordering by replication_key_column)
            expected_messages = [{
                "action": "upsert",
                "data": {
                    column: value
                    for column, value in list(zip(column_names, row_values))
                    if column not in non_selected_properties
                }
            } for row_values in sorted(stream_expected_data[self.VALUES],
                                       key=lambda row:
                                       (row[1] is not None, row[1]))]

            # Verify all data is correct for incremental
            for expected_row, actual_row in list(
                    zip(expected_messages,
                        records_by_stream[stream]['messages'][1:-1])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")
                    self.assertEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")

                    for column_name, expected_value in expected_row[
                            "data"].items():
                        if isinstance(expected_value, datetime):
                            # sql server only keeps milliseconds not microseconds
                            self.assertEqual(
                                expected_value.isoformat().replace(
                                    '000+00:00', 'Z').replace('+00:00', 'Z'),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '000+00:00', 'Z').replace(
                                            '+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, time):
                            # sql server time has second resolution only
                            self.assertEqual(
                                expected_value.replace(
                                    microsecond=0).isoformat().replace(
                                        '+00:00', ''),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, date):
                            # dates are emitted as midnight-UTC timestamps
                            self.assertEqual(
                                expected_value.isoformat() +
                                'T00:00:00+00:00',
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name]))
                        else:
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value,
                                    actual_row["data"][column_name]))

            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            self.assertIsNone(bookmark.get('current_log_version'),
                              msg="no log_version for incremental")
            self.assertIsNone(bookmark.get('initial_full_table_complete'),
                              msg="no full table for incremental")

            # find the max value of the replication key
            expected_bookmark = max([
                row[1] for row in stream_expected_data[self.VALUES]
                if row[1] is not None
            ])
            self.assertEqual(bookmark['replication_key_value'],
                             expected_bookmark.isoformat())
            # self.assertEqual(bookmark['replication_key'], 'replication_key_value')
            self.assertEqual(
                bookmark['version'],
                table_version[stream],
                msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas,
                                 records_by_stream[stream]['schema']))

    # ----------------------------------------------------------------------
    # invoke the sync job AGAIN and after insert, update, delete or rows
    # ----------------------------------------------------------------------
    database_name = "data_types_database"
    schema_name = "dbo"
    table_name = "dates_and_times"
    column_name = [
        "pk", "replication_key_column", "date_and_time",
        "bigger_range_and_precision_datetime", "datetime_with_timezones",
        "datetime_no_seconds", "its_time"
    ]
    # rows inserted/updated via raw SQL; datetimeoffset values are passed
    # as isoformat strings here (the DB form), and re-declared below with
    # .astimezone(timezone.utc) as the form the tap is expected to emit
    insert_value = [
        (5, date(9999, 12, 30),
         datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 10, 14,
                  tzinfo=timezone(timedelta(hours=14))).isoformat(),
         datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
         time(23, 59, 59, tzinfo=timezone.utc)),
        (6, date(2018, 12, 29),
         datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 10, 14,
                  tzinfo=timezone(timedelta(hours=14))).isoformat(),
         datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
         time(23, 59, 59, tzinfo=timezone.utc))
    ]
    update_value = [
        (3, date(9999, 12, 31),
         datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 10, 14,
                  tzinfo=timezone(timedelta(hours=10))).isoformat(),
         datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
         time(23, 59, 59, tzinfo=timezone.utc)),
        (4, date(2018, 12, 30),
         datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 10, 14,
                  tzinfo=timezone(timedelta(hours=6))).isoformat(),
         datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
         time(23, 59, 59, tzinfo=timezone.utc))
    ]
    delete_value = [(2, )]
    query_list = (insert(database_name, schema_name, table_name,
                         insert_value))
    query_list.extend(
        delete_by_pk(database_name, schema_name, table_name, delete_value,
                     column_name[:1]))
    query_list.extend(
        update_by_pk(database_name, schema_name, table_name, update_value,
                     column_name))
    mssql_cursor_context_manager(*query_list)

    # re-declare the same rows in the form the tap should emit
    # (datetimeoffset normalized to UTC)
    insert_value = [
        (5, date(9999, 12, 30),
         datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 10, 14,
                  tzinfo=timezone(timedelta(hours=14))).astimezone(
                      timezone.utc),
         datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
         time(23, 59, 59, tzinfo=timezone.utc)),
        (6, date(2018, 12, 29),
         datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 10, 14,
                  tzinfo=timezone(timedelta(hours=14))).astimezone(
                      timezone.utc),
         datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
         time(23, 59, 59, tzinfo=timezone.utc))
    ]
    update_value = [
        (3, date(9999, 12, 31),
         datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 10, 14,
                  tzinfo=timezone(timedelta(hours=10))).astimezone(
                      timezone.utc),
         datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
         time(23, 59, 59, tzinfo=timezone.utc)),
        (4, date(2018, 12, 30),
         datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
         datetime(9999, 12, 31, 10, 14,
                  tzinfo=timezone(timedelta(hours=6))).astimezone(
                      timezone.utc),
         datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
         time(23, 59, 59, tzinfo=timezone.utc))
    ]
    insert_value = insert_value[:-1]  # only repl_key >= gets included
    update_value = update_value[:-1]
    # row 1's replication key equals the bookmark, so it is re-synced too
    self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][
        "values"] = [(
            1, date(9999, 12, 29),
            datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
            datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
            datetime(
                9999, 12, 31, 10, 14,
                tzinfo=timezone(
                    timedelta(hours=14))).astimezone(timezone.utc),
            datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
            time(23, 59, 59, tzinfo=timezone.utc))
        ] + update_value + insert_value

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values'])
        for k, v in self.expected_metadata().items()
    }
    self.assertEqual(record_count_by_stream, expected_count)
    records_by_stream = runner.get_records_from_target_output()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            new_table_version = records_by_stream[stream]['table_version']

            # verify on a subsequent sync you get activate version message only after all data
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version')
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version')
            self.assertTrue(
                all([
                    message["action"] == "upsert" for message in
                    records_by_stream[stream]['messages'][1:-1]
                ]))
            self.assertEqual(len(
                records_by_stream[stream]['messages'][1:-1]),
                             len(stream_expected_data[self.VALUES]),
                             msg="incorrect number of upserts")

            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            expected_messages = [{
                "action": "upsert",
                "data": {
                    column: value
                    for column, value in list(zip(column_names, row_values))
                    if column not in non_selected_properties
                }
            } for row_values in sorted(stream_expected_data[self.VALUES],
                                       key=lambda row:
                                       (row[1] is not None, row[1]))]

            # remove sequences from actual values for comparison
            [
                message.pop("sequence")
                for message in records_by_stream[stream]['messages'][1:-1]
            ]

            # Verify all data is correct
            for expected_row, actual_row in list(
                    zip(expected_messages,
                        records_by_stream[stream]['messages'][1:-1])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")

                    # we only send the _sdc_deleted_at column for deleted rows
                    self.assertEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row[
                            "data"].items():
                        if isinstance(expected_value, datetime):
                            # sql server only keeps milliseconds not microseconds
                            self.assertEqual(
                                expected_value.isoformat().replace(
                                    '000+00:00', 'Z').replace('+00:00', 'Z'),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '000+00:00', 'Z').replace(
                                            '+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, time):
                            # sql server time has second resolution only
                            self.assertEqual(
                                expected_value.replace(
                                    microsecond=0).isoformat().replace(
                                        '+00:00', ''),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, date):
                            # dates are emitted as midnight-UTC timestamps
                            self.assertEqual(
                                expected_value.isoformat() +
                                'T00:00:00+00:00',
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name]))
                        else:
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value,
                                    actual_row["data"][column_name]))
            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            self.assertIsNone(bookmark.get('current_log_version'),
                              msg="no log_version for incremental")
            self.assertIsNone(bookmark.get('initial_full_table_complete'),
                              msg="no full table for incremental")

            # find the max value of the replication key
            expected_bookmark = max([
                row[1] for row in stream_expected_data[self.VALUES]
                if row[1] is not None
            ])
            self.assertEqual(bookmark['replication_key_value'],
                             expected_bookmark.isoformat())
            # self.assertEqual(bookmark['replication_key'], 'replication_key_value')
            self.assertEqual(
                bookmark['version'],
                table_version[stream],
                msg="expected bookmark for stream to match version")
            # NOTE(review): this duplicates the previous check against
            # new_table_version (incremental syncs keep the same version);
            # the msg text is also duplicated — consider consolidating
            self.assertEqual(
                bookmark['version'],
                new_table_version,
                msg="expected bookmark for stream to match version")

            # NOTE(review): this re-fetch of state/bookmark is dead code —
            # nothing reads these values before the subtest ends
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas,
                                 records_by_stream[stream]['schema']))
def test_run(self):
    """Full-table replication test for Postgres array column types.

    Discovers the test table, selects it with FULL_TABLE replication,
    inserts one row containing every supported array type via raw SQL,
    runs a sync, and verifies the emitted schema, messages, record
    values, and final state/bookmark.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(len(diff),
                     0,
                     msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(test_table_name, test_catalog['stream_name'])
    print("discovered streams are correct")

    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'FULL_TABLE'
        }
    }]
    _ = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog,
        menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
        additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    print("inserting a record")
    our_ts_tz = None
    our_date = None
    our_uuid = str(uuid.uuid1())
    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            # insert fixture data 2
            # insert fixture data 1
            our_ts = datetime.datetime(1997, 2, 2, 2, 2, 2, 722184)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_date = datetime.date(1998, 3, 4)

            # values use Postgres array-literal syntax; '{{' renders a
            # literal '{' through str.format where needed
            self.rec_1 = {
                'our_bit_array': '{{0,1,1}}',
                'our_boolean_array': '{true}',
                'our_cidr_array': '{{192.168.100.128/25}}',
                'our_citext_array': '{{maGICKal 2}}',
                'our_date_array': '{{{}}}'.format(our_date),
                'our_decimal_array':
                '{{{}}}'.format(decimal.Decimal('1234567890.01')),
                'our_double_array': '{{1.232323}}',
                'our_enum_array': '{{bad}}',
                'our_float_array': '{{5.23}}',
                'our_hstore_array': """{{"size=>small","name=>betty"}}""",
                'our_inet_array': '{{192.168.100.128/24}}',
                'our_int_array': '{{1,2,3},{4,5,6}}',
                'our_json_array': [psycopg2.extras.Json({'secret': 55})],
                'our_jsonb_array': [psycopg2.extras.Json({'secret': 69})],
                'our_mac_array': '{{08:00:2b:01:02:03}}',
                'our_money_array': '{{$412.1234}}',
                'our_real_array': '{{76.33}}',
                'our_smallint_array': '{{10,20,30},{40,50,60}}',
                'our_string_array': '{{one string, two strings}}',
                'our_text_array': '{{three string, four}}',
                'our_time_array': '{{03:04:05}}',
                'our_ts_tz_array': '{{{}}}'.format(our_ts_tz),
                'our_uuid_array': '{{{}}}'.format(our_uuid)
            }

            insert_record(cur, test_table_name, self.rec_1)

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {test_table_name: 1})
    records_by_stream = runner.get_records_from_target_output()

    self.assertTrue(len(records_by_stream) > 0)

    for stream, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    # full table sync: activate_version, one upsert, activate_version
    self.assertEqual(3,
                     len(records_by_stream[test_table_name]['messages']))
    self.assertEqual(
        records_by_stream[test_table_name]['messages'][0]['action'],
        'activate_version')
    self.assertEqual(
        records_by_stream[test_table_name]['messages'][1]['action'],
        'upsert')
    self.assertEqual(
        records_by_stream[test_table_name]['messages'][2]['action'],
        'activate_version')

    actual_record_1 = records_by_stream[test_table_name]['messages'][1][
        'data']

    # note nesting: most arrays arrive one level deeper than inserted
    # (e.g. '{{...}}' literals), booleans/dates/uuids/json stay flat
    expected_inserted_record = {
        'id': 1,
        'our_bit_array': [[False, True, True]],
        'our_boolean_array': [True],
        'our_cidr_array': [['192.168.100.128/25']],
        'our_citext_array': [['maGICKal 2']],
        'our_date_array': ['1998-03-04T00:00:00+00:00'],
        'our_decimal_array': [decimal.Decimal('1234567890.01')],
        'our_double_array': [[decimal.Decimal('1.232323')]],
        'our_enum_array': [['bad']],
        'our_float_array': [[decimal.Decimal('5.23')]],
        'our_hstore_array': [[{'size': 'small'}, {'name': 'betty'}]],
        'our_inet_array': [['192.168.100.128/24']],
        'our_int_array': [[1, 2, 3], [4, 5, 6]],
        'our_json_array': [json.dumps({'secret': 55})],
        'our_jsonb_array': [json.dumps({'secret': 69})],
        'our_mac_array': [['08:00:2b:01:02:03']],
        'our_money_array': [['$412.12']],
        'our_real_array': [[decimal.Decimal('76.33')]],
        'our_smallint_array': [[10, 20, 30], [40, 50, 60]],
        'our_string_array': [['one string', 'two strings']],
        'our_text_array': [['three string', 'four']],
        'our_time_array': [['03:04:05']],
        'our_ts_tz_array': ['1997-02-02T07:02:02.722184+00:00'],
        'our_uuid_array': ['{}'.format(our_uuid)]
    }
    self.assertEqual(
        set(actual_record_1.keys()),
        set(expected_inserted_record.keys()),
        msg="keys for expected_record_1 are wrong: {}".format(
            set(actual_record_1.keys()).symmetric_difference(
                set(expected_inserted_record.keys()))))

    for k in actual_record_1.keys():
        self.assertEqual(actual_record_1[k],
                         expected_inserted_record[k],
                         msg="{} != {} for key {}".format(
                             actual_record_1[k],
                             expected_inserted_record[k], k))

    print("inserted record is correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_array_test']
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertIsNone(
        bookmark.get('lsn'),
        msg=
        "expected bookmark for stream to have NO lsn because we are using full-table replication"
    )
def test_run(self):
    """Full-table replication test (MSSQL decimal/numeric focus).

    Selects all discovered streams with FULL_TABLE replication, runs a
    sync, and verifies message ordering, record data (including that
    decimals arrive as ``Decimal``), state/bookmark contents, and the
    persisted schema.
    """
    print("running test {}".format(self.name()))
    conn_id = self.create_connection()

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # get the catalog information of discovery
    found_catalogs = menagerie.get_catalogs(conn_id)
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'FULL_TABLE'
        }
    }]
    BaseTapTest.select_all_streams_and_fields(conn_id,
                                              found_catalogs,
                                              additional_md=additional_md)

    # clear state
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify record counts of streams
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values'])
        for k, v in self.expected_metadata().items()
    }
    self.assertEqual(record_count_by_stream, expected_count)

    # verify records match on the first sync
    records_by_stream = runner.get_records_from_target_output()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            table_version = records_by_stream[stream]['table_version']

            # verify on the first sync you get activate version message before and after all data
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version')
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version')

            # first key of each FIELDS entry is the column name
            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            expected_messages = [{
                "action": "upsert",
                "data": {
                    column: value
                    for column, value in list(
                        zip(column_names, stream_expected_data[self.VALUES]
                            [row]))
                }
            } for row in range(len(stream_expected_data[self.VALUES]))]

            # remove sequences from actual values for comparison
            [
                message.pop("sequence")
                for message in records_by_stream[stream]['messages'][1:-1]
            ]

            # Verify all data is correct
            for expected_row, actual_row in list(
                    zip(expected_messages,
                        records_by_stream[stream]['messages'][1:-1])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")
                    self.assertEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row[
                            "data"].items():
                        if isinstance(expected_value, Decimal):
                            # decimals must arrive as numbers, not strings
                            self.assertEqual(
                                type(actual_row["data"][column_name]),
                                Decimal,
                                msg=
                                "decimal value is not represented as a number"
                            )
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))
                        else:
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))

            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")

            # TODO - change this to something for mssql once binlog (cdc) is finalized and we know what it is
            self.assertIsNone(
                bookmark.get('lsn'),
                msg=
                "expected bookmark for stream to have NO lsn because we are using full-table replication"
            )

            self.assertEqual(
                bookmark['version'],
                table_version,
                msg="expected bookmark for stream to match version")

            # FIX: removed a dead dict comprehension that built
            # expected_schemas from self.DATATYPE_SCHEMAS and was then
            # immediately overwritten by the line below (and could raise
            # a spurious KeyError for unknown sql-datatypes)
            expected_schemas = self.expected_metadata()[stream]['schema']
            # round-trip through simplejson with use_decimal so Decimal
            # schema defaults compare equal to the tap's parsed schema
            self.assertEqual(
                records_by_stream[stream]['schema'],
                simplejson.loads(simplejson.dumps(expected_schemas),
                                 use_decimal=True),
                msg="expected: {} != actual: {}".format(
                    expected_schemas, records_by_stream[stream]['schema']))
def test_run(self):
    """Bookmark test: run two syncs with a simulated intermediate state.

    Sync 1 establishes bookmarks; the state is then rewritten from
    calculated_states_by_stream and sync 2 must honor it: fewer records,
    all records at/after the simulated bookmark, and bookmarks equal to
    the max replication-key value (MetaData.LastUpdatedTime) seen.
    """
    # SYNC 1
    conn_id = self.ensure_connection()

    # Run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    # Select only the expected streams tables
    expected_streams = self.expected_streams()
    catalog_entries = [
        ce for ce in found_catalogs
        if ce['tap_stream_id'] in expected_streams
    ]
    self.select_all_streams_and_fields(conn_id, catalog_entries)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)
    first_sync_records = runner.get_records_from_target_output()
    first_sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys())
    first_sync_bookmarks = menagerie.get_state(conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # UPDATE STATE BETWEEN SYNCS
    new_state = dict()
    new_state['bookmarks'] = {
        key: {
            'LastUpdatedTime': value
        }
        for key, value in self.calculated_states_by_stream(
            first_sync_bookmarks).items()
    }
    menagerie.set_state(conn_id, new_state)

    # SYNC 2
    sync_job_name = runner.run_sync_mode(self, conn_id)
    second_sync_records = runner.get_records_from_target_output()
    second_sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys())
    second_sync_bookmarks = menagerie.get_state(conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Test by stream
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            # record counts
            first_sync_count = first_sync_record_count.get(stream, 0)
            second_sync_count = second_sync_record_count.get(stream, 0)

            # record messages
            first_sync_messages = first_sync_records.get(
                stream, {
                    'messages': []
                }).get('messages')
            second_sync_messages = second_sync_records.get(
                stream, {
                    'messages': []
                }).get('messages')

            # replication key is an object (MetaData.LastUpdatedTime) in sync records
            # but just the sub level replication key is used in setting bookmarks
            top_level_replication_key = 'MetaData'
            sub_level_replication_key = 'LastUpdatedTime'

            # bookmarked states (top level objects)
            first_bookmark_key_value = first_sync_bookmarks.get(
                'bookmarks').get(stream)
            second_bookmark_key_value = second_sync_bookmarks.get(
                'bookmarks').get(stream)

            # Verify the first sync sets a bookmark of the expected form
            self.assertIsNotNone(first_bookmark_key_value)
            self.assertIsNotNone(
                first_bookmark_key_value.get(sub_level_replication_key))

            # Verify the second sync sets a bookmark of the expected form
            self.assertIsNotNone(second_bookmark_key_value)
            self.assertIsNotNone(
                second_bookmark_key_value.get(sub_level_replication_key))

            # bookmarked states (actual values)
            first_bookmark_value = first_bookmark_key_value.get(
                sub_level_replication_key)
            second_bookmark_value = second_bookmark_key_value.get(
                sub_level_replication_key)

            # bookmarked values as utc for comparing against records
            first_bookmark_value_utc = self.convert_state_to_utc(
                first_bookmark_value)
            second_bookmark_value_utc = self.convert_state_to_utc(
                second_bookmark_value)

            # Verify the second sync bookmark is Equal to the first sync bookmark
            self.assertEqual(second_bookmark_value, first_bookmark_value
                             )  # assumes no changes to data during test

            # Verify the second sync records respect the previous (simulated) bookmark value
            simulated_bookmark_value = new_state['bookmarks'][stream][
                sub_level_replication_key]
            for message in second_sync_messages:
                replication_key_value = message.get('data').get(
                    top_level_replication_key).get(
                        sub_level_replication_key)
                # FIX: corrected typo "repect" -> "respect" in failure msg
                self.assertGreaterEqual(
                    replication_key_value,
                    simulated_bookmark_value,
                    msg=
                    "Second sync records do not respect the previous bookmark."
                )

            # Verify the first sync bookmark value is the max replication key value for a given stream
            for message in first_sync_messages:
                replication_key_value = message.get('data').get(
                    top_level_replication_key).get(
                        sub_level_replication_key)
                self.assertLessEqual(
                    replication_key_value,
                    first_bookmark_value_utc,
                    msg=
                    "First sync bookmark was set incorrectly, a record with a greater rep key value was synced"
                )

            # Verify the second sync bookmark value is the max replication key value for a given stream
            for message in second_sync_messages:
                replication_key_value = message.get('data').get(
                    top_level_replication_key).get(
                        sub_level_replication_key)
                self.assertLessEqual(
                    replication_key_value,
                    second_bookmark_value_utc,
                    msg=
                    "Second sync bookmark was set incorrectly, a record with a greater rep key value was synced"
                )

            # Verify the number of records in the 2nd sync is less than the first
            self.assertLess(second_sync_count, first_sync_count)

            # Verify at least 1 record was replicated in the second sync
            self.assertGreater(
                second_sync_count,
                0,
                msg="We are not fully testing bookmarking for {}".format(
                    stream))
def test_run(self):
    """Smoke test: discover, select, sync once, then check the bookmark.

    Verifies the check run succeeds, the expected streams are among the
    discovered catalogs, every selected stream syncs at least one row
    (which also exercises access-token refresh on expiry), and the
    resulting bookmark matches the configured expectation.
    """
    conn_id = connections.ensure_connection(self, payload_hook=None)

    # Discovery (check mode) must exit cleanly
    discovery_job = runner.run_check_mode(self, conn_id)
    discovery_exit = menagerie.get_exit_status(conn_id, discovery_job)
    menagerie.verify_check_exit_status(self, discovery_exit, discovery_job)

    # At least one catalog must be discovered
    catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    # Every expected check stream must appear among the discovered ids
    discovered_ids = {entry['tap_stream_id'] for entry in catalogs}
    subset = self.expected_check_streams().issubset(discovered_ids)
    self.assertTrue(
        subset,
        msg="Expected check streams are not subset of discovered catalog")

    # Select the catalogs we intend to sync, with all fields
    selected = [
        entry for entry in catalogs
        if entry.get('tap_stream_id') in self.expected_sync_streams()
    ]
    for entry in selected:
        annotated = menagerie.get_annotated_schema(conn_id,
                                                   entry['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, annotated, [], [])

    # Fresh state, then run the initial sync.  This also verifies access
    # token expiration handling: if it fails with an authentication
    # error, the refresh token was not replaced after expiring.
    menagerie.set_state(conn_id, {})
    sync_job = runner.run_sync_mode(self, conn_id)

    sync_exit = menagerie.get_exit_status(conn_id, sync_job)
    menagerie.verify_sync_exit_status(self, sync_exit, sync_job)

    # Every stream must have replicated at least one row
    row_counts = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    zero_count_streams = {
        stream
        for stream, count in row_counts.items() if count == 0
    }
    self.assertFalse(
        zero_count_streams,
        msg="The following streams did not sync any rows {}".format(
            zero_count_streams))

    # The persisted bookmark must match the configured expectation
    bookmark_props = configuration['bookmark']
    final_state = menagerie.get_state(conn_id)
    test_bookmark = final_state['bookmarks'][
        bookmark_props['bookmark_dict']][bookmark_props['bookmark_key']]
    print(test_bookmark)
    self.assertTrue(
        test_bookmark == bookmark_props['bookmark_timestamp'],
        msg="The bookmark value does not match the expected result")
def test_run(self):
    """Full-table replication test against a Postgres source.

    Runs discovery, checks the discovered column metadata exactly, syncs
    twice, and verifies record contents, message actions, and that the
    table version increases between full-table syncs while no LSN bookmark
    is written.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    print('Catalog', test_catalog)
    self.assertEqual('postgres_full_table_replication_test',
                     test_catalog['stream_name'])
    print("discovered streams are correct")
    print('checking discoverd metadata for public-postgres_full_table_test...')
    md = menagerie.get_annotated_schema(
        conn_id, test_catalog['stream_id'])['metadata']

    # Exact-match the discovered per-column metadata (inclusion,
    # selected-by-default, and source SQL datatype) plus the table-level
    # breadcrumb ().
    self.assertEqual(
        {('properties', 'our_varchar'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'character varying'},
         ('properties', 'our_boolean'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'boolean'},
         ('properties', 'our_real'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'real'},
         ('properties', 'our_uuid'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'uuid'},
         ('properties', 'our_bit'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'bit'},
         ('properties', 'OUR TS TZ'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'timestamp with time zone'},
         ('properties', 'our_varchar_10'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'character varying'},
         ('properties', 'our_store'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'hstore'},
         ('properties', 'our_citext'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'citext'},
         ('properties', 'OUR TIME'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'time without time zone'},
         ('properties', 'our_decimal'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'numeric'},
         ('properties', 'OUR TS'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'timestamp without time zone'},
         ('properties', 'our_jsonb'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'jsonb'},
         ('properties', 'OUR TIME TZ'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'time with time zone'},
         ('properties', 'our_text'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'text'},
         ('properties', 'OUR DATE'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'date'},
         ('properties', 'our_double'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'double precision'},
         (): {'is-view': False, 'schema-name': 'public', 'table-key-properties': ['id'], 'database-name': 'dev', 'row-count': 0},
         ('properties', 'our_bigint'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'bigint'},
         ('properties', 'id'): {'inclusion': 'automatic', 'selected-by-default': True, 'sql-datatype': 'integer'},
         ('properties', 'our_json'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'json'},
         ('properties', 'our_smallint'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'smallint'},
         ('properties', 'our_integer'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'integer'},
         ('properties', 'our_inet'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'inet'},
         ('properties', 'our_cidr'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'cidr'},
         ('properties', 'our_mac'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'macaddr'},
         ('properties', 'our_alignment_enum'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'alignment'},
         ('properties', 'our_money'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'money'}},
        metadata.to_map(md))

    # Select the stream with FULL_TABLE replication
    additional_md = [{
        "breadcrumb": [],
        "metadata": {'replication-method': 'FULL_TABLE'}
    }]
    selected_metadata = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog,
        menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
        additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream,
                     {'postgres_full_table_replication_test': 3})
    records_by_stream = runner.get_records_from_target_output()
    table_version = records_by_stream[
        'postgres_full_table_replication_test']['table_version']

    # Full table: activate_version, then the upserts, then a closing
    # activate_version.
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [0]['action'], 'activate_version')
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [1]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [2]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [3]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [4]['action'], 'activate_version')

    # verifications about individual records
    for table_name, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'],
            expected_schemas[table_name],
            msg="Persisted schema did not match expected schema for table `{}`.".format(table_name))

    # Expected first upsert; dynamic values are read back from the rows
    # inserted in setup (self.rec_1).
    expected_record_1 = {'our_decimal': decimal.Decimal('.01'),
                         'our_text': 'some text',
                         'our_bit': False,
                         'our_integer': 44100,
                         'our_double': decimal.Decimal('1.1'),
                         'id': 1,
                         'our_json': '{"secret": 55}',
                         'our_boolean': True,
                         'our_jsonb': '{"burgers": "good"}',
                         'our_bigint': 1000000,
                         'OUR TS': '1997-02-02T02:02:02.722184+00:00',
                         'OUR TS TZ': '1997-02-02T07:02:02.722184+00:00',
                         'OUR TIME': '12:11:10',
                         'OUR TIME TZ': '12:11:10-04:00',
                         'our_store': {"name": "betty", "size": "small"},
                         'our_smallint': 1,
                         'OUR DATE': '1998-03-04T00:00:00+00:00',
                         'our_varchar': 'our_varchar',
                         'our_uuid': self.rec_1['our_uuid'],
                         'our_real': decimal.Decimal('1.2'),
                         'our_varchar_10': 'varchar_10',
                         'our_citext': self.rec_1['our_citext'],
                         'our_inet': self.rec_1['our_inet'],
                         'our_cidr': self.rec_1['our_cidr'],
                         'our_mac': self.rec_1['our_mac'],
                         'our_alignment_enum': self.rec_1['our_alignment_enum'],
                         'our_money': '$100.11'}

    # Expected second upsert (values from self.rec_2).
    expected_record_2 = {'our_decimal': decimal.Decimal('.02'),
                         'OUR TIME': '10:09:08',
                         'our_text': 'some text 2',
                         'our_bit': True,
                         'our_integer': 44101,
                         'our_double': decimal.Decimal('1.1'),
                         'id': 2,
                         'our_json': '["nymn 77"]',
                         'our_boolean': True,
                         'our_jsonb': '{"burgers": "good++"}',
                         'our_bigint': 1000001,
                         'OUR TIME TZ': '10:09:08-04:00',
                         'our_store': {"name": "betty", "dances": "floor"},
                         'OUR TS TZ': '1987-03-03T08:03:03.733184+00:00',
                         'our_smallint': 2,
                         'OUR DATE': '1964-07-01T00:00:00+00:00',
                         'our_varchar': 'our_varchar 2',
                         'OUR TS': '1987-03-03T03:03:03.733184+00:00',
                         'our_uuid': self.rec_2['our_uuid'],
                         'our_real': decimal.Decimal('1.2'),
                         'our_varchar_10': 'varchar_10',
                         'our_citext': self.rec_2['our_citext'],
                         'our_inet': self.rec_2['our_inet'],
                         'our_cidr': self.rec_2['our_cidr'],
                         'our_mac': self.rec_2['our_mac'],
                         'our_alignment_enum': None,
                         'our_money': None}

    actual_record_1 = records_by_stream[
        'postgres_full_table_replication_test']['messages'][1]
    # NOTE(review): the failure message diffs actual_record_1.keys() (the
    # message envelope keys) rather than actual_record_1['data'].keys() —
    # looks like an oversight in the message only; verify.
    self.assertEqual(
        set(actual_record_1['data'].keys()),
        set(expected_record_1.keys()),
        msg="keys for expected_record_1 are wrong: {}".format(
            set(actual_record_1.keys()).symmetric_difference(
                set(expected_record_1.keys()))))
    for k, v in actual_record_1['data'].items():
        self.assertEqual(
            actual_record_1['data'][k],
            expected_record_1[k],
            msg="{} != {} for key {}".format(actual_record_1['data'][k],
                                             expected_record_1[k], k))

    actual_record_2 = records_by_stream[
        'postgres_full_table_replication_test']['messages'][2]
    self.assertEqual(
        set(actual_record_2['data'].keys()),
        set(expected_record_2.keys()),
        msg="keys for expected_record_2 are wrong: {}".format(
            set(actual_record_2.keys()).symmetric_difference(
                set(expected_record_2.keys()))))
    for k, v in actual_record_2['data'].items():
        self.assertEqual(
            actual_record_2['data'][k],
            expected_record_2[k],
            msg="{} != {} for key {}".format(actual_record_2['data'][k],
                                             expected_record_2[k], k))

    # We cast NaN's, +Inf, -Inf to NULL as wal2json does not support them
    # and now we are at least consistent(ly wrong)
    expected_record_3 = {'our_decimal': None,
                         'our_double': None,
                         'our_real': None}
    actual_record_3 = records_by_stream[
        'postgres_full_table_replication_test']['messages'][3]
    for k, v in expected_record_3.items():
        self.assertEqual(
            actual_record_3['data'][k],
            v,
            msg="{} != {} for key {}".format(actual_record_3['data'][k], v,
                                             k))
    print("records are correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    # full-table replication must not record an LSN
    self.assertIsNone(
        bookmark.get('lsn'),
        msg="expected bookmark for stream ROOT-CHICKEN to have NO lsn because we are using full-table replication")
    self.assertEqual(
        bookmark['version'],
        table_version,
        msg="expected bookmark for stream ROOT-CHICKEN to match version")

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN and get the same 3 records
    #----------------------------------------------------------------------
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream,
                     {'postgres_full_table_replication_test': 3})
    records_by_stream = runner.get_records_from_target_output()
    new_table_version = records_by_stream[
        'postgres_full_table_replication_test']['table_version']

    # Second full-table run: upserts first, activate_version last.
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [0]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [1]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [2]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_full_table_replication_test']['messages']
        [3]['action'], 'activate_version')

    # (redundant re-read of the same value; kept as-is)
    new_table_version = records_by_stream[
        'postgres_full_table_replication_test']['table_version']

    self.assertGreater(
        new_table_version,
        table_version,
        msg="table version {} didn't increate from {} on the second run".
        format(new_table_version, table_version))

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))
def test_run(self):
    """Discover, select every catalog, run a sync, and verify replicated
    row counts plus the bookmarks and offsets the tap persisted in state.
    """
    conn_id = connections.ensure_connection(self)

    #run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    #verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # Select all Catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog,
            menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    #clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    #verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    # sum() replaces reduce(lambda ...): same result, clearer, and it does
    # not raise on an empty sequence.
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(
        replicated_row_count,
        0,
        msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    max_bookmarks_from_records = runner.get_most_recent_records_from_target(
        self, self.expected_bookmarks(), self.get_properties()['start_date'])

    # Capture "now" exactly once so the derived date cannot straddle a
    # midnight boundary between successive utcnow() calls.
    utc_now = datetime.datetime.utcnow()
    start_of_today = utils.strftime(
        datetime.datetime(utc_now.year, utc_now.month, utc_now.day, 0, 0, 0,
                          0, datetime.timezone.utc))
    # These two streams bookmark at the start of the current day rather
    # than at the max record value.
    max_bookmarks_from_records['subscription_changes'] = start_of_today
    max_bookmarks_from_records['email_events'] = start_of_today

    #if we didn't replicate data, the bookmark should be the start_date
    for k in self.expected_bookmarks().keys():
        if max_bookmarks_from_records.get(k) is None:
            max_bookmarks_from_records[k] = utils.strftime(
                datetime.datetime(2017, 5, 1, 0, 0, 0, 0,
                                  datetime.timezone.utc))

    state = menagerie.get_state(conn_id)
    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())

    #verify bookmarks and offsets
    for k, v in sorted(list(self.expected_bookmarks().items())):
        for w in v:
            bk_value = bookmarks.get(k, {}).get(w)
            self.assertEqual(
                utils.strptime_with_tz(bk_value),
                utils.strptime_with_tz(max_bookmarks_from_records[k]),
                "Bookmark {} ({}) for stream {} should have been updated to {}"
                .format(bk_value, w, k, max_bookmarks_from_records[k]))
            print("bookmark {}({}) updated to {} from max record value {}".
                  format(k, w, bk_value, max_bookmarks_from_records[k]))

    for k, v in self.expected_offsets().items():
        self.assertEqual(
            bookmarks.get(k, {}).get('offset', {}),
            v,
            msg="unexpected offset found for stream {} {}. state: {}".format(
                k, v, state))
        print("offsets {} cleared".format(k))

    diff = bookmark_streams.difference(self.acceptable_bookmarks())
    self.assertEqual(
        len(diff),
        0,
        msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(
            diff, self.acceptable_bookmarks(), bookmarks))

    self.assertEqual(
        state.get('currently_syncing'), None,
        "Unexpected `currently_syncing` bookmark value: {} Expected: None".
        format(state.get('currently_syncing')))
def test_run(self):
    """
    Verify that a bookmark doesn't exist for the stream
    Verify that the second sync includes the same number or more records than the first sync
    Verify that all records in the first sync are included in the second sync
    Verify that the sync only sent records to the target for selected streams (catalogs)

    PREREQUISITE
    For EACH stream that is fully replicated there are multiple rows of data with
    different values for the replication key
    """
    CREATED_RECORDS = {x: [] for x in self.expected_streams()}
    UPDATED_RECORDS = {x: [] for x in self.expected_streams()}

    # Ensure data exists prior to test for all full table streams
    expected_records_1 = {x: [] for x in self.expected_streams()}
    for stream in self.expected_full_table_streams():
        existing_objects = self.client.get_all(stream)
        assert existing_objects, "Test data is not properly set for {}, test will fail.".format(stream)
        print("Data exists for stream: {}".format(stream))
        for obj in existing_objects:
            expected_records_1[stream].append(obj)

    conn_id = connections.ensure_connection(self)

    #run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    #verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Select all full table streams and no fields within streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    full_streams = {
        key
        for key, value in self.expected_replication_method().items()
        if value == self.FULL
    }
    our_catalogs = [
        catalog for catalog in found_catalogs
        if catalog.get('tap_stream_id') in full_streams
    ]
    self.select_all_streams_and_fields(conn_id,
                                       our_catalogs,
                                       select_all_fields=True)

    # Run a sync job using orchestrator
    first_sync_record_count = self.run_sync(conn_id)

    # verify that the sync only sent records to the target for selected
    # streams (catalogs)
    self.assertEqual(
        set(first_sync_record_count.keys()), full_streams,
        msg="Expect first_sync_record_count keys {} to equal full_streams {},"
            " first_sync_record_count was {}".format(
                first_sync_record_count.keys(), full_streams,
                first_sync_record_count))

    first_sync_state = menagerie.get_state(conn_id)

    # Get the set of records from a first sync
    first_sync_records = runner.get_records_from_target_output()

    # Create 1 new record for every full table stream
    N = 1  # number of creates/updates between syncs
    expected_records_2 = {x: [] for x in self.expected_streams()}
    for stream in self.streams_creatable():
        for _ in range(N):
            print("CREATING A RECORD FOR STREAM: {}".format(stream))
            new_object = self.client.create(stream)
            expected_records_2[stream].append(new_object)
            CREATED_RECORDS[stream].append(new_object)

    # Update 1 existing record for every full table stream
    for stream in self.streams_creatable():
        for _ in range(N):
            print("UDPATING A RECORD FOR STREAM: {}".format(stream))
            updated_object = self.client.update(stream)
            expected_records_2[stream].append(updated_object)
            UPDATED_RECORDS[stream].append(updated_object)

    # adjust expectations to include expected_records_1
    for stream in self.streams_creatable():
        for record in expected_records_1.get(stream):
            if record.get('eid') in [
                    ex_rec.get('eid')
                    for ex_rec in expected_records_2.get(stream, [])
            ]:
                continue  # don't add a record to expectations twice
            expected_records_2[stream].append(record)

    # Run a second sync job using orchestrator
    second_sync_record_count = self.run_sync(conn_id)

    # Get the set of records from a second sync
    second_sync_records = runner.get_records_from_target_output()

    # Loop first_sync_records and compare against second_sync_records;
    # each iteration of the loop is checking both syncs for one stream.
    for stream in full_streams:
        with self.subTest(stream=stream):

            # RECORD COUNT
            record_count_1 = first_sync_record_count.get(stream, 0)
            record_count_2 = second_sync_record_count.get(stream, 0)

            # ACTUAL RECORDS
            # NOTE: the .get(stream, ...) default must be a dict ({}), not
            # a list — a list default would raise AttributeError on the
            # chained .get('messages', []) when the stream is absent.
            records_from_sync_1 = set(
                row.get('data', {}).get('eid')
                for row in first_sync_records.get(stream, {}).get(
                    'messages', []))
            records_from_sync_2 = set(
                row.get('data', {}).get('eid')
                for row in second_sync_records.get(stream, {}).get(
                    'messages', []))

            # EXPECTED_RECORDS
            expected_records_from_sync_1 = set(
                record.get('eid')
                for record in expected_records_1.get(stream, []))
            expected_records_from_sync_2 = set(
                record.get('eid')
                for record in expected_records_2.get(stream, []))

            # verify there is no bookmark values from state
            state_value = first_sync_state.get("bookmarks", {}).get(stream)
            self.assertIsNone(state_value)

            # verify that there is more than 1 record of data - setup necessary
            self.assertGreater(
                record_count_1,
                1,
                msg="Data isn't set up to be able to test full sync")

            # verify that you get the same or more data the 2nd time around
            self.assertGreaterEqual(
                record_count_2,
                record_count_1,
                msg="second syc didn't have more records, full sync not verified")

            # verify all expected records were replicated for first sync
            self.assertEqual(
                set(),
                records_from_sync_1.symmetric_difference(
                    expected_records_from_sync_1),
                msg="1st Sync records do not match expectations.\n" +
                "MISSING RECORDS: {}\n".format(
                    expected_records_from_sync_1.symmetric_difference(
                        records_from_sync_1)) +
                "ADDITIONAL RECORDS: {}".format(
                    records_from_sync_1.symmetric_difference(
                        expected_records_from_sync_1)))

            # verify all data from 1st sync included in 2nd sync
            self.assertEqual(
                set(),
                records_from_sync_1.difference(records_from_sync_2),
                msg="Data in 1st sync missing from 2nd sync")

            # testing streams with new and updated data
            if stream in self.streams_creatable():

                # verify that the record count has increased by N record in
                # the 2nd sync, where N = the number of new records created
                # between sync 1 and sync 2
                self.assertEqual(
                    record_count_2,
                    record_count_1 + N,
                    msg="Expected {} new records to be captured by the 2nd sync.\n".format(N) +
                    "Record Count 1: {}\nRecord Count 2: {}".format(
                        record_count_1, record_count_2))

                # verify that the newly created and updated records are
                # captured by the 2nd sync
                self.assertEqual(
                    set(),
                    records_from_sync_2.symmetric_difference(
                        expected_records_from_sync_2),
                    msg="2nd Sync records do not match expectations.\n" +
                    "MISSING RECORDS: {}\n".format(
                        expected_records_from_sync_2.difference(
                            records_from_sync_2)) +
                    "ADDITIONAL RECORDS: {}".format(
                        records_from_sync_2.difference(
                            expected_records_from_sync_2)))

                # verify that the updated records are correctly captured by
                # the 2nd sync
                expected_updated_records = set(
                    record.get('eid')
                    for record in expected_records_2.get(stream, [])
                    if "UPDATED" in record.get('name', '').upper())
                if stream == 'segments':  # Account for 'display name' in segments
                    expected_updated_records.update(
                        set(
                            record.get('eid')
                            for record in expected_records_2.get(stream, [])
                            if "UPDATED" in record.get('display_name',
                                                       '').upper()))
                if expected_updated_records:
                    updated_records_from_sync_2 = set(
                        row.get('data', {}).get('eid')
                        for row in second_sync_records.get(stream, {}).get(
                            'messages', [])
                        if "UPDATED" in row.get('data', {}).get('name',
                                                                '').upper())
                    if stream == 'segments':  # Account for 'display name' in segments
                        updated_records_from_sync_2.update(
                            set(
                                row.get('data', {}).get('eid')
                                for row in second_sync_records.get(
                                    stream, {}).get('messages', [])
                                if "UPDATED" in row.get('data', {}).get(
                                    'display_name', '').upper()))

                    # check that the updated records are present in the target
                    self.assertEqual(
                        set(),
                        updated_records_from_sync_2.symmetric_difference(
                            expected_updated_records),
                        msg="Failed to replicate the updated {} record(s)\n".format(stream) +
                        "MISSING RECORDS: {}\n".format(
                            expected_updated_records.difference(
                                updated_records_from_sync_2)) +
                        "ADDITIONAL RECORDS: {}\n".format(
                            updated_records_from_sync_2.difference(
                                expected_updated_records)))

                    # check that the record data matches expectations
                    self.assertEqual(len(UPDATED_RECORDS.get(stream, [])),
                                     1,
                                     msg="Expectations are invalid")
                    updated_record = UPDATED_RECORDS.get(stream, []).pop()
                    record_name = [
                        row.get('data', {}).get('name')
                        for row in second_sync_records.get(stream, {}).get(
                            'messages', [])
                        if row.get('data', {}).get('eid') ==
                        updated_record.get('eid')
                    ]
                    expected_record_name = updated_record.get('name')
                    self.assertEqual(len(record_name),
                                     1,
                                     msg="Updated record was duplicated.")
                    self.assertEqual(
                        expected_record_name,
                        record_name.pop(),
                        msg="Update was not captured correctly.")
def test_run(self):
    """Incremental replication test against a Postgres view.

    Discovers the `chicken_view` stream, exact-matches its discovered
    metadata, selects it with INCREMENTAL replication keyed on
    `updated_at` (with `id` as the view-key-property), syncs once, and
    verifies the emitted record plus the persisted bookmark.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]

    self.assertEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    chicken_catalog = found_catalogs[0]

    self.assertEqual('chicken_view', chicken_catalog['stream_name'])
    print("discovered streams are correct")

    print('checking discoverd metadata for ROOT-CHICKEN_VIEW')
    md = menagerie.get_annotated_schema(
        conn_id, chicken_catalog['stream_id'])['metadata']

    # Views discover with no table-key-properties and every column
    # 'available' (nothing automatic).
    self.assertEqual(
        {
            (): {
                'database-name': 'postgres',
                'is-view': True,
                'row-count': 0,
                'schema-name': 'public',
                'table-key-properties': []
            },
            ('properties', 'fk_id'): {
                'inclusion': 'available',
                'sql-datatype': 'bigint',
                'selected-by-default': True
            },
            ('properties', 'name'): {
                'inclusion': 'available',
                'sql-datatype': 'character varying',
                'selected-by-default': True
            },
            ('properties', 'age'): {
                'inclusion': 'available',
                'sql-datatype': 'integer',
                'selected-by-default': True
            },
            ('properties', 'size'): {
                'inclusion': 'available',
                'sql-datatype': 'character varying',
                'selected-by-default': True
            },
            ('properties', 'id'): {
                'inclusion': 'available',
                'sql-datatype': 'integer',
                'selected-by-default': True
            },
            ('properties', 'updated_at'): {
                'selected-by-default': True,
                'inclusion': 'available',
                'sql-datatype': 'timestamp with time zone'
            }
        }, metadata.to_map(md))

    # 'ID' selected as view-key-properties, updated_at is replication_key
    replication_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-key': 'updated_at',
            "replication-method": "INCREMENTAL",
            'view-key-properties': ["id"]
        }
    }]

    connections.select_catalog_and_fields_via_metadata(
        conn_id, chicken_catalog,
        menagerie.get_annotated_schema(conn_id,
                                       chicken_catalog['stream_id']),
        replication_md)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    self.assertEqual(record_count_by_stream, {'chicken_view': 1})
    records_by_stream = runner.get_records_from_target_output()

    table_version = records_by_stream['chicken_view']['table_version']
    # incremental sync: one activate_version followed by one upsert
    self.assertEqual(2, len(records_by_stream['chicken_view']['messages']))
    self.assertEqual(
        records_by_stream['chicken_view']['messages'][0]['action'],
        'activate_version')
    self.assertEqual(
        records_by_stream['chicken_view']['messages'][1]['action'],
        'upsert')

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    actual_chicken_record = records_by_stream['chicken_view']['messages'][
        1]['data']

    expected_chicken_record = {
        'id': 1,
        'fk_id': 1,
        'name': 'fred',
        'age': 99,
        'updated_at': '2111-01-01T12:12:12.222111+00:00',
        'size': 'big'
    }
    self.assertEqual(
        actual_chicken_record,
        expected_chicken_record,
        msg=
        "Expected `various_types` upsert record data to be {}, but target output {}"
        .format(expected_chicken_record, actual_chicken_record))

    print("records are correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)

    chicken_bookmark = state['bookmarks']['postgres-public-chicken_view']
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertEqual(
        chicken_bookmark['version'],
        table_version,
        msg="expected bookmark for stream ROOT-CHICKEN to match version")
    # bookmark records the replication key and the max value seen
    self.assertEqual(chicken_bookmark['replication_key'], 'updated_at')
    self.assertEqual(chicken_bookmark['replication_key_value'],
                     '2111-01-01T12:12:12.222111+00:00')
    print("bookmarks are correct")
def test_run(self):
    """Discover, select catalogs with only the `name` sub-field on the
    tags stream, run a sync, and verify rows replicated with every
    primary key present in each message.
    """
    # Default test setup
    # Create the connection for Zendesk
    conn_id = connections.ensure_connection(self)

    # Run a check job using orchestrator
    check_job_name = runner.run_check_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify schemas discovered were discovered
    self.found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertEqual(len(self.found_catalogs),
                     len(self.expected_check_streams()))

    # Verify the schemas discovered were exactly what we expect
    found_catalog_names = {
        catalog['tap_stream_id']
        for catalog in self.found_catalogs
        if catalog['tap_stream_id'] in self.expected_check_streams()
    }
    self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

    # Select our catalogs
    our_catalogs = [
        c for c in self.found_catalogs
        if c.get('tap_stream_id') in self.expected_sync_streams()
    ]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(
            conn_id, c['stream_id'])
        # Tags table only has name and count columns; don't select count
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, c_annotated, [], ['name'])

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    # sum() replaces the reduce(lambda ...) accumulation: same result,
    # and it does not raise on an empty sequence.
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count,
                       0,
                       msg="failed to replicate any data: {}".format(
                           record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records are retrieving the sub set of fields
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream).get('messages')
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(
                    m.get('data', {}).get(pk),
                    msg="Missing primary-key for message {}".format(m))
def test_run(self):
    """End-to-end INCREMENTAL replication test for the MongoDB tap.

    Flow: discovery -> initial sync -> second sync after replication-key
    updates and inserts -> sync after a NON-replication-key update (should
    replicate nothing new) -> third sync after changing replication
    method/key (should trigger a full resync).
    """
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # ----------- Discovery ----------
    # -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalog(conn_id)
    found_catalogs = menagerie.get_catalogs(conn_id)
    found_streams = {entry['tap_stream_id'] for entry in catalog['streams']}
    self.assertSetEqual(self.expected_check_streams(), found_streams)

    # verify the tap discovered stream metadata is consistent with the source database
    for tap_stream_id in self.expected_check_streams():
        with self.subTest(stream=tap_stream_id):
            # gather expectations; tap_stream_id has the form '<db>-<collection>'
            stream = tap_stream_id.split('-')[1]
            expected_primary_key = self.expected_pks()[stream]
            expected_row_count = self.expected_row_counts()[stream]
            expected_replication_keys = self.expected_valid_replication_keys()[stream]

            # gather results: the top-level (empty breadcrumb) metadata entry
            found_stream = [entry for entry in catalog['streams'] if entry['tap_stream_id'] == tap_stream_id][0]
            stream_metadata = [entry['metadata'] for entry in found_stream['metadata'] if entry['breadcrumb']==[]][0]
            primary_key = set(stream_metadata.get('table-key-properties'))
            row_count = stream_metadata.get('row-count')
            replication_key = set(stream_metadata.get('valid-replication-keys'))

            # assert that the pks are correct
            self.assertSetEqual(expected_primary_key, primary_key)

            # assert that the row counts are correct
            self.assertEqual(expected_row_count, row_count)

            # assert that valid replication keys are correct
            self.assertSetEqual(replication_key, expected_replication_keys)

    # -----------------------------------
    # ----------- Initial Sync ---------
    # -----------------------------------

    # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata.
    # Each key-specific collection uses the matching key name as its
    # replication key; everything else falls back to 'date_field'.
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
        rep_key = 'date_field'
        for key in self.key_names():
            if key in stream_catalog['stream_name']:
                rep_key = key
        additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL', 'replication-key': rep_key}}]
        # return value is not asserted on
        selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, stream_catalog, annotated_schema, additional_md)

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # gather expectations: the tap emits a bare object schema
    expected_schema = {'type': 'object'}
    for tap_stream_id in self.expected_sync_streams():
        with self.subTest(stream=tap_stream_id):
            # gather results
            persisted_schema = messages_by_stream[tap_stream_id]['schema']

            # assert the schema is an object
            self.assertDictEqual(expected_schema, persisted_schema)

    # verify that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # verify that the entire collection was synced by comparing row counts against the source
    for tap_stream_id in self.expected_sync_streams():
        with self.subTest(stream=tap_stream_id):
            expected_row_count = self.expected_row_counts()[tap_stream_id]
            row_count = record_count_by_stream[tap_stream_id]
            self.assertEqual(expected_row_count, row_count)

    # verify state is saved in the proper format for all streams
    state = menagerie.get_state(conn_id)
    expected_state_keys = {
        'last_replication_method',
        'replication_key_name',
        'replication_key_type',
        'replication_key_value',
        'version',
    }
    for tap_stream_id in self.expected_check_streams():
        with self.subTest(stream=tap_stream_id):
            bookmark = state['bookmarks'][tap_stream_id]

            # gather expectations
            stream = tap_stream_id.split('-')[1]
            expected_replication_keys = self.expected_valid_replication_keys()[stream]

            # gather results
            replication_key = bookmark['replication_key_name']
            replication_key_type = bookmark['replication_key_type']

            # assert that all expected bookmark keys are present
            self.assertSetEqual(expected_state_keys, set(bookmark.keys()))

            # assert all bookmark keys have values
            for key in expected_state_keys:
                self.assertIsNotNone(bookmark[key])

            # assert incremental sync was performed
            self.assertEqual('INCREMENTAL', bookmark['last_replication_method'])

            # assert the replication key was used to save state
            self.assertIn(replication_key, expected_replication_keys)

            # assert the replication key type is a valid datatype
            self.assertIn(replication_key_type, VALID_REPLICATION_TYPES)

    self.assertIsNone(state['currently_syncing'])

    # -----------------------------------
    # ------------ Second Sync ----------
    # -----------------------------------

    # Perform data manipulations
    # NOTE(review): the rest of the second-sync section stays inside this
    # connection context so `client` remains usable for the
    # non-replication-key update further down -- confirm against the
    # original file's indentation.
    with get_test_connection() as client:
        # update 1 document in each of the collection
        update_doc_coll_1 = client["simple_db"]["simple_coll_1"].find_one()
        client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": update_doc_coll_1["_id"]}, {"$set": {"date_field": datetime(2020, 1, 1, 19, 29, 14, 578000)}})

        update_doc_coll_2 = client["simple_db"]["simple_coll_2"].find_one()
        client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": update_doc_coll_2["_id"]}, {"$set": {"date_field": datetime(2020, 1, 1, 19, 29, 14, 578000)}})

        # bump the replication key past the current max on one document per
        # key-specific collection so it is picked up by the next sync
        for key_name in self.key_names():
            if (key_name == 'int_field'):
                # get the first document in the collection to update
                doc_to_update = client["simple_db"]["simple_coll_{}".format(key_name)].find_one(sort=[("{}".format(key_name), -1)])
                value = doc_to_update["{}".format(key_name)]
                int_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": int_based_coll["_id"]}, {"$set": {"{}".format(key_name): value+3}})
            elif (key_name == 'double_field'):
                doc_to_update = client["simple_db"]["simple_coll_{}".format(key_name)].find_one(sort=[("{}".format(key_name), -1)])
                value = doc_to_update["{}".format(key_name)]
                double_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": double_based_coll["_id"]}, {"$set": {"{}".format(key_name): value+3}})
            elif (key_name == '64_bit_int_field'):
                doc_to_update = client["simple_db"]["simple_coll_{}".format(key_name)].find_one(sort=[("{}".format(key_name), -1)])
                value = doc_to_update["{}".format(key_name)]
                bit64_int_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": bit64_int_based_coll["_id"]}, {"$set": {"{}".format(key_name): value+3}})
            elif (key_name == 'date_field'):
                date_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": date_based_coll["_id"]}, {"$set": {"{}".format(key_name): datetime(2021, 1, 1, 15, 30, 14, 222000)}})
            elif (key_name == 'timestamp_field'):
                timestamp_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": timestamp_based_coll["_id"]}, {"$set": {"{}".format(key_name): bson.timestamp.Timestamp(1565897157+99, 1)}})
            # TODO : figure out how to update collections with replication key = string, uuid

        # insert two documents with date_field > bookmark for next sync
        client["simple_db"]["simple_coll_1"].insert_one({
            "int_field": 50,
            "string_field": z_string_generator(),
            "date_field": datetime(2018, 9, 13, 19, 29, 14, 578000),
            "double_field": 51.001,
            "timestamp_field": bson.timestamp.Timestamp(1565897157+50, 1),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282050'),
            "64_bit_int_field": 34359738368 + 50
        })
        client["simple_db"]["simple_coll_1"].insert_one({
            "int_field": 51,
            "string_field": z_string_generator(),
            "date_field": datetime(2018, 9, 18, 19, 29, 14, 578000),
            "double_field": 52.001,
            "timestamp_field": bson.timestamp.Timestamp(1565897157+51, 1),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282051'),
            "64_bit_int_field": 34359738368 + 51
        })

        client["simple_db"]["simple_coll_2"].insert_one({
            "int_field": 100,
            "string_field": z_string_generator(),
            "date_field": datetime(2019, 5, 21, 19, 29, 14, 578000),
            "double_field": 101.001,
            "timestamp_field": bson.timestamp.Timestamp(1565897157+100, 1),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282100'),
            "64_bit_int_field": 34359738368 + 100
        })
        client["simple_db"]["simple_coll_2"].insert_one({
            "int_field": 101,
            "string_field": z_string_generator(),
            "date_field": datetime(2019, 5, 26, 19, 29, 14, 578000),
            "double_field": 102.001,
            "timestamp_field": bson.timestamp.Timestamp(1565897157+101, 1),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282101'),
            "64_bit_int_field": 34359738368 + 101
        })

        # two new documents for every key-specific collection as well
        for key_name in self.key_names():
            client["simple_db"]["simple_coll_{}".format(key_name)].insert_one({
                "int_field": 50,
                "string_field": z_string_generator(50),
                "date_field": datetime(2018, 9, 13, 19, 29, 15, 578000),
                "double_field": 51.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+50, 1),
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282050'),
                "64_bit_int_field": 34359738368 + 50
            })
            client["simple_db"]["simple_coll_{}".format(key_name)].insert_one({
                "int_field": 51,
                "string_field": z_string_generator(51),
                "date_field": datetime(2018, 9, 18, 19, 29, 16, 578000),
                "double_field": 52.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+51, 1),
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282051'),
                "64_bit_int_field": 34359738368 + 51
            })

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct; keep only upsert messages
        messages_by_stream = runner.get_records_from_target_output()
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert']

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that we got 4 records for each stream (2 because of the new records, 1 because of update and 1 because of greater than equal [for key based incremental there will always be an overlap on the bookmark value])
        for k, v in record_count_by_stream.items():
            # Workaround for not including collections for uuid and string, TODO : look for a solution to implement string and uuid as replication_key
            if k not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                self.assertEqual(4, v)

        # Verify that the _id of the records sent are the same set as the
        # _ids of the documents changed
        for stream_name in self.expected_sync_streams():
            # Workaround for not including collections for uuid and string, TODO : look for a solution to implement string and uuid as replication_key
            if stream_name not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                actual = set([x['data']['int_field'] for x in records_by_stream[stream_name]])
                self.assertEqual(self.expected_incremental_int_fields()[stream_name], actual)

        ##############################################################################
        # Verify that data is not replicated when non replication key is updated
        ##############################################################################

        # Sampling a document from a collection which we know it exists because of the data set up
        no_rep_doc_coll_1 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 20})
        client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": no_rep_doc_coll_1["_id"]}, {"$set": {"string_field": 'No_replication'}})

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        messages_by_stream = runner.get_records_from_target_output()
        # captured for reference; not asserted on below
        second_state = menagerie.get_state(conn_id)

        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert']

        doc_from_simple_coll_1 = records_by_stream['simple_coll_1']

        # Verify the document from simple_coll_1 does not correspond to the document which we updated_data
        self.assertNotEqual(doc_from_simple_coll_1[0]['data']['_id'], no_rep_doc_coll_1["_id"])

        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that we got 1 record for each stream (1 because of greater than equal [for key based incremental there will always be an overlap on the bookmark value])
        for k, v in record_count_by_stream.items():
            if k not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                self.assertEqual(1, v)

    # -----------------------------------
    # ------------ Third Sync -----------
    # -----------------------------------

    # Change the replication method for simple_coll_1
    # Change the replication key for simple_coll_2
    # Make sure both do full resync
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
        additional_md = []
        if stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_1':
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}]
        elif stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_2':
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL', 'replication-key': 'timestamp_field'}}]
        else:
            # key-specific collections: derive the replication key from the stream name
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL', 'replication-key': stream_catalog['stream_name'].replace('simple_coll_', '')}}]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, stream_catalog, annotated_schema, additional_md)

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # changing replication method/key forces a full resync for every stream
    self.assertDictEqual(record_count_by_stream, self.expected_last_sync_row_counts())
def test_run(self):
    """Sync with the first five non-automatic fields of every stream (except
    'ads', which is not selected at all) de-selected and verify that none of
    the excluded fields reach the target.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    # verify discovery matched the expected streams exactly
    found_catalog_names = {c['tap_stream_id'] for c in found_catalogs}
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    all_excluded_fields = {}
    # select all catalogs except 'ads', de-selecting up to five
    # non-automatic fields per stream
    for c in found_catalogs:
        if c['stream_name'] == 'ads':
            continue
        discovered_schema = menagerie.get_annotated_schema(
            conn_id, c['stream_id'])['annotated-schema']
        # NOTE(review): set ordering is arbitrary, so which five fields are
        # excluded can vary between runs
        all_excluded_fields[c['stream_name']] = list(
            set(discovered_schema.keys()) -
            self.expected_automatic_fields().get(c['stream_name'], set())
        )[:5]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, discovered_schema,
            non_selected_fields=all_excluded_fields[c['stream_name']])

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    # sum() replaces reduce(): same total, but an empty count dict now yields
    # 0 (a clean assertion failure) instead of raising TypeError
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(
                           record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()

    # the 'ads' stream was never selected, so it must not be replicated
    self.assertNotIn('ads', synced_records.keys())

    # verify no record contains any of the fields we de-selected
    for stream_name, data in synced_records.items():
        record_messages = [
            set(row['data'].keys()) for row in data['messages']
        ]
        for record_keys in record_messages:
            # The intersection should be empty
            self.assertFalse(
                record_keys.intersection(all_excluded_fields[stream_name]))
def test_run(self):
    """FULL_TABLE replication test: the initial full-table sync emits
    activate_version first and last; a second full-table sync (after
    inserting rows) bumps the table version and emits activate_version
    only at the end.
    """
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # ----------- Discovery ----------
    # -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    # -------------------------------------------
    # ----------- First full Table Sync ---------
    # -------------------------------------------
    # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{"breadcrumb": [],
                          "metadata": {'replication-method': 'FULL_TABLE'}}]
        # return value intentionally ignored (was an unused local)
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # run full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # check exit status
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # streams that we synced are the ones that we expect to see
    records_by_stream = runner.get_records_from_target_output()
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # assert that we get the correct number of records for each stream
    self.assertEqual(self.expected_row_counts(), record_count_by_stream)

    # assert that an activate_version message is first and last message sent for each stream
    for stream_name in self.expected_sync_streams():
        self.assertEqual(
            'activate_version',
            records_by_stream[stream_name]['messages'][0]['action'])
        self.assertEqual(
            'activate_version',
            records_by_stream[stream_name]['messages'][-1]['action'])

    state = menagerie.get_state(conn_id)
    first_versions = {}
    for tap_stream_id in self.expected_check_streams():
        # state has an initial_full_table_complete == True
        self.assertTrue(
            state['bookmarks'][tap_stream_id]['initial_full_table_complete'])
        # there is a version bookmark in state
        first_versions[tap_stream_id] = \
            state['bookmarks'][tap_stream_id]['version']
        self.assertIsNotNone(first_versions[tap_stream_id])

    # -------------------------------------------
    # ----------- Second full Table Sync --------
    # -------------------------------------------
    # add 2 rows and run full table again, make sure we get initial number + 2
    with get_test_connection() as client:
        client["simple_db"]["simple_coll_1"].insert_many(
            generate_simple_coll_docs(2))
        client["simple_db"]["simple_coll_2"].insert_many(
            generate_simple_coll_docs(2))
        client["admin"]["admin_coll_1"].insert_many(
            generate_simple_coll_docs(2))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # check exit status
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    state = menagerie.get_state(conn_id)

    # Verify that menagerie state does not include a key for currently syncing
    self.assertIsNone(state['currently_syncing'])

    # Verify that menagerie state does not include a key for oplog based syncing
    self.assertNotIn('oplog', state)

    # assert that we have correct number of records (including the two new
    # records); coll_3 and coll_4 are special-cased below
    new_expected_row_counts = {
        k: v + 2 for k, v in self.expected_row_counts().items()
        if k not in ['simple_db_simple_coll_3', 'simple_db_simple_coll_4']}
    new_expected_row_counts['simple_db_simple_coll_3'] = 0
    new_expected_row_counts['simple_db_simple_coll_4'] = 5
    self.assertEqual(new_expected_row_counts, record_count_by_stream)

    # assert that we only have an ActivateVersionMessage as the last message and not the first
    for stream_name in self.expected_sync_streams():
        if len(records_by_stream[stream_name]['messages']) > 1:
            self.assertNotEqual(
                'activate_version',
                records_by_stream[stream_name]['messages'][0]['action'],
                stream_name + "failed")
            self.assertEqual(
                'upsert',
                records_by_stream[stream_name]['messages'][0]['action'],
                stream_name + "failed")
        self.assertEqual(
            'activate_version',
            records_by_stream[stream_name]['messages'][-1]['action'],
            stream_name + "failed")

    second_versions = {}
    for tap_stream_id in self.expected_check_streams():
        # state has an initial_full_table_complete == True
        self.assertTrue(
            state['bookmarks'][tap_stream_id]['initial_full_table_complete'])

        # version bookmark
        second_versions[tap_stream_id] = \
            state['bookmarks'][tap_stream_id]['version']
        self.assertIsNotNone(second_versions[tap_stream_id])

        # version in this state is different than that of the previous state
        self.assertNotEqual(first_versions[tap_stream_id],
                            second_versions[tap_stream_id])

        # version which is larger than the previous target version
        self.assertTrue(second_versions[tap_stream_id] >
                        first_versions[tap_stream_id])

        # verify that menagerie state does include the version which matches the target version
        self.assertEqual(
            records_by_stream[
                self.tap_stream_id_to_stream()[tap_stream_id]]['table_version'],
            second_versions[tap_stream_id])
def test_run(self):
    """
    Verify that for each stream you can get data when no fields are selected
    and only the automatic fields are replicated.
    """
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # ensure data exists for sync streams and set expectations
    expected_records = {x: [] for x in self.expected_streams()}  # ids by stream
    for stream in self.testable_streams():
        existing_objects = self.client.get_all(stream)
        if existing_objects:
            print("Data exists for stream: {}".format(stream))
            automatic_fields = self.expected_automatic_fields().get(stream)
            for obj in existing_objects:
                expected_records[stream].append(
                    {field: obj.get(field) for field in automatic_fields})
        else:
            print("Data does not exist for stream: {}".format(stream))
            # self.fail() replaces `assert None, ...`: plain asserts are
            # stripped under `python -O` and would silently pass there
            self.fail("more test functionality needed")

    # Instantiate connection with default start/end dates
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = {c['tap_stream_id'] for c in found_catalogs}
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    for cat in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(
            conn_id, cat['stream_id'])

        # Verify that pks, rep keys, foreign keys have inclusion of automatic (metadata and annotated schema).
        for k in self.expected_automatic_fields().get(cat['stream_name']):
            mdata = next(
                (m for m in catalog_entry['metadata']
                 if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                None)
            print("Validating inclusion on {}: {}".format(
                cat['stream_name'], mdata))
            self.assertTrue(
                mdata and mdata['metadata']['inclusion'] == 'automatic')

    # Deselect all available fields from all streams, keep automatic fields
    self.select_all_streams_and_fields(conn_id=conn_id,
                                       catalogs=found_catalogs,
                                       select_all_fields=False)

    # Ensure our selection worked
    for cat in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(
            conn_id, cat['stream_id'])

        # Verify all streams are selected
        selected = catalog_entry.get('annotated-schema').get('selected')
        print("Validating selection on {}: {}".format(
            cat['stream_name'], selected))
        self.assertTrue(selected, msg="Stream not selected.")

        # Verify only automatic fields are selected
        for field, field_props in catalog_entry.get(
                'annotated-schema').get('properties').items():
            field_selected = field_props.get('selected')
            print("\tValidating selection on {}.{}: {}".format(
                cat['stream_name'], field, field_selected))
            if field in self.expected_automatic_fields().get(
                    cat['stream_name']):
                # NOTE: AUTOMATIC FIELDS IGNORE THE SELECTED md {'selected': None}
                print(
                    "NOTE: selection for {} is ignored by the Transformer "
                    .format(field) + " so long as 'inlcusion' = 'automatic'")
            else:
                self.assertFalse(
                    field_selected,
                    msg="Field is selected but not automatic.")

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # read target output
    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys())
    # sum() replaces reduce(): same total, but an empty count dict now yields
    # 0 instead of raising TypeError
    replicated_row_count = sum(first_record_count_by_stream.values())
    synced_records = runner.get_records_from_target_output()

    # Verify target has records for all synced streams
    for stream, count in first_record_count_by_stream.items():
        # assertIn replaces a bare `assert` (stripped under `python -O`)
        self.assertIn(stream, self.expected_streams())
        self.assertGreater(
            count,
            0,
            msg="failed to replicate any data for: {}".format(stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Test by Stream
    for stream in self.testable_streams():
        with self.subTest(stream=stream):
            data = synced_records.get(stream)
            record_messages_keys = [
                set(row['data'].keys()) for row in data['messages']
            ]
            expected_keys = self.expected_automatic_fields().get(stream)

            # Verify that only the automatic fields are sent to the target
            for actual_keys in record_messages_keys:
                self.assertEqual(
                    actual_keys.symmetric_difference(expected_keys), set(),
                    msg="Expected automatic fields and nothing else.")

            actual_records = [row['data'] for row in data['messages']]

            # Verify the number of records match expectations
            self.assertEqual(len(expected_records.get(stream)),
                             len(actual_records),
                             msg="Number of actual records do match expectations. " +\
                             "We probably have duplicate records.")

            # verify by values, that we replicated the expected records
            for actual_record in actual_records:
                self.assertIn(
                    actual_record, expected_records.get(stream),
                    msg="Actual record missing from expectations")
            for expected_record in expected_records.get(stream):
                self.assertIn(expected_record, actual_records,
                              msg="Expected record missing from target.")
def test_run(self):
    """Verify a FULL_TABLE sync resumes correctly from a synthesized
    interrupted state: it picks up at `last_id_fetched`, stops at
    `max_id_value`, and clears the interruption bookmarks when done.
    """
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # ----------- Discovery ----------
    # -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    for tap_stream_id in self.expected_check_streams():
        found_stream = [c for c in found_catalogs
                        if c['tap_stream_id'] == tap_stream_id][0]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[found_stream['stream_name']],
            set(found_stream.get('metadata', {}).get('table-key-properties')))

        # assert that the row counts are correct
        self.assertEqual(
            self.expected_row_counts()[found_stream['stream_name']],
            found_stream.get('metadata', {}).get('row-count'))

    # -----------------------------------
    # ----------- Full Table Sync -------
    # -----------------------------------
    # select simple_coll_1 stream and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{"breadcrumb": [],
                          "metadata": {'replication-method': 'FULL_TABLE'}}]
        # return value intentionally ignored (was an unused local)
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # synthesize interrupted state: coll_1 bookmarks int ids, coll_2
    # bookmarks base64-encoded bytes ids, exercising both id types
    interrupted_state = {
        'currently_syncing': 'simple_db-simple_coll_1',
        'bookmarks': {
            'simple_db-simple_coll_1': {
                'max_id_value': 49,
                'max_id_type': 'int',
                'initial_full_table_complete': False,
                'last_id_fetched': 25,
                'last_id_fetched_type': 'int',
                'version': int(time.time() * 1000)},
            'simple_db-simple_coll_2': {
                'max_id_value': base64.b64encode("test {}".format(49).encode()),
                'max_id_type': 'bytes',
                'initial_full_table_complete': False,
                'last_id_fetched': base64.b64encode("test {}".format(25).encode()),
                'last_id_fetched_type': 'bytes',
                'version': int(time.time() * 1000)}}}

    menagerie.set_state(conn_id, interrupted_state)

    runner.run_sync_mode(self, conn_id)

    # streams that we synced are the ones that we expect to see
    records_by_stream = runner.get_records_from_target_output()
    # examine_target_output_file validates pks as a side effect; its count
    # return value is not needed here (was an unused local)
    runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # ActivateVersionMessage as the last message and not the first
    for stream_name in self.expected_sync_streams():
        self.assertNotEqual(
            'activate_version',
            records_by_stream[stream_name]['messages'][0]['action'])
        self.assertEqual(
            'activate_version',
            records_by_stream[stream_name]['messages'][-1]['action'])

    # _id of the first record sync'd for each stream is the bookmarked
    # last_id_fetched from the interrupted_state passed to the tap
    self.assertEqual(
        records_by_stream['simple_coll_1']['messages'][0]['data']['_id'],
        int(interrupted_state['bookmarks']
            ['simple_db-simple_coll_1']['last_id_fetched']))

    # _id of the last record sync'd for each stream is the bookmarked
    # max_id_value from the interrupted_state passed to the tap
    self.assertEqual(
        records_by_stream['simple_coll_1']['messages'][-2]['data']['_id'],
        int(interrupted_state['bookmarks']
            ['simple_db-simple_coll_1']['max_id_value']))

    # assert that final state has no last_id_fetched and max_id_value bookmarks
    final_state = menagerie.get_state(conn_id)
    for tap_stream_id in self.expected_check_streams():
        self.assertIsNone(
            final_state['bookmarks'][tap_stream_id].get('last_id_fetched'))
        self.assertIsNone(
            final_state['bookmarks'][tap_stream_id].get('max_id_value'))
def test_run(self):
    """Pagination test: sync all expected streams and verify each stream
    replicates more records than the configured page size (so at least two
    pages were fetched) without duplicating any record across pages.

    Fix: corrected the typo "gaurantee" -> "guarantee" in the pagination
    assertion message.
    """
    conn_id = self.ensure_connection()

    # Run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    # Select only the expected streams tables
    expected_streams = self.expected_streams()
    catalog_entries = [
        ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams
    ]
    self.select_all_streams_and_fields(conn_id, catalog_entries)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Examine target file
    sync_records = runner.get_records_from_target_output()
    sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())

    # Test by stream
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            expected_count = self.minimum_record_count_by_stream().get(stream)
            record_count = sync_record_count.get(stream, 0)
            sync_messages = sync_records.get(stream, {
                'messages': []
            }).get('messages')
            # NOTE(review): set.pop() removes an arbitrary element; this
            # assumes every stream has a single-field primary key -- confirm.
            primary_key = self.expected_primary_keys().get(stream).pop()

            # Verify the sync meets or exceeds the default record count
            self.assertLessEqual(expected_count, record_count)

            # Verify the number of records exceeds the max_results (api limit)
            pagination_threshold = int(
                self.get_properties().get(page_size_key))
            self.assertGreater(
                record_count,
                pagination_threshold,
                msg="Record count not large enough to guarantee pagination.")

            # Verify we did not duplicate any records across pages:
            # the set of primary-key values must have the same cardinality
            # as the list of primary-key values
            records_pks_set = {
                message.get('data').get(primary_key)
                for message in sync_messages
            }
            records_pks_list = [
                message.get('data').get(primary_key)
                for message in sync_messages
            ]
            self.assertCountEqual(
                records_pks_set,
                records_pks_list,
                msg="We have duplicate records for {}".format(stream))
def test_run(self):
    """Start-date test: run two syncs against two connections whose
    properties differ in ``start_date``, then verify the sync with the
    older start date replicates at least as many records, and that every
    record's replication key value is on or after its sync's start date.
    """
    # SYNC 1
    conn_id = self.ensure_connection()

    # Run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    # Select only the expected streams tables
    expected_streams = self.expected_streams()
    catalog_entries = [
        ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams
    ]
    self.select_all_streams_and_fields(conn_id, catalog_entries)

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)
    first_sync_records = runner.get_records_from_target_output()
    first_sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # SYNC 2 (a second connection built with original=False properties)
    conn_id = self.ensure_connection(original=False)

    # Run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Select only the Accounts table
    found_catalogs = menagerie.get_catalogs(conn_id)
    catalog_entries = [
        ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams
    ]
    self.select_all_streams_and_fields(conn_id, catalog_entries)

    # Run in Sync mode
    sync_job_name = runner.run_sync_mode(self, conn_id)
    second_sync_records = runner.get_records_from_target_output()
    second_sync_record_count = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys())

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Test by stream
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            # record counts
            first_sync_count = first_sync_record_count.get(stream, 0)
            expected_first_sync_count = self.minimum_record_count_by_stream(
            ).get(stream)
            second_sync_count = second_sync_record_count.get(stream, 0)

            # record messages
            first_sync_messages = first_sync_records.get(
                stream, {
                    'messages': []
                }).get('messages')
            second_sync_messages = second_sync_records.get(
                stream, {
                    'messages': []
                }).get('messages')

            # start dates
            start_date_1 = self.get_properties()['start_date']
            start_date_2 = self.get_properties(
                original=False)['start_date']

            # Verify by stream that our first sync meets or exceeds the default record count
            self.assertLessEqual(expected_first_sync_count, first_sync_count)

            # Verify by stream more records were replicated in the first sync,
            # with an older start_date than the second
            self.assertGreaterEqual(first_sync_count, second_sync_count)

            # Verify by stream that all records have a rep key that is equal
            # to or greater than that sync's start_date.
            # NOTE(review): these are lexicographic string comparisons;
            # assumes 'LastUpdatedTime' and start_date share a sortable
            # ISO-8601-style format -- confirm against the tap's output.
            for message in first_sync_messages:
                rep_key_value = message.get('data').get('MetaData').get(
                    'LastUpdatedTime')
                self.assertGreaterEqual(
                    rep_key_value,
                    start_date_1,
                    msg=
                    "A record was replicated with a replication key value prior to the start date"
                )
            for message in second_sync_messages:
                rep_key_value = message.get('data').get('MetaData').get(
                    'LastUpdatedTime')
                self.assertGreaterEqual(
                    rep_key_value,
                    start_date_2,
                    msg=
                    "A record was replicated with a replication key value prior to the start date"
                )
def test_run(self):
    """End-to-end two-sync test with data manipulation between syncs.

    Flow:
      1. Ensure (or create) data for every testable sync stream and record
         expectations for sync 1.
      2. Create two comment actions (the only updatable action type).
      3. Discover, select all catalogs (validating automatic-field
         inclusion on the way), and run sync 1.
      4. Create, update, and re-query data between syncs to build
         expectations for sync 2, then run sync 2.
      5. Assert full-table streams replicate everything (including the new
         records) and incremental streams replicate exactly the expected
         records, including the updated comment text.
      6. Clean up 'boards' records down to a fixed remainder.
    """
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # ensure data exists for sync streams and set expectations
    expected_records_1 = {x: [] for x in self.expected_sync_streams()}  # ids by stream
    for stream in self.expected_sync_streams().difference(self.untestable_streams()):
        if stream in self.expected_incremental_sync_streams():
            # incremental streams are queried from the configured start date
            start_date = dt.strptime(self.get_properties().get('start_date'),
                                     self.START_DATE_FORMAT)
            since = start_date.strftime(self.TEST_TIME_FORMAT)
            _, existing_objects = utils.get_total_record_count_and_objects(stream, since=since)
        else:
            _, existing_objects = utils.get_total_record_count_and_objects(stream)

        if existing_objects:
            logging.info("Data exists for stream: {}".format(stream))
            for obj in existing_objects:  # add existing records to expectations
                expected_records_1[stream].append(obj)
            continue

        # Create 1 record if none exist
        logging.info("Data does not exist for stream: {}".format(stream))
        new_object = utils.create_object(stream)
        logging.info("Data generated for stream: {}".format(stream))
        expected_records_1[stream].append(new_object)

    # Create comment actions
    start_date = dt.strptime(self.get_properties().get('start_date'),
                             self.START_DATE_FORMAT)
    since = start_date.strftime(self.TEST_TIME_FORMAT)
    # count_before, before_records = utils.get_total_record_count_and_objects('actions', since=since)
    action_comments = []
    action_comments.append(utils.create_object('actions', action_type="comment"))
    action_comments.append(utils.create_object('actions', action_type="comment"))
    for action in action_comments:
        expected_records_1['actions'].append(action)
    # count_after, after_records = utils.get_total_record_count_and_objects('actions', since=since)

    # run in check mode
    conn_id = connections.ensure_connection(self)
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # select all catalogs, verifying each automatic field is marked 'automatic'
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        for k in self.expected_automatic_fields()[c['stream_name']]:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                         None)
            print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
            self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')
        connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify data was replicated
    record_count_by_stream_1 = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks()
    )
    replicated_row_count_1 = reduce(lambda accum, c: accum + c,
                                    record_count_by_stream_1.values())
    self.assertGreater(replicated_row_count_1, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream_1))
    print("total replicated row count: {}".format(replicated_row_count_1))

    # get emitted with records
    synced_records_1 = runner.get_records_from_target_output()

    # Verify bookmarks were saved for all streams
    state_1 = menagerie.get_state(conn_id)
    for stream in self.expected_incremental_sync_streams():
        self.assertTrue(state_1.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
    print("Bookmarks meet expectations")

    # Generate data between syncs for bookmarking streams
    print("Generating more data prior to 2nd sync")
    expected_records_2 = {x: [] for x in self.expected_sync_streams()}
    for stream in self.expected_full_table_sync_streams().difference(self.untestable_streams()):
        for _ in range(1):
            new_object = utils.create_object(stream)
            # only the automatic fields are kept as expectations
            expected_records_2[stream].append(
                {field: new_object.get(field)
                 for field in self.expected_automatic_fields().get(stream)})

    # Update a single comment action before second sync
    print("Updating existing data prior to 2nd sync")
    updated_records = {x: [] for x in self.expected_sync_streams()}
    action_id_to_update = random.choice(action_comments).get('id')
    updated_action = utils.update_object_action(obj_id=action_id_to_update)
    updated_records['actions'].append(updated_action)

    # Get new actions from data manipulation between syncs,
    # querying from the bookmark minus the lookback window
    print("Acquriing in-test actions prior to 2nd sync")
    for stream in self.expected_incremental_sync_streams().difference(self.untestable_streams()):
        state = dt.strptime(state_1.get('bookmarks').get(stream).get('window_start'),
                            self.TEST_TIME_FORMAT)
        since = (state - timedelta(days=self.LOOKBACK_WINDOW)).strftime(self.TEST_TIME_FORMAT)
        # start_date = dt.strptime(self.get_properties().get('start_date'), self.START_DATE_FORMAT)
        # since = start_date.strftime(self.TEST_TIME_FORMAT)
        _, objects = utils.get_total_record_count_and_objects(stream, since=since)
        for obj in objects:
            expected_records_2[stream].append(
                {field: obj.get(field)
                 for field in self.expected_automatic_fields().get(stream)})

    # Run another sync
    print("Running 2nd sync job")
    sync_job_name_2 = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status_2 = menagerie.get_exit_status(conn_id, sync_job_name_2)
    menagerie.verify_sync_exit_status(self, exit_status_2, sync_job_name_2)

    # verify data was replicated
    record_count_by_stream_2 = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks()
    )
    replicated_row_count_2 = reduce(lambda accum, c: accum + c,
                                    record_count_by_stream_2.values())
    self.assertGreater(replicated_row_count_2, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream_2))
    print("total replicated row count: {}".format(replicated_row_count_2))

    # get emitted with records
    synced_records_2 = runner.get_records_from_target_output()

    # Verify bookmarks were saved as expected inc streams
    state_2 = menagerie.get_state(conn_id)
    for stream in self.expected_incremental_sync_streams():
        self.assertTrue(state_2.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
    print("Bookmarks meet expectations")

    # TESTING FULL TABLE STREAMS
    for stream in self.expected_full_table_sync_streams().difference(self.untestable_streams()):
        with self.subTest(stream=stream):
            record_count_1 = record_count_by_stream_1.get(stream, 0)
            record_count_2 = record_count_by_stream_2.get(stream, 0)

            # Assert we have data for both syncs for full table streams
            self.assertGreater(record_count_1, 0)
            self.assertGreater(record_count_2, 0)

            # Assert that we are capturing the expected number of records for full table streams
            self.assertGreater(record_count_2, record_count_1,
                               msg="Full table streams should have more data in second sync.")
            self.assertEqual((record_count_2 - record_count_1),
                             len(expected_records_2.get(stream, [])),
                             msg="The differnce in record counts between syncs should " +\
                             "equal the number of records we created between syncs.\n" +\
                             "This is not the case for {}".format(stream))

            # Test that we are capturing the expected records for full table streams
            expected_ids_1 = set(record.get('id') for record in expected_records_1.get(stream))
            data_1 = synced_records_1.get(stream, [])
            record_messages_1 = [row.get('data') for row in data_1['messages']]
            record_ids_1 = set(row.get('data').get('id') for row in data_1['messages'])
            expected_ids_2 = set(record.get('id') for record in expected_records_2.get(stream))
            data_2 = synced_records_2.get(stream, [])
            record_messages_2 = [row.get('data') for row in data_2['messages']]
            record_ids_2 = set(row.get('data').get('id') for row in data_2['messages'])

            # verify all expected records are replicated for both syncs
            self.assertEqual(expected_ids_1, record_ids_1,
                             msg="Data discrepancy. Expected records do not match actual in sync 1.")
            self.assertTrue(expected_ids_1.issubset(record_ids_2),
                            msg="Data discrepancy. Expected records do not match actual in sync 2.")

            # verify the replicated field sets match expectations per record
            for expected_record in expected_records_1.get(stream):
                actual_record = [message for message in record_messages_1
                                 if message.get('id') == expected_record.get('id')].pop()
                self.assertEqual(set(expected_record.keys()), set(actual_record.keys()),
                                 msg="Field mismatch between expectations and replicated records in sync 1.")

            # verify the 2nd sync gets records created after the 1st sync
            self.assertEqual(set(record_ids_2).difference(set(record_ids_1)),
                             expected_ids_2,
                             msg="We did not get the new record(s)")
    print("Full table streams tested.")

    # TESTING INCREMENTAL STREAMS
    for stream in self.expected_incremental_sync_streams().difference(self.untestable_streams()):
        with self.subTest(stream=stream):
            record_count_1 = record_count_by_stream_1.get(stream, 0)
            record_count_2 = record_count_by_stream_2.get(stream, 0)

            # Assert we have data for both syncs for inc streams
            self.assertGreater(record_count_1, 0)
            self.assertGreater(record_count_2, 0)

            # Assert that we are capturing the expected number of records for inc streams
            self.assertEqual(record_count_1, len(expected_records_1.get(stream, [])),
                             msg="Stream {} replicated an unexpedted number records on 1st sync.".format(stream))
            self.assertEqual(record_count_2, len(expected_records_2.get(stream, [])),
                             msg="Stream {} replicated an unexpedted number records on 2nd sync.".format(stream))

            # Assert that we are capturing the expected records for inc streams
            data_1 = synced_records_1.get(stream, [])
            record_messages_1 = [row.get('data').get('id') for row in data_1['messages']]
            data_2 = synced_records_2.get(stream, [])
            record_messages_2 = [row.get('data').get('id') for row in data_2['messages']]
            for record in expected_records_1.get(stream):
                self.assertTrue(record.get('id') in record_messages_1,
                                msg="Missing an expected record from sync 1.")
            for record in expected_records_2.get(stream):
                self.assertTrue(record.get('id') in record_messages_2,
                                msg="Missing an expected record from sync 2.")

            record_data_1 = [row.get('data') for row in data_1['messages']]
            record_data_2 = [row.get('data') for row in data_2['messages']]

            # Testing action comments (the only action type that can be updated)
            for action in action_comments:
                # Get text value for action comment from sync 1
                original_action_text = ""
                for record in record_data_1:
                    if record.get('id') == action.get('id'):
                        original_action_text = record.get('data').get('text')
                assert original_action_text, "Record {} is missing from 1st sync.".format(action.get('id'))

                # Get text value for action comment from sync 2
                # NOTE(review): current_action_text is only bound inside the
                # loop below; if no record matches, the following assert
                # raises NameError (or reuses the previous iteration's
                # value) instead of failing cleanly -- worth confirming.
                for record in record_data_2:
                    if record.get('id') == action.get('id'):
                        current_action_text = record.get('data').get('text')
                assert current_action_text, "Record {} is missing from 2nd sync.".format(action.get('id'))

                # Verify the action comment text matches expectations
                if action.get('id') == action_id_to_update:
                    self.assertNotEqual(original_action_text, current_action_text,
                                        msg="Update was not captured.")
                    self.assertIn("UPDATE", current_action_text,
                                  msg="Update was captured but not as expected.")
                else:
                    self.assertEqual(original_action_text, current_action_text,
                                     msg="Text does not match expected.")
    print("Incremental streams tested.")

    # CLEANING UP
    stream_to_delete = 'boards'
    boards_remaining = 5
    print("Deleting all but {} records for stream {}.".format(boards_remaining, stream_to_delete))
    board_count = len(expected_records_1.get(stream_to_delete, [])) + \
                  len(expected_records_2.get(stream_to_delete, []))
    for obj_to_delete in expected_records_2.get(stream_to_delete, []):  # Delete all boards between syncs
        if board_count > boards_remaining:
            utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
            board_count -= 1
        else:
            break
    for obj_to_delete in expected_records_1.get(stream_to_delete, []):  # Delete all boards between syncs
        if board_count > boards_remaining:
            utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
            board_count -= 1
        else:
            break

    # Reset the parent objects that we have been tracking
    utils.reset_tracked_parent_objects()
def discovery_test(self):
    """
    Verify that discover creates the appropriate catalog, schema, metadata, etc.

    • Verify number of actual streams discovered match expected
    • Verify the stream names discovered were what we expect
    • Verify stream names follow naming convention
      streams should only have lowercase alphas and underscores
    • verify there is only 1 top level breadcrumb
    • verify replication key(s)
    • verify primary key(s)
    • verify that if there is a replication key we are doing INCREMENTAL otherwise FULL
    • verify the actual replication matches our expected replication method
    • verify that primary, replication and foreign keys are given the inclusion of automatic (metadata and annotated schema).
    • verify that all other fields have inclusion of available (metadata and schema)

    Fix: the stream-count assertion message had its format arguments in the
    wrong order (found_catalogs filled the "connection" placeholder and
    conn_id filled "actual"); they are now passed in the intended order.
    """
    conn_id = connections.ensure_connection(self)

    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify number of actual streams discovered match expected
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))
    self.assertEqual(
        len(found_catalogs),
        len(self.expected_streams()),
        msg="Expected {} streams, actual was {} for connection {}, "
        "actual {}".format(len(self.expected_streams()),
                           len(found_catalogs), conn_id, found_catalogs))

    # Verify the stream names discovered were what we expect
    found_catalog_names = {c['tap_stream_id'] for c in found_catalogs}
    self.assertEqual(set(self.expected_streams()),
                     set(found_catalog_names),
                     msg="Expected streams don't match actual streams")

    # Verify stream names follow naming convention
    # streams should only have lowercase alphas and underscores
    self.assertTrue(all(
        [re.fullmatch(r"[a-z_]+", name) for name in found_catalog_names]),
        msg="One or more streams don't follow standard naming")

    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            catalog = next(
                iter([
                    catalog for catalog in found_catalogs
                    if catalog["stream_name"] == stream
                ]))
            assert catalog  # based on previous tests this should always be found

            schema_and_metadata = menagerie.get_annotated_schema(
                conn_id, catalog['stream_id'])
            metadata = schema_and_metadata["metadata"]
            schema = schema_and_metadata["annotated-schema"]

            # verify the stream level properties are as expected
            # verify there is only 1 top level breadcrumb
            stream_properties = [
                item for item in metadata if item.get("breadcrumb") == []
            ]
            self.assertTrue(
                len(stream_properties) == 1,
                msg="There is more than one top level breadcrumb")

            # verify replication key(s)
            self.assertEqual(
                set(stream_properties[0].get("metadata", {
                    self.REPLICATION_KEYS: []
                }).get(self.REPLICATION_KEYS, [])),
                self.expected_replication_keys()[stream],
                msg="expected replication key {} but actual is {}".format(
                    self.expected_replication_keys()[stream],
                    set(stream_properties[0].get(
                        "metadata", {
                            self.REPLICATION_KEYS: None
                        }).get(self.REPLICATION_KEYS, []))))

            # verify primary key(s)
            self.assertEqual(
                set(stream_properties[0].get("metadata", {
                    self.PRIMARY_KEYS: []
                }).get(self.PRIMARY_KEYS, [])),
                self.expected_primary_keys()[stream],
                msg="expected primary key {} but actual is {}".format(
                    self.expected_primary_keys()[stream],
                    set(stream_properties[0].get("metadata", {
                        self.PRIMARY_KEYS: None
                    }).get(self.PRIMARY_KEYS, []))))

            # verify that if there is a replication key we are doing INCREMENTAL otherwise FULL
            actual_replication_method = stream_properties[0].get(
                "metadata", {
                    self.REPLICATION_METHOD: None
                }).get(self.REPLICATION_METHOD)
            if stream_properties[0].get("metadata", {
                    self.REPLICATION_KEYS: []
            }).get(self.REPLICATION_KEYS, []):
                self.assertTrue(
                    actual_replication_method == self.INCREMENTAL,
                    msg="Expected INCREMENTAL replication "
                    "since there is a replication key")
            else:
                self.assertTrue(actual_replication_method == self.FULL,
                                msg="Expected FULL replication "
                                "since there is no replication key")

            # verify the actual replication matches our expected replication method
            self.assertEqual(
                self.expected_replication_method().get(stream, None),
                actual_replication_method,
                msg=
                "The actual replication method {} doesn't match the expected {}"
                .format(
                    actual_replication_method,
                    self.expected_replication_method().get(stream, None)))

            expected_primary_keys = self.expected_primary_keys()[stream]
            expected_replication_keys = self.expected_replication_keys(
            )[stream]
            expected_automatic_fields = expected_primary_keys | expected_replication_keys

            # verify that primary, replication and foreign keys
            # are given the inclusion of automatic in annotated schema.
            actual_automatic_fields = {
                key
                for key, value in schema["properties"].items()
                if value.get("inclusion") == "automatic"
            }
            self.assertEqual(
                expected_automatic_fields,
                actual_automatic_fields,
                msg="expected {} automatic fields but got {}".format(
                    expected_automatic_fields, actual_automatic_fields))

            # verify that all other fields have inclusion of available
            # This assumes there are no unsupported fields for SaaS sources
            self.assertTrue(
                all({
                    value.get("inclusion") == "available"
                    for key, value in schema["properties"].items()
                    if key not in actual_automatic_fields
                }),
                msg=
                "Not all non key properties are set to available in annotated schema"
            )

            # verify that primary, replication and foreign keys
            # are given the inclusion of automatic in metadata.
            actual_automatic_fields = {
                item.get("breadcrumb", ["properties", None])[1]
                for item in metadata
                if item.get("metadata").get("inclusion") == "automatic"
            }
            self.assertEqual(
                expected_automatic_fields,
                actual_automatic_fields,
                msg="expected {} automatic fields but got {}".format(
                    expected_automatic_fields, actual_automatic_fields))

            # verify that all other fields have inclusion of available
            # This assumes there are no unsupported fields for SaaS sources
            self.assertTrue(
                all({
                    item.get("metadata").get("inclusion") == "available"
                    for item in metadata
                    if item.get("breadcrumb", []) != [] and item.get(
                        "breadcrumb", ["properties", None])[1]
                    not in actual_automatic_fields
                }),
                msg=
                "Not all non key properties are set to available in metadata"
            )
def test_run(self):
    """Run discovery, verify primary keys are marked automatic, sync with
    all catalogs selected via metadata, and verify each replicated record
    contains exactly the expected automatic fields."""
    conn_id = connections.ensure_connection(self)

    # discovery (check mode)
    check_job_name = runner.run_check_mode(self, conn_id)

    # the check job must exit cleanly
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs), 0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    # discovered streams must match expectations exactly
    discovered_stream_ids = {catalog['tap_stream_id'] for catalog in found_catalogs}
    diff = self.expected_check_streams().symmetric_difference(discovered_stream_ids)
    self.assertEqual(
        len(diff), 0,
        msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select every catalog, checking that each primary key carries
    # 'automatic' inclusion metadata
    for catalog in found_catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        for pk_field in self.expected_pks()[catalog['stream_name']]:
            mdata = next(
                (entry for entry in annotated['metadata']
                 if len(entry['breadcrumb']) == 2
                 and entry['breadcrumb'][1] == pk_field),
                None)
            print("Validating inclusion on {}: {}".format(
                catalog['stream_name'], mdata))
            self.assertTrue(
                mdata and mdata['metadata']['inclusion'] == 'automatic')
        connections.select_catalog_via_metadata(conn_id, catalog, annotated)

    # start the sync from a clean slate
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # the sync job must exit cleanly on both tap and target
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda total, count: total + count,
                                  record_count_by_stream.values())
    self.assertGreater(
        replicated_row_count, 0,
        msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()
    for stream_name, stream_data in synced_records.items():
        for record_keys in (set(row['data'].keys())
                            for row in stream_data['messages']):
            # each record must carry exactly the automatic fields
            self.assertEqual(
                record_keys,
                self.expected_automatic_fields().get(stream_name, set()))
def test_run(self):
    """LOG_BASED (logical replication) test for the cows and chickens tables.

    Sync 1: select both streams with LOG_BASED replication from empty
    state; verify one upsert per stream wrapped in activate_version
    messages, and that each stream bookmarks an lsn and table version.
    Sync 2: insert one row into each table, re-sync, and verify only the
    new rows are upserted, lsn bookmarks advance, and table versions are
    unchanged.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]

    self.assertGreaterEqual(
        len(found_catalogs), 2,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff), 0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog_cows = list(
        filter(
            lambda c: c['stream_name'] == 'postgres_logical_replication_test_cows',
            found_catalogs))[0]
    self.assertEqual('postgres_logical_replication_test_cows',
                     test_catalog_cows['stream_name'])
    test_catalog_chickens = list(
        filter(
            lambda c: c['stream_name'] == 'postgres_logical_replication_test_chickens',
            found_catalogs))[0]
    self.assertEqual('postgres_logical_replication_test_chickens',
                     test_catalog_chickens['stream_name'])
    print("discovered streams are correct")

    # select both streams with LOG_BASED replication
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'LOG_BASED'
        }
    }]
    connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog_cows,
        menagerie.get_annotated_schema(conn_id,
                                       test_catalog_cows['stream_id']),
        additional_md)
    connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog_chickens,
        menagerie.get_annotated_schema(conn_id,
                                       test_catalog_chickens['stream_id']),
        additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync job
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(
        record_count_by_stream, {
            'public_postgres_logical_replication_test_cows': 1,
            'public_postgres_logical_replication_test_chickens': 1
        })
    records_by_stream = runner.get_records_from_target_output()

    # each stream emits activate_version / upsert / activate_version
    table_version_cows = records_by_stream[
        'public_postgres_logical_replication_test_cows']['table_version']
    self.assertEqual(
        records_by_stream['public_postgres_logical_replication_test_cows']
        ['messages'][0]['action'], 'activate_version')
    self.assertEqual(
        records_by_stream['public_postgres_logical_replication_test_cows']
        ['messages'][1]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['public_postgres_logical_replication_test_cows']
        ['messages'][2]['action'], 'activate_version')

    table_version_chickens = records_by_stream[
        'public_postgres_logical_replication_test_chickens'][
            'table_version']
    self.assertEqual(
        records_by_stream[
            'public_postgres_logical_replication_test_chickens']
        ['messages'][0]['action'], 'activate_version')
    self.assertEqual(
        records_by_stream[
            'public_postgres_logical_replication_test_chickens']
        ['messages'][1]['action'], 'upsert')
    self.assertEqual(
        records_by_stream[
            'public_postgres_logical_replication_test_chickens']
        ['messages'][2]['action'], 'activate_version')

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)

    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    # NOTE: the 'dev-' / 'postgres-' bookmark prefixes correspond to the
    # two databases the inserts below use via db_utils.get_test_connection
    bookmark_cows = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_cows']
    self.assertIsNotNone(bookmark_cows['lsn'],
                         msg="expected bookmark for stream to have an lsn")
    lsn_cows_1 = bookmark_cows['lsn']
    self.assertEqual(bookmark_cows['version'], table_version_cows,
                     msg="expected bookmark for stream to match version")

    bookmark_chickens = state['bookmarks'][
        'postgres-public-postgres_logical_replication_test_chickens']
    self.assertIsNotNone(bookmark_chickens['lsn'],
                         msg="expected bookmark for stream to have an lsn")
    lsn_chickens_1 = bookmark_chickens['lsn']
    self.assertEqual(bookmark_chickens['version'], table_version_chickens,
                     msg="expected bookmark for stream to match version")

    #----------------------------------------------------------------------
    # invoke the sync job again after adding records
    #----------------------------------------------------------------------
    print("inserting 1 more cows and 1 more chickens")

    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            # insert another cow
            self.cows_rec_2 = {'cow_name': "betty cow", 'cow_age': 21}
            insert_record(cur, test_table_name_cows, self.cows_rec_2)

    with db_utils.get_test_connection('postgres') as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            # insert another chicken
            self.chicken_rec_2 = {
                'chicken_name': "burt chicken",
                'chicken_age': 14
            }
            insert_record(cur, test_table_name_chickens,
                          self.chicken_rec_2)

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(
        record_count_by_stream, {
            'public_postgres_logical_replication_test_cows': 1,
            'public_postgres_logical_replication_test_chickens': 1
        })

    # every upsert must carry an lsn; strip it before comparing data
    upserts = []
    for u in runner.get_upserts_from_target_output():
        self.assertIsNotNone(u.get('_sdc_lsn'))
        del u['_sdc_lsn']
        upserts.append(u)

    self.assertEqual([{
        '_sdc_deleted_at': None,
        'cow_age': 21,
        'id': 2,
        'cow_name': 'betty cow'
    }, {
        'chicken_name': 'burt chicken',
        '_sdc_deleted_at': None,
        'chicken_age': 14,
        'id': 2
    }], upserts)

    print("inserted record is correct")

    state = menagerie.get_state(conn_id)
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")

    cows_bookmark = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_cows']
    self.assertIsNotNone(
        cows_bookmark['lsn'],
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
    )
    lsn_cows_2 = cows_bookmark['lsn']
    # lsn bookmark must not move backwards
    self.assertTrue(lsn_cows_2 >= lsn_cows_1)

    chickens_bookmark = state['bookmarks'][
        'postgres-public-postgres_logical_replication_test_chickens']
    self.assertIsNotNone(
        chickens_bookmark['lsn'],
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
    )
    lsn_chickens_2 = chickens_bookmark['lsn']
    # lsn bookmark must not move backwards
    self.assertTrue(lsn_chickens_2 >= lsn_chickens_1)

    #table_version does NOT change
    self.assertEqual(
        chickens_bookmark['version'],
        table_version_chickens,
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to match version"
    )
    #table_version does NOT change
    self.assertEqual(
        cows_bookmark['version'],
        table_version_cows,
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to match version"
    )
def test_run(self):
    """Run two back-to-back syncs and verify bookmark-based incremental behavior.

    Flow: check mode -> select all expected streams/fields -> first sync ->
    second sync. Asserts that the second (bookmarked) sync never returns more
    records than the first, and that bookmarks only move forward.
    """
    conn_id = connections.ensure_connection(self, payload_hook=None)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    catalog_entries = menagerie.get_catalogs(conn_id)

    # Select all streams and all fields (only for streams we expect to sync)
    for entry in catalog_entries:
        if entry.get('tap_stream_id') in self.expected_sync_streams():
            schema = menagerie.select_catalog(conn_id, entry)
            catalog_entry = {
                'key_properties': entry.get('key_properties'),
                'schema': schema,
                'tap_stream_id': entry.get('tap_stream_id'),
                'replication_method': entry.get('replication_method'),
                'replication_key': entry.get('replication_key')
            }
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog_entry, schema)

    # found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(catalog_entries),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    set_of_discovered_streams = {
        entry['tap_stream_id']
        for entry in catalog_entries
    }
    # expected_check_streams only needs to be a subset: discovery may surface
    # additional streams that we simply do not select
    self.assertTrue(
        self.expected_check_streams().issubset(set_of_discovered_streams),
        msg="Expected check streams are not a subset of discovered streams"
    )

    # start from a clean slate so the first sync is a full one
    menagerie.set_state(conn_id, {})

    # Verify that tap and target exit codes are 0
    first_record_count = self.run_sync_and_get_record_count(conn_id)

    # verify that we only sync selected streams
    self.assertEqual(set(first_record_count.keys()),
                     self.expected_sync_streams())

    first_state = menagerie.get_state(conn_id)

    first_sync_records = runner.get_records_from_target_output()
    # NOTE(review): first_max_bookmarks/first_min_bookmarks (and the second-sync
    # equivalents below) are computed but never asserted on in this method —
    # possibly leftovers from a fuller bookmark comparison. Confirm before removing.
    first_max_bookmarks = self.max_bookmarks_by_stream(first_sync_records)
    first_min_bookmarks = self.min_bookmarks_by_stream(first_sync_records)

    # Run second sync
    second_record_count = self.run_sync_and_get_record_count(conn_id)
    second_state = menagerie.get_state(conn_id)

    second_sync_records = runner.get_records_from_target_output()
    second_max_bookmarks = self.max_bookmarks_by_stream(second_sync_records)
    second_min_bookmarks = self.min_bookmarks_by_stream(second_sync_records)

    for stream in self.expected_sync_streams():
        # Verify first sync returns more data or same amount of data
        self.assertGreaterEqual(
            first_record_count.get(stream, 0),
            second_record_count.get(stream, 0),
            msg="Second sync didn't always return less records for stream {}"
            .format(stream))
        # bookmarks must be monotonically non-decreasing between syncs
        self.assertGreaterEqual(second_state['bookmarks'][stream],
                                first_state['bookmarks'][stream])
def test_run(self):
    """Verify discovery metadata for a postgres view and that a LOG_BASED sync
    of a view fails cleanly.

    Asserts the discovered metadata for ``chicken_view`` (is-view, datatypes,
    no table-key-properties), selects it with replication-method LOG_BASED and
    view-key-properties ["id"], then expects the sync to exit with tap status 1,
    emit no records, and leave the state empty.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertEqual(
        len(found_catalogs), 1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff), 0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    chicken_catalog = found_catalogs[0]
    self.assertEqual('chicken_view', chicken_catalog['stream_name'])
    print("discovered streams are correct")

    print('checking discoverd metadata for ROOT-CHICKEN_VIEW')
    md = menagerie.get_annotated_schema(
        conn_id, chicken_catalog['stream_id'])['metadata']

    # exact discovered metadata map: a view has no table-key-properties and
    # every column is 'available' (nothing automatically selected)
    self.assertEqual(
        {(): {'database-name': 'postgres',
              'is-view': True,
              'row-count': 0,
              'schema-name': 'public',
              'table-key-properties': []},
         ('properties', 'fk_id'): {'inclusion': 'available',
                                   'sql-datatype': 'bigint',
                                   'selected-by-default': True},
         ('properties', 'name'): {'inclusion': 'available',
                                  'sql-datatype': 'character varying',
                                  'selected-by-default': True},
         ('properties', 'age'): {'inclusion': 'available',
                                 'sql-datatype': 'integer',
                                 'selected-by-default': True},
         ('properties', 'size'): {'inclusion': 'available',
                                  'sql-datatype': 'character varying',
                                  'selected-by-default': True},
         ('properties', 'id'): {'inclusion': 'available',
                                'sql-datatype': 'integer',
                                'selected-by-default': True}},
        metadata.to_map(md))

    # 'ID' selected as view-key-properties
    replication_md = [{"breadcrumb": [],
                       "metadata": {'replication-key': None,
                                    "replication-method": "LOG_BASED",
                                    'view-key-properties': ["id"]}}]

    connections.select_catalog_and_fields_via_metadata(
        conn_id, chicken_catalog,
        menagerie.get_annotated_schema(conn_id,
                                       chicken_catalog['stream_id']),
        replication_md)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes: the tap is expected to FAIL (status 1)
    # because LOG_BASED replication is not supported for views
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    self.assertEqual(exit_status['tap_exit_status'], 1)
    # menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # the failed sync must not have produced any records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {})
    print("records are correct")

    # verify state and bookmarks: nothing should have been bookmarked
    state = menagerie.get_state(conn_id)
    self.assertEqual(state, {}, msg="expected state to be empty")
def test_run(self):
    """Verify that an interrupted LOG_BASED state with an aged-out oplog
    position triggers a full-table resync.

    A synthetic interrupted state (old oplog_ts_time/inc, original table
    version) is installed before the sync; the test then asserts a new table
    version was assigned, all rows were re-synced, and fresh oplog bookmarks
    were written.
    """
    conn_id = connections.ensure_connection(self)

    #  -------------------------------
    #  -----------  Discovery ----------
    #  -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    for tap_stream_id in self.expected_check_streams():
        found_stream = [
            c for c in found_catalogs
            if c['tap_stream_id'] == tap_stream_id
        ][0]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[found_stream['stream_name']],
            set(
                found_stream.get('metadata',
                                 {}).get('table-key-properties')))

        # assert that the row counts are correct
        self.assertEqual(
            self.expected_row_counts()[found_stream['stream_name']],
            found_stream.get('metadata', {}).get('row-count'))

    #  -----------------------------------
    #  ----------- Full Table Sync ---------
    #  -----------------------------------
    # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # Synthesize interrupted state: full table already complete, with an
    # oplog position (ts_time=1) old enough to have fallen off the oplog
    original_version = int(time.time() * 1000)
    interrupted_state = {
        'currently_syncing': 'simple_db-simple_coll_1',
        'bookmarks': {
            'simple_db-simple_coll_1': {
                'version': original_version,
                'initial_full_table_complete': True,
                'oplog_ts_time': 1,
                'oplog_ts_inc': 0
            }
        }
    }

    menagerie.set_state(conn_id, interrupted_state)

    # This should say the oplog has timed out and will execute a resync
    runner.run_sync_mode(self, conn_id)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # a resync emits activate_version messages; the code checks positions
    # 0 and 51 of each stream's message list
    # NOTE(review): position 51 presumably corresponds to the expected row
    # count (50 rows + leading activate_version) — confirm against fixtures
    for stream_name in self.expected_sync_streams():
        self.assertEqual(
            'activate_version',
            records_by_stream[stream_name]['messages'][0]['action'])
        self.assertEqual(
            'activate_version',
            records_by_stream[stream_name]['messages'][51]['action'])

    # the resync must have assigned a new table version
    final_state = menagerie.get_state(conn_id)
    self.assertNotEqual(
        original_version,
        final_state.get('bookmarks',
                        {}).get('simple_db-simple_coll_1',
                                {}).get('version'))

    # assert that all rows in the collection were sync'd
    for stream_id, row_count in self.expected_row_counts().items():
        self.assertGreaterEqual(record_count_by_stream[stream_id],
                                row_count)

    # fresh oplog bookmarks must exist and the full table must be marked complete
    self.assertIsNotNone(
        final_state.get('bookmarks',
                        {}).get('simple_db-simple_coll_1',
                                {}).get('oplog_ts_time'))
    self.assertIsNotNone(
        final_state.get('bookmarks',
                        {}).get('simple_db-simple_coll_1',
                                {}).get('oplog_ts_inc'))
    self.assertTrue(
        final_state.get('bookmarks',
                        {}).get('simple_db-simple_coll_1',
                                {}).get('initial_full_table_complete'))
def run_single_projection(self, projection_mapping):
    """Exercise one tap_mongodb.projection mapping through both sync phases.

    Args:
        projection_mapping: dict with 'projection' (the mongo projection to
            apply, or None for no projection) and 'expected_keys' (a
            collection of acceptable key-sets for emitted records).

    Runs discovery, an initial full-table sync, modifies the database, then a
    subsequent oplog sync; after each sync asserts every upserted record's
    key set is one of the expected key-sets.
    """
    self.setUpDatabase()
    conn_id = connections.ensure_connection(self)

    #  -------------------------------
    #  -----------  Discovery ----------
    #  -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    for tap_stream_id in self.expected_check_streams():
        found_stream = [
            c for c in found_catalogs
            if c['tap_stream_id'] == tap_stream_id
        ][0]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[found_stream['stream_name']],
            set(
                found_stream.get('metadata',
                                 {}).get('table-key-properties')))

        # assert that the row counts are correct
        self.assertEqual(
            self.expected_row_counts()[found_stream['stream_name']],
            found_stream.get('metadata', {}).get('row-count'))

    #  -----------------------------------
    #  ----------- Initial Full Table ---------
    #  -----------------------------------
    # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        # only attach a projection when the mapping provides one
        if projection_mapping['projection'] is not None:
            additional_md[0]['metadata'][
                'tap_mongodb.projection'] = json.dumps(
                    projection_mapping['projection'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    for stream_name in self.expected_sync_streams():
        stream_records = [
            x for x in messages_by_stream[stream_name]['messages']
            if x.get('action') == 'upsert'
        ]
        #actual_keys = set()
        # each record's key view must equal one of the expected key-sets
        # (dict.keys() compares equal to a set of the same keys)
        for record in stream_records:
            self.assertIn(record['data'].keys(),
                          projection_mapping['expected_keys'])
        #actual_keys = actual_keys.union(set(record['data'].keys()))
        #self.assertTrue(actual_keys.issubset(projection_mapping['expected_keys']))

    self.modify_database()

    #  -----------------------------------
    #  ----------- Subsequent Oplog Sync ---------
    #  -----------------------------------

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    for stream_name in self.expected_sync_streams():
        stream_records = [
            x for x in messages_by_stream[stream_name]['messages']
            if x.get('action') == 'upsert'
        ]
        #actual_keys = set()
        for record in stream_records:
            self.assertIn(record['data'].keys(),
                          projection_mapping['expected_keys'])
def test_run(self):
    """stream_expected_data[self.VALUES]
    Verify that a full sync can send capture all data and send it in the correct format
    for integer and boolean (bit) data.
    Verify that the fist sync sends an activate immediately.
    Verify that the table version is incremented up

    Flow: discovery -> select streams with INCREMENTAL replication (keyed on
    temp_replication_key_column) -> first sync with full record/bookmark
    verification -> insert/update/delete rows across three tables -> second
    sync verifying only the changed rows, the bookmark advance, and that the
    table version is unchanged.
    """
    print("running test {}".format(self.name()))
    conn_id = self.create_connection()

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # get the catalog information of discovery
    found_catalogs = menagerie.get_catalogs(conn_id)

    # TODO - change the replication key back to replication_key_column when rowversion is supported
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'INCREMENTAL',
            'replication-key': 'temp_replication_key_column'
        }
    }]
    # columns deliberately deselected for this test run
    non_selected_properties = [
        "nvarchar_text", "varchar_text", "varbinary_data", "geospacial",
        "geospacial_map", "markup", "tree", "variant",
        "SpecialPurposeColumns", "started_at", "ended_at"
    ]
    BaseTapTest.select_all_streams_and_fields(
        conn_id,
        found_catalogs,
        non_selected_properties=non_selected_properties,
        additional_md=additional_md)

    # clear state
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify record counts of streams
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values'])
        for k, v in self.expected_metadata().items()
    }
    self.assertEqual(record_count_by_stream, expected_count)

    # verify records match on the first sync
    records_by_stream = runner.get_records_from_target_output()

    table_version = dict()
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            table_version[stream] = records_by_stream[stream][
                'table_version']

            # verify on the first sync you get
            # activate version message before and after all data for the full table
            # and before the logical replication part
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version')
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version')
            self.assertTrue(
                all([
                    m["action"] == "upsert"
                    for m in records_by_stream[stream]['messages'][1:-1]
                ]),
                msg="Expect all but the first message to be upserts")
            self.assertEqual(len(
                records_by_stream[stream]['messages'][1:-1]),
                             len(stream_expected_data[self.VALUES]),
                             msg="incorrect number of upserts")

            # expected columns are the first key of each FIELDS entry
            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            # expected upserts, sorted by the replication key (row[1]),
            # with non-selected columns filtered out
            expected_messages = [
                {
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, row_values))
                        if column not in non_selected_properties
                    }
                    # TODO - change to -1 for using rowversion for replication key
                } for row_values in sorted(
                    stream_expected_data[self.VALUES],
                    key=lambda row: (row[1] is not None, row[1]))
            ]

            # Verify all data is correct for incremental
            for expected_row, actual_row in zip(
                    expected_messages,
                    records_by_stream[stream]['messages'][1:-1]):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")
                    self.assertEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row[
                            "data"].items():
                        if isinstance(expected_value, datetime):
                            # sql server only keeps milliseconds not microseconds
                            self.assertEqual(
                                expected_value.isoformat().replace(
                                    '000+00:00', 'Z').replace('+00:00',
                                                              'Z'),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        else:
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))
            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            self.assertIsNone(bookmark.get('current_log_version'),
                              msg="no log_version for incremental")
            self.assertIsNone(bookmark.get('initial_full_table_complete'),
                              msg="no full table for incremental")

            # find the max value of the replication key TODO - change to -1 for using rowversion for replication key
            self.assertEqual(
                bookmark['replication_key_value'],
                re.sub(
                    r'\d{3}Z', "Z",
                    max([
                        row[1]
                        for row in stream_expected_data[self.VALUES]
                    ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ")))
            # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

            self.assertEqual(
                bookmark['version'],
                table_version[stream],
                msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas,
                                 records_by_stream[stream]['schema']))

    # ----------------------------------------------------------------------
    # invoke the sync job AGAIN and after insert, update, delete or rows
    # ----------------------------------------------------------------------
    # --- table 1: text_and_image_deprecated_soon ---
    database_name = "data_types_database"
    schema_name = "dbo"
    table_name = "text_and_image_deprecated_soon"
    column_name = [
        "pk", "temp_replication_key_column", "nvarchar_text",
        "varchar_text", "varbinary_data", "replication_key_column"
    ]
    insert_value = [(3,
                     datetime(2018, 12, 31, 23, 59, 59, 993000,
                              tzinfo=timezone.utc), "JKL", "MNO",
                     "PQR".encode('utf-8'))]
    update_value = [(0,
                     datetime(2018, 12, 31, 23, 59, 59, 997000,
                              tzinfo=timezone.utc), "JKL", "MNO",
                     "PQR".encode('utf-8'))]
    # replication_key_column is excluded from the write: the database computes it
    query_list = (insert(database_name, schema_name, table_name,
                         insert_value, column_names=column_name[:-1]))
    query_list.extend(
        update_by_pk(database_name, schema_name, table_name, update_value,
                     column_name))
    mssql_cursor_context_manager(*query_list)
    values = insert_value + [(
        1,
        datetime(2018, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc),
        "abc", "def", "ghi".encode('utf-8'))] + update_value
    # read back the db-generated replication key values to build expectations
    rows = mssql_cursor_context_manager(*[
        "select replication_key_column from data_types_database.dbo.text_and_image_deprecated_soon "
        "where pk in (0, 1,3) order by pk desc"
    ])
    rows = [tuple(row) for row in rows]
    rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
    row_with_version = [x[0] + x[1] for x in zip(values, rows)]
    self.EXPECTED_METADATA[
        'data_types_database_dbo_text_and_image_deprecated_soon'][
            'values'] = row_with_version

    # --- table 2: weirdos (includes a delete) ---
    database_name = "data_types_database"
    schema_name = "dbo"
    table_name = "weirdos"
    column_name = [
        "pk", "temp_replication_key_column", "geospacial",
        "geospacial_map", "markup", "guid", "tree", "variant",
        "SpecialPurposeColumns", "replication_key_column"
    ]
    insert_value = [(3,
                     datetime(9999, 12, 31, 23, 59, 59, 993000,
                              tzinfo=timezone.utc), None, None, None,
                     str(uuid.uuid1()).upper(), None, None, None)]
    update_value = [(1,
                     datetime(9999, 12, 31, 23, 59, 59, 997000,
                              tzinfo=timezone.utc), None, None, None,
                     str(uuid.uuid1()).upper(), None, None, None)]
    delete_value = [(0, )]
    query_list = (insert(database_name, schema_name, table_name,
                         insert_value, column_name[:-1]))
    query_list.extend(
        delete_by_pk(database_name, schema_name, table_name, delete_value,
                     column_name[:1]))
    query_list.extend(
        update_by_pk(database_name, schema_name, table_name, update_value,
                     column_name))
    mssql_cursor_context_manager(*query_list)
    values = insert_value + [
        (2,
         datetime(9999, 12, 31, 23, 59, 59, 990000, tzinfo=timezone.utc),
         None, None, None, "B792681C-AEF4-11E9-8002-0800276BC1DF", None,
         None, None)
    ] + update_value
    rows = mssql_cursor_context_manager(*[
        "select replication_key_column from data_types_database.dbo.weirdos "
        "where pk in (1, 2, 3) order by pk desc"
    ])
    rows = [tuple(row) for row in rows]
    rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
    row_with_version = [x[0] + x[1] for x in zip(values, rows)]
    self.EXPECTED_METADATA['data_types_database_dbo_weirdos'][
        'values'] = row_with_version

    # --- table 3: computed_columns ---
    database_name = "data_types_database"
    schema_name = "dbo"
    table_name = "computed_columns"
    column_name = [
        "pk", "temp_replication_key_column", "started_at", "ended_at",
        "replication_key_column"
    ]
    insert_value = [(2,
                     datetime(9998, 12, 31, 23, 59, 59, 990000,
                              tzinfo=timezone.utc),
                     datetime(1980, 5, 30, 16), datetime.now())]
    update_value = [(0,
                     datetime(9998, 12, 31, 23, 59, 59, 997000,
                              tzinfo=timezone.utc),
                     datetime(1942, 11, 30), datetime(2017, 2, 12))]
    query_list = (insert(database_name, schema_name, table_name,
                         insert_value, column_name[:-1]))
    query_list.extend(
        update_by_pk(database_name, schema_name, table_name, update_value,
                     column_name))
    mssql_cursor_context_manager(*query_list)
    values = insert_value + [(
        1,
        datetime(9998, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc),
        datetime(1970, 1, 1, 0), datetime.now())] + update_value
    rows = mssql_cursor_context_manager(*[
        "select replication_key_column from data_types_database.dbo.computed_columns "
        "where pk in (0, 1, 2) order by pk desc"
    ])
    rows = [tuple(row) for row in rows]
    row_with_duration = [x[0] + x[1] for x in zip(values, rows)]
    self.EXPECTED_METADATA['data_types_database_dbo_computed_columns'][
        'values'] = row_with_duration

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values'])
        for k, v in self.expected_metadata().items()
    }
    self.assertEqual(record_count_by_stream, expected_count)
    records_by_stream = runner.get_records_from_target_output()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            new_table_version = records_by_stream[stream]['table_version']

            # verify on a subsequent sync you get activate version message only after all data
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version')
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version')
            self.assertTrue(
                all([
                    message["action"] == "upsert"
                    for message in records_by_stream[stream]['messages']
                    [1:-1]
                ]))
            self.assertEqual(len(
                records_by_stream[stream]['messages'][1:-1]),
                             len(stream_expected_data[self.VALUES]),
                             msg="incorrect number of upserts")

            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            expected_messages = [{
                "action": "upsert",
                "data": {
                    column: value
                    for column, value in list(zip(column_names,
                                                  row_values))
                    if column not in non_selected_properties
                }
            } for row_values in sorted(stream_expected_data[self.VALUES],
                                       key=lambda row:
                                       (row[1] is not None, row[1]))]

            # remove sequences from actual values for comparison
            [
                message.pop("sequence")
                for message in records_by_stream[stream]['messages'][1:-1]
            ]

            # Verify all data is correct
            for expected_row, actual_row in list(
                    zip(expected_messages,
                        records_by_stream[stream]['messages'][1:-1])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")

                    # we only send the _sdc_deleted_at column for deleted rows
                    self.assertEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row[
                            "data"].items():
                        if isinstance(expected_value, datetime):
                            # sql server only keeps milliseconds not microseconds
                            self.assertEqual(
                                expected_value.isoformat().replace(
                                    '000+00:00', 'Z').replace('+00:00',
                                                              'Z'),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        else:
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))
            print(
                "records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            self.assertIsNone(bookmark.get('current_log_version'),
                              msg="no log_version for incremental")
            self.assertIsNone(bookmark.get('initial_full_table_complete'),
                              msg="no full table for incremental")

            # find the max value of the replication key
            self.assertEqual(
                bookmark['replication_key_value'],
                re.sub(
                    r'\d{3}Z', "Z",
                    max([
                        row[1]
                        for row in stream_expected_data[self.VALUES]
                    ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ")))
            # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

            # incremental replication must NOT bump the table version
            self.assertEqual(
                bookmark['version'],
                table_version[stream],
                msg="expected bookmark for stream to match version")
            self.assertEqual(
                bookmark['version'],
                new_table_version,
                msg="expected bookmark for stream to match version")

            # NOTE(review): re-fetching state/bookmark here is redundant —
            # nothing ran in between; confirm before cleaning up
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas,
                                 records_by_stream[stream]['schema']))
def test_run(self):
    """End-to-end test of LOG_BASED (CDC) replication for integer/bool tables.

    Sync 1 (initial full table + log start):
      - run discovery, select every stream with 'replication-method': 'LOG_BASED'
      - verify activate_version message placement around the full-table load
        and at the start of the log-based portion
      - verify upserted data matches self.expected_metadata()
      - verify the bookmark has current_log_version, initial_full_table_complete,
        and a version matching the emitted table_version

    Then insert/update/delete rows in two tables and run sync 2, verifying:
      - only the changed rows are emitted as upserts
      - deleted rows carry an _sdc_deleted_at timestamp (within a 15s window)
      - the bookmark's current_log_version increased
    """
    print("running test {}".format(self.name()))

    conn_id = self.create_connection()

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # get the catalog information of discovery
    found_catalogs = menagerie.get_catalogs(conn_id)

    # force every selected stream onto log-based replication via metadata
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'LOG_BASED'
        }
    }]
    BaseTapTest.select_all_streams_and_fields(conn_id,
                                              found_catalogs,
                                              additional_md=additional_md)

    # clear state so the sync starts from scratch (full table first)
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify record counts of streams
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values'])
        for k, v in self.expected_metadata().items()
    }
    # NOTE(review): count assertion intentionally disabled — confirm why
    # self.assertEqual(record_count_by_stream, expected_count)

    # verify records match on the first sync
    records_by_stream = runner.get_records_from_target_output()

    table_version = dict()
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            table_version[stream] = records_by_stream[stream][
                'table_version']

            # verify on the first sync you get
            # activate version message before and after all data for the full table
            # and before the logical replication part
            # last_row_data is True when a log-based data record trails the
            # final activate_version message
            if records_by_stream[stream]['messages'][-1].get("data"):
                last_row_data = True
            else:
                last_row_data = False

            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version')
            self.assertEqual(
                records_by_stream[stream]['messages'][-2]['action'],
                'activate_version')
            # the second trailing activate_version shifts by one position
            # when a log-based record was emitted after it
            if last_row_data:
                self.assertEqual(
                    records_by_stream[stream]['messages'][-3]['action'],
                    'activate_version')
            else:
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
            self.assertEqual(
                len([
                    m for m in records_by_stream[stream]['messages'][1:]
                    if m["action"] == "activate_version"
                ]),
                2,
                msg=
                "Expect 2 more activate version messages for end of full table and beginning of log based"
            )

            # each FIELDS entry is a single-key dict: {column_name: metadata}
            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            # build expected upsert messages by zipping column names with
            # each expected row of values
            expected_messages = [{
                "action": "upsert",
                "data": {
                    column: value
                    for column, value in list(
                        zip(column_names, stream_expected_data[self.VALUES]
                            [row]))
                }
            } for row in range(len(stream_expected_data[self.VALUES]))]

            # Verify all data is correct for the full table part
            # (slice end excludes the trailing activate_version message(s),
            # and the trailing data record when last_row_data)
            if last_row_data:
                final_row = -3
            else:
                final_row = -2

            for expected_row, actual_row in list(
                    zip(expected_messages, records_by_stream[stream]
                        ['messages'][1:final_row])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")
                    self.assertEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row[
                            "data"].items():
                        self.assertEqual(
                            expected_value,
                            actual_row["data"][column_name],
                            msg="expected: {} != actual {}".format(
                                expected_row, actual_row))

            # Verify all data is correct for the log replication part if sent
            # NOTE(review): the msg below formats expected_row/actual_row left
            # over from the loop above, not the values being compared here —
            # a misleading failure message; confirm and fix separately
            if records_by_stream[stream]['messages'][-1].get("data"):
                for column_name, expected_value in expected_messages[-1][
                        "data"].items():
                    self.assertEqual(
                        expected_value,
                        records_by_stream[stream]['messages'][-1]["data"]
                        [column_name],
                        msg="expected: {} != actual {}".format(
                            expected_row, actual_row))

            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            self.assertIsNotNone(
                bookmark.get('current_log_version'),
                msg=
                "expected bookmark to have current_log_version because we are using log replication"
            )
            self.assertTrue(bookmark['initial_full_table_complete'],
                            msg="expected full table to be complete")
            # saved for the post-change sync below, which asserts the log
            # version increased (NOTE(review): "inital" is misspelled but the
            # name is referenced later, so it is left as-is here)
            inital_log_version = bookmark['current_log_version']

            self.assertEqual(
                bookmark['version'],
                table_version[stream],
                msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas,
                                 records_by_stream[stream]['schema']))

    # ----------------------------------------------------------------------
    # invoke the sync job AGAIN and after insert, update, delete or rows
    # ----------------------------------------------------------------------

    # --- table 1: integers — insert pk 14, delete pk 5, update pk 1 ---
    database_name = "data_types_database"
    schema_name = "dbo"
    table_name = "integers"
    column_name = [
        "pk", "MyBigIntColumn", "MyIntColumn", "MySmallIntColumn"
    ]
    insert_value = [(14, 100, 100, 100)]
    update_value = [(1, 101, 101, 101)]
    delete_value = [(5, )]
    query_list = (insert(database_name, schema_name, table_name,
                         insert_value))
    query_list.extend(
        delete_by_pk(database_name, schema_name, table_name, delete_value,
                     column_name[:1]))
    query_list.extend(
        update_by_pk(database_name, schema_name, table_name, update_value,
                     column_name))
    mssql_cursor_context_manager(*query_list)

    # rebuild the expected rows with a trailing _sdc_deleted_at column:
    # None for inserted/updated rows, a timestamp for the deleted row
    # (naive UTC via datetime.utcnow(), matched within ±15s below)
    insert_value = [(14, 100, 100, 100, None)]
    update_value = [(1, 101, 101, 101, None)]
    delete_value = [(5, None, None, None, datetime.utcnow())]
    self.EXPECTED_METADATA["data_types_database_dbo_integers"]["values"] = \
        insert_value + delete_value + update_value
    self.EXPECTED_METADATA["data_types_database_dbo_integers"][
        "fields"].append({
            "_sdc_deleted_at": {
                'sql-datatype': 'datetime',
                'selected-by-default': True,
                'inclusion': 'automatic'
            }
        })

    # --- table 2: tiny_integers_and_bools — same insert/delete/update ---
    database_name = "data_types_database"
    schema_name = "dbo"
    table_name = "tiny_integers_and_bools"
    column_name = ["pk", "MyTinyIntColumn", "my_boolean"]
    insert_value = [(14, 100, False)]
    update_value = [(1, 101, True)]
    delete_value = [(5, )]
    query_list = (insert(database_name, schema_name, table_name,
                         insert_value))
    query_list.extend(
        delete_by_pk(database_name, schema_name, table_name, delete_value,
                     column_name[:1]))
    query_list.extend(
        update_by_pk(database_name, schema_name, table_name, update_value,
                     column_name))
    insert_value = [(14, 100, False, None)]
    update_value = [(1, 101, True, None)]
    delete_value = [(5, None, None, datetime.utcnow())]
    # this stream's expectation also carries over the previous last row —
    # presumably re-emitted from the log on the next sync; TODO confirm
    self.EXPECTED_METADATA["data_types_database_dbo_tiny_integers_and_bools"]["values"] = \
        [self.expected_metadata()["data_types_database_dbo_tiny_integers_and_bools"]["values"][-1]] + \
        insert_value + delete_value + update_value
    self.EXPECTED_METADATA[
        "data_types_database_dbo_tiny_integers_and_bools"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })
    mssql_cursor_context_manager(*query_list)

    # run sync 2 — should pick up only the logged changes
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(),
        self.expected_primary_keys_by_stream_id())
    expected_count = {
        k: len(v['values'])
        for k, v in self.expected_metadata().items()
    }
    self.assertEqual(record_count_by_stream, expected_count)
    records_by_stream = runner.get_records_from_target_output()

    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            new_table_version = records_by_stream[stream]['table_version']

            # verify on a subsequent sync you get activate version message only after all data
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version')
            self.assertTrue(
                all([
                    message["action"] == "upsert"
                    for message in records_by_stream[stream]['messages'][1:]
                ]))

            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]

            expected_messages = [{
                "action": "upsert",
                "data": {
                    column: value
                    for column, value in list(
                        zip(column_names, stream_expected_data[self.VALUES]
                            [row]))
                }
            } for row in range(len(stream_expected_data[self.VALUES]))]

            # remove sequences from actual values for comparison
            # (in-place mutation; the list built here is discarded)
            [
                message.pop("sequence")
                for message in records_by_stream[stream]['messages'][1:]
            ]

            # Verify all data is correct
            for expected_row, actual_row in list(
                    zip(expected_messages,
                        records_by_stream[stream]['messages'][1:])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")

                    # we only send the _sdc_deleted_at column for deleted rows
                    self.assertGreaterEqual(
                        len(expected_row["data"].keys()),
                        len(actual_row["data"].keys()),
                        msg="there are not the same number of columns")

                    for column_name, expected_value in expected_row[
                            "data"].items():
                        if column_name != "_sdc_deleted_at":
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))
                        elif expected_value:
                            # we have an expected value for a deleted row;
                            # the actual value may or may not carry
                            # fractional seconds, so try both formats
                            try:
                                actual_value = datetime.strptime(
                                    actual_row["data"][column_name],
                                    "%Y-%m-%dT%H:%M:%S.%fZ")
                            except ValueError:
                                actual_value = datetime.strptime(
                                    actual_row["data"][column_name],
                                    "%Y-%m-%dT%H:%M:%SZ")
                            # allow a 15-second window around the time the
                            # test captured datetime.utcnow()
                            self.assertGreaterEqual(
                                actual_value,
                                expected_value - timedelta(seconds=15))
                            self.assertLessEqual(
                                actual_value,
                                expected_value + timedelta(seconds=15))
                        else:
                            # the row wasn't deleted so we can either not pass the column or it can be None
                            self.assertIsNone(
                                actual_row["data"].get(column_name))

            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            self.assertIsNotNone(
                bookmark.get('current_log_version'),
                msg=
                "expected bookmark to have current_log_version because we are using log replication"
            )
            self.assertTrue(bookmark['initial_full_table_complete'],
                            msg="expected full table to be complete")
            # the log version must have advanced past the value captured
            # after the first sync
            new_log_version = bookmark['current_log_version']
            self.assertGreater(new_log_version,
                               inital_log_version,
                               msg='expected log version to increase')

            self.assertEqual(
                bookmark['version'],
                table_version[stream],
                msg="expected bookmark for stream to match version")
            self.assertEqual(
                bookmark['version'],
                new_table_version,
                msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas,
                                 records_by_stream[stream]['schema']))