def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [catalog for catalog in menagerie.get_catalogs(conn_id)
                      if catalog['tap_stream_id'] in self.tap_stream_ids()]
    self.assertEqual(len(found_catalogs), 2,
                     msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.tap_stream_ids().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    found_table_names = set(map(lambda c: c['stream_name'], found_catalogs))
    tables_diff = self.table_names().symmetric_difference(found_table_names)
    self.assertEqual(len(tables_diff), 0,
                     msg="discovered table names do not match: {}".format(tables_diff))
    print("discovered streams are correct")

    # Select all catalogs for full table sync
    for test_catalog in found_catalogs:
        additional_md = [{"breadcrumb": [],
                          "metadata": {'replication-method': 'FULL_TABLE'}}]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog,
            menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
            additional_md)

    # Set state to mimic a full table sync that did not complete
    menagerie.set_state(conn_id, self.get_interrupted_state())

    # run full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the target output (schema, record count, message actions)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.table_names(), self.expected_pks())
    self.assertEqual(record_count_by_stream,
                     {'full_table': 4, 'full_table_composite_key': 4})

    records_by_stream = runner.get_records_from_target_output()
    for stream, recs in records_by_stream.items():
        self.assertEqual(recs['schema'],
                         self.expected_schemas()[stream],
                         msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

        messages_for_stream = recs['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]
        self.assertEqual(message_actions,
                         ['upsert', 'upsert', 'upsert', 'upsert', 'activate_version'])

    state = menagerie.get_state(conn_id)
    for tap_stream_id in self.tap_stream_ids():
        bookmark = state['bookmarks'][tap_stream_id]

        # last_pk_fetched and max_pk_values are cleared after a successful full table sync
        self.assertEqual(bookmark, {'initial_full_table_complete': True})
def test_zzzu_run_sync_mode(self):
    # Select our catalogs
    our_catalogs = menagerie.get_catalogs(S3TypesAndData.conn_id)
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(self.conn_id, c['stream_id'])
        metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(self.conn_id, c, c_annotated)

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, self.conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    for stream in self.expected_sync_streams():
        with self.subTest(stream=stream):
            self.assertEqual(
                record_count_by_stream.get(stream, 0),
                S3TypesAndData.expected_stream_row_counts()[stream],
                msg="actual rows: {}, expected rows: {} for stream {} don't match".format(
                    record_count_by_stream.get(stream, 0),
                    S3TypesAndData.expected_stream_row_counts()[stream],
                    stream))

    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()

    # verify that when the header is longer, the trailing columns have null values
    upsert_message_header_longer = [
        m for m in synced_records.get('header_longer').get('messages')
        if m['action'] == 'upsert']
    data_null = [
        d for d in upsert_message_header_longer
        if d["data"]["aa0"] == d["data"]["ab0"] == d["data"]["ac0"]
        == d["data"]["ad0"] == d["data"]["ae0"] is None]
    self.assertEqual(S3TypesAndData.expected_stream_row_counts()['header_longer'],
                     len(data_null))

    # verify that when the header is shorter, _sdc_extra has the extra values
    upsert_message_header_shorter = [
        m for m in synced_records.get('header_shorter').get('messages')
        if m['action'] == 'upsert']
    s3_extra = [d for d in upsert_message_header_shorter
                if len(d["data"]["_sdc_extra"]) == 5]
    self.assertEqual(S3TypesAndData.expected_stream_row_counts()['header_shorter'],
                     len(s3_extra))

    # verify that when one row is shorter and one is longer,
    # one has _sdc_extra and the other has nulls
    upsert_message_rows_longer_shorter = [
        m for m in synced_records.get('rows_longer_and_shorter').get('messages')
        if m['action'] == 'upsert']
    data_null = [
        d for d in upsert_message_rows_longer_shorter
        if d["data"]["v0"] == d["data"]["w0"] == d["data"]["x0"]
        == d["data"]["y0"] == d["data"]["z0"] is None]
    s3_extra = [d for d in upsert_message_rows_longer_shorter
                if len(d["data"].get("_sdc_extra", [])) == 5]
    self.assertTrue(len(data_null) == len(s3_extra) == 1)
def test_run(self):
    """
    Verify that for each stream you can get multiple pages of data
    when no fields are selected and only the automatic fields are replicated.

    PREREQUISITE
    For EACH stream add enough data that you surpass the limit of a single
    fetch of data. For instance, if you have a limit of 250 records ensure
    that 251 (or more) records have been posted for that stream.
    """
    print("\n\nRUNNING {}\n\n".format(self.name()))

    # Reset tracked parent objects prior to the test
    utils.reset_tracked_parent_objects()

    # ensure data exists for sync streams and set expectations
    _, existing_boards = utils.get_total_record_count_and_objects('boards')
    custom_fields_dict = {x: [] for x in self.expected_custom_fields()}  # ids by stream
    custom_fields_by_board = {x.get('id'): copy.deepcopy(custom_fields_dict)
                              for x in existing_boards}  # ids by stream

    # get existing custom fields for each board
    print("Getting objects on board with static custom field set")
    for board_id, board_cfields in custom_fields_by_board.items():
        cfields = utils.get_custom_fields('boards', board_id)
        for field in self.expected_custom_fields():
            cfields_type_field = [f for f in cfields if f['type'] == field]
            if cfields_type_field:
                board_cfields[field] += cfields_type_field

    # get expected cards with custom fields
    expected_records_cfields = list()
    board_id = utils.NEVER_DELETE_BOARD_ID
    all_cards_on_board = utils.get_objects('cards', parent_id=board_id)
    print("Setting custom fields expectations based on static data")
    for card in all_cards_on_board:
        card_with_cfields = utils.get_objects('cards', obj_id=card.get('id'),
                                              parent_id=board_id, custom_fields=True)
        if card_with_cfields:
            expected_records_cfields += card_with_cfields

    # verify at least 1 record exists for each custom field type or else our assertions are invalid
    fields_exist = {x: False for x in self.expected_custom_fields()}
    for record in expected_records_cfields:
        if all(v for _, v in fields_exist.items()):
            break
        value = record.get('value')
        if value:
            key = next(iter(value))
            if key in self.expected_custom_fields() and not fields_exist.get(key):
                fields_exist[key] = True
            elif key == 'checked':
                fields_exist['checkbox'] = True
            elif key == 'option':
                fields_exist['list'] = True

    self.assertTrue(all(v for _, v in fields_exist.items()),
                    msg="Not all custom field types have data. "
                        "Data must be restored manually on the Trello account." +
                        "\nCurrent data: {}".format(fields_exist))

    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # Select all streams and all fields
    self.select_all_streams_and_fields(conn_id, found_catalogs, select_all_fields=True)

    for cat in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])
        for k in self.expected_automatic_fields()[cat['stream_name']]:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                         None)
            print("Validating inclusion on {}: {}".format(cat['stream_name'], mdata))
            self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

    catalogs = menagerie.get_catalogs(conn_id)

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # read target output
    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_record_count_by_stream.values())
    synced_records = runner.get_records_from_target_output()

    # Verify target has records for all synced streams
    for stream, count in first_record_count_by_stream.items():
        assert stream in self.expected_sync_streams()
        self.assertGreater(count, 0,
                           msg="failed to replicate any data for: {}".format(stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Testing streams with custom fields
    for stream in self.testable_streams():
        with self.subTest(stream=stream):
            data = synced_records.get(stream)
            record_messages = [row['data'] for row in data['messages']]
            record_ids = [message.get('id') for message in record_messages]
            record_custom_fields = [message.get('customFieldItems')
                                    for message in record_messages
                                    if message.get('customFieldItems', None)]
            record_cfield_ids = []
            for record in record_custom_fields:
                for cfield in record:
                    record_cfield_ids.append(cfield.get('id'))

            # Verify that we replicated the records with custom fields
            for card in all_cards_on_board:
                if card.get('id') in expected_records_cfields:
                    self.assertIn(card.get('id'), record_ids,
                                  msg="Missing a record that has custom fields:\n{}".format(card.get('id')))

            # Verify that we replicated the expected custom fields on those records
            for expected_cfield in expected_records_cfields:
                self.assertIn(expected_cfield.get('id'), record_cfield_ids,
                              msg="Missing custom field from expected {} record id={}".format(
                                  stream, expected_cfield.get('id')))

                # Verify the expected custom field attributes match the replicated data
                expected_cfield_replicated = False
                for actual_cfields in record_custom_fields:
                    expected_cfield_replicated = expected_cfield in actual_cfields
                    if expected_cfield_replicated:
                        break
                self.assertTrue(expected_cfield_replicated)

    # Reset the parent objects that we have been tracking
    utils.reset_tracked_parent_objects()
def test_run(self): """Test we get a lot of data back based on the start date configured in base""" streams_under_test = self.streams_under_test() conn_id = self.create_connection_with_initial_discovery() # Select streams and all fields within streams found_catalogs = menagerie.get_catalogs(conn_id) our_catalogs = [catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in streams_under_test] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True) # Run a sync job using orchestrator first_sync_record_count = self.run_sync(conn_id) # get results first_sync_records = runner.get_records_from_target_output() state = menagerie.get_state(conn_id) # set the start date for a new connection based off state bookmarked_values = [] expected_replication_keys_by_stream = self.expected_replication_keys() for stream in streams_under_test: replication_key = list(expected_replication_keys_by_stream[stream])[0] bookmarked_values.append(state['bookmarks'][stream][replication_key]) # grab the most recent bookmark from state greatest_bookmark_value = sorted(bookmarked_values)[-1].split("T")[0] start_date = self.timedelta_formatted(greatest_bookmark_value, days=-1, str_format="%Y-%m-%d") # BUG_TDL-19582 # start_date = self.timedelta_formatted(greatest_bookmark_value, days=0, str_format="%Y-%m-%d") # BUG_TDL-19582 self.start_date = start_date + "T00:00:00Z" # create a new connection with the new more recent start_date conn_id = self.create_connection_with_initial_discovery(original_properties=False) # Select all streams and all fields within streams found_catalogs = menagerie.get_catalogs(conn_id) our_catalogs = [catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in streams_under_test] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True) # Run a sync job using orchestrator with a more recent start date second_sync_record_count = self.run_sync(conn_id) # get results second_sync_records = runner.get_records_from_target_output() for stream in streams_under_test: with self.subTest(stream=stream): # gather expectations replication_key = list(expected_replication_keys_by_stream[stream])[0] # get results record_messages = [message['data'] for message in second_sync_records[stream]['messages'] if message.get('action') == 'upsert'] if stream == 'issues': replication_key_values = [record_message['fields'][replication_key] for record_message in record_messages] else: replication_key_values = [record_message[replication_key] for record_message in record_messages] max_replication_key_value = sorted(replication_key_values)[-1] # verify that each stream has less records than the first connection sync self.assertGreater( first_sync_record_count.get(stream, 0), second_sync_record_count.get(stream, 0), msg="second had more records, start_date usage not verified", logging="verify less records are replicated with a more recent start date" ) # verify all data from 2nd sync >= start_date self.assertGreaterEqual( parse(max_replication_key_value), parse(self.start_date), logging="verify on second sync no records are replicated prior to the start date" )
def test_discovery(self):
    """
    Verify that discover creates the appropriate catalog, schema, metadata, etc.
    """
    found_catalogs = menagerie.get_catalogs(S3TypesAndData.conn_id)

    # verify that the number of streams is correct based on the configuration
    self.assertEqual(len(found_catalogs), len(self.expected_streams()),
                     "The number of catalogs doesn't match "
                     "the number of tables in the configuration")

    # verify the stream names are the names in the config file -- with " " -> "_"
    found_stream_names = {x["stream_name"] for x in found_catalogs}
    self.assertEqual(found_stream_names, self.expected_stream_names())

    # verify the number of top level objects in the schema is correct
    for catalog in found_catalogs:
        with self.subTest(c=catalog):
            stream_name = catalog["stream_name"]
            files_for_stream = list(EXPECTED_STREAMS_TO_RESOURCES[stream_name])
            expected_properties = S3TypesAndData.columns_in_header_of_csv_file(
                files_for_stream).union(S3TypesAndData.stitch_added_columns())

            metadata_and_annotated_schema = menagerie.get_annotated_schema(
                S3TypesAndData.conn_id, catalog['stream_id'])
            annotated_schema = metadata_and_annotated_schema["annotated-schema"]
            metadata = metadata_and_annotated_schema["metadata"]

            # verify that the annotated schema has the correct number of properties
            self.assertEqual(len(expected_properties),
                             len(annotated_schema.get("properties").keys()))

            # verify that the metadata has the correct number of breadcrumbs with properties
            properties_metadata = [x for x in metadata
                                   if "properties" in x.get("breadcrumb")]
            self.assertEqual(len(expected_properties), len(properties_metadata))

            # verify that all non-pks are given the inclusion of available in the annotated schema
            expected_key_properties = S3TypesAndData.expected_pks()[stream_name]
            expected_not_pk_properties = expected_properties.difference(expected_key_properties)
            actual_available_properties = {
                k for k, v in annotated_schema["properties"].items()
                if v.get("inclusion") == "available"}
            self.assertEqual(actual_available_properties, expected_not_pk_properties)

            # verify that all non-pks are given the inclusion of available in metadata.
            # make sure that we use problematic characters for header names
            # - space " ", dash "-", underscore "_", comma ",", etc.
            actual_available_properties = {
                item.get("breadcrumb", ["", ""])[1]
                for item in metadata
                if item.get("metadata").get("inclusion") == "available"}
            self.assertEqual(actual_available_properties, expected_not_pk_properties)
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog,
            menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    future_time = "2050-01-01T00:00:00.000000Z"

    # set state with every bookmark in the future
    future_bookmarks = {
        "currently_syncing": None,
        "bookmarks": {
            "contacts": {"offset": {}, "versionTimestamp": future_time},
            "subscription_changes": {"startTimestamp": future_time, "offset": {}},
            "campaigns": {"offset": {}},
            "forms": {"updatedAt": future_time},
            "deals": {"offset": {}, "hs_lastmodifieddate": future_time},
            "workflows": {"updatedAt": future_time},
            "owners": {"updatedAt": future_time},
            "contact_lists": {"updatedAt": future_time, "offset": {}},
            "email_events": {"startTimestamp": future_time, "offset": {}},
            "companies": {"offset": {}, "hs_lastmodifieddate": future_time},
            "engagements": {"lastUpdated": future_time, "offset": {}}
        }
    }

    menagerie.set_state(conn_id, future_bookmarks)

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # because the bookmarks were set in the future, we should NOT replicate any data,
    # except campaigns and deal_pipelines because those endpoints do NOT support bookmarks
    streams_with_bookmarks = self.expected_sync_streams()
    streams_with_bookmarks.remove('campaigns')
    streams_with_bookmarks.remove('deal_pipelines')
    bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
    self.assertEqual(len(bad_streams), 0,
                     msg="still pulled down records from {} despite future bookmarks".format(bad_streams))

    state = menagerie.get_state(conn_id)

    # NB: Companies and engagements won't set a bookmark in the future.
    state["bookmarks"].pop("companies")
    state["bookmarks"].pop("engagements")
    future_bookmarks["bookmarks"].pop("companies")
    future_bookmarks["bookmarks"].pop("engagements")

    self.assertEqual(state, future_bookmarks,
                     msg="state should not have been modified because we didn't replicate any data")

    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())
def run_single_projection(self, projection_mapping):
    self.setUpDatabase()
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # -----------  Discovery --------
    # -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    for tap_stream_id in self.expected_check_streams():
        found_stream = [c for c in found_catalogs
                        if c['tap_stream_id'] == tap_stream_id][0]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[found_stream['stream_name']],
            set(found_stream.get('metadata', {}).get('table-key-properties')))

        # assert that the row counts are correct
        self.assertEqual(
            self.expected_row_counts()[found_stream['stream_name']],
            found_stream.get('metadata', {}).get('row-count'))

    # -----------------------------------
    # ----------- Initial Full Table ----
    # -----------------------------------

    # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
        additional_md = [{"breadcrumb": [],
                          "metadata": {'replication-method': 'LOG_BASED'}}]
        if projection_mapping['projection'] is not None:
            additional_md[0]['metadata']['tap_mongodb.projection'] = json.dumps(
                projection_mapping['projection'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    for stream_name in self.expected_sync_streams():
        stream_records = [x for x in messages_by_stream[stream_name]['messages']
                          if x.get('action') == 'upsert']
        # actual_keys = set()
        for record in stream_records:
            self.assertIn(record['data'].keys(), projection_mapping['expected_keys'])
            # actual_keys = actual_keys.union(set(record['data'].keys()))
        # self.assertTrue(actual_keys.issubset(projection_mapping['expected_keys']))

    self.modify_database()

    # --------------------------------------------
    # ----------- Subsequent Oplog Sync ----------
    # --------------------------------------------

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    for stream_name in self.expected_sync_streams():
        stream_records = [x for x in messages_by_stream[stream_name]['messages']
                          if x.get('action') == 'upsert']
        # actual_keys = set()
        for record in stream_records:
            self.assertIn(record['data'].keys(), projection_mapping['expected_keys'])
def test_run(self): """ Verify that a bookmark doesn't exist for the stream Verify that the second sync includes the same number or more records than the first sync Verify that all records in the first sync are included in the second sync Verify that the sync only sent records to the target for selected streams (catalogs) PREREQUISITE For EACH stream that is fully replicated there are multiple rows of data with different values for the replication key """ conn_id = self.create_connection() # Select all streams and no fields within streams found_catalogs = menagerie.get_catalogs(conn_id) full_streams = { key for key, value in self.expected_replication_method().items() if value == self.FULL } our_catalogs = [ catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in full_streams ] self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True) # Run a sync job using orchestrator first_sync_record_count = self.run_sync(conn_id) # verify that the sync only sent records to the target for selected streams (catalogs) self.assertEqual(set(first_sync_record_count.keys()), full_streams) first_sync_state = menagerie.get_state(conn_id) # Get the set of records from a first sync first_sync_records = runner.get_records_from_target_output() # Run a second sync job using orchestrator second_sync_record_count = self.run_sync(conn_id) # Get the set of records from a second sync second_sync_records = runner.get_records_from_target_output() # THIS MAKES AN ASSUMPTION THAT CHILD STREAMS DO NOT NEED TESTING. # ADJUST IF NECESSARY for stream in full_streams.difference(self.child_streams()): with self.subTest(stream=stream): # verify there is no bookmark values from state state_value = first_sync_state.get("bookmarks", {}).get(stream) self.assertIsNone(state_value) # verify that there is more than 1 record of data - setup necessary self.assertGreater( first_sync_record_count.get(stream, 0), 1, msg="Data isn't set up to be able to test full sync") # verify that you get the same or more data the 2nd time around self.assertGreaterEqual( second_sync_record_count.get(stream, 0), first_sync_record_count.get(stream, 0), msg= "second syc didn't have more records, full sync not verified" ) # verify all data from 1st sync included in 2nd sync first_data = [ record["data"] for record in first_sync_records.get( stream, {}).get("messages", {"data": {}}) ] second_data = [ record["data"] for record in second_sync_records.get( stream, {}).get("messages", {"data": {}}) ] same_records = 0 for first_record in first_data: first_value = json.dumps(first_record, sort_keys=True) for compare_record in second_data: compare_value = json.dumps(compare_record, sort_keys=True) if first_value == compare_value: second_data.remove(compare_record) same_records += 1 break self.assertEqual( len(first_data), same_records, msg= "Not all data from the first sync was in the second sync")
def test_run(self):
    (table_configs, conn_id, expected_streams) = self.pre_sync_test()

    # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
    found_catalogs = menagerie.get_catalogs(conn_id)
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED',
                'tap-mongodb.projection': table_configs[0]['ProjectionExpression']
            }
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    self.first_sync_test(table_configs, conn_id, expected_streams)

    ################################
    # Run sync SECOND TIME and check that no records came through
    ################################
    # Disabling streams forces shards to close
    self.disableStreams(expected_streams)
    sync_job_name = runner.run_sync_mode(self, conn_id)
    self.enableStreams(expected_streams)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # Check that we only have 1 message (activate_version) when syncing
    # a stream without changes
    for stream in messages_by_stream.values():
        self.assertEqual(1, len(stream['messages']))

    menagerie.get_state(conn_id)

    # Add 10 rows to the DB
    self.addMoreData(10)
    # Delete some rows
    self.deleteData(range(40, 50))
    # Change some rows
    self.updateData(10, 60, 'boolean_field', False)

    ################################
    # Run sync THIRD TIME and check that records did come through
    ################################
    # Disabling streams forces shards to close
    self.disableStreams(expected_streams)
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    for config in table_configs:
        table_name = config['TableName']

        for message in messages_by_stream[table_name]['messages']:
            if message['action'] == 'upsert':
                if not message['data'].get('_sdc_deleted_at'):
                    top_level_keys = {*message['data'].keys()}
                    self.assertEqual(config['top_level_keys'], top_level_keys)
                    for list_key in config['top_level_list_keys']:
                        self.assertTrue(isinstance(message['data'][list_key], list))
                    self.assertEqual(config['nested_map_keys']['map_field'],
                                     {*message['data']['map_field'].keys()})

    # Check that 31 messages came through
    # (10 upserts, 10 deletes, 10 updated records and 1 activate_version)
    for stream in messages_by_stream.values():
        self.assertEqual(31, len(stream['messages']))

    menagerie.get_state(conn_id)
def discovery_test(self, conn_id):
    """
    Basic Discovery Test for a database tap.

    Test Description:
      Ensure discovery runs without errant exit codes and generates a catalog of the expected form.

    Test Cases:
    - Verify discovery generated the expected catalogs by name.
    - Verify that the table_name is in the format <collection_name> for each stream.
    - Verify the catalog is found for a given stream.
    - Verify there is only 1 top level breadcrumb in metadata for a given stream.
    - Verify replication key(s) match expectations for a given stream.
    - Verify primary key(s) match expectations for a given stream.
    - Verify the replication method matches our expectations for a given stream.
    - Verify that only primary keys are given the inclusion of automatic in metadata for a given stream.
    - Verify expected unsupported fields are given the inclusion of unsupported in metadata for a given stream.
    - Verify that all fields for a given stream which are not unsupported or automatic have inclusion of available.
    - Verify row-count metadata matches expectations for a given stream.
    - Verify selected metadata is None for all streams.
    - Verify is-view metadata is False for a given stream.
    - Verify no forced-replication-method is present in metadata for a given stream.
    - Verify schema and db match expectations for a given stream.
    - Verify schema types match expectations for a given stream.
    """
    ##########################################################################
    ### TODO
    ### [] Generate multiple tables (streams) and maybe dbs too?
    ### [] Investigate potential bug, see DOCS_BUG_1
    ##########################################################################

    # run discovery (check mode)
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify discovery generated a catalog
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0)

    # Verify discovery generated the expected catalogs by name
    found_catalog_names = {catalog['stream_name'] for catalog in found_catalogs}
    self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

    # Verify that the table_name is in the format <collection_name> for each stream
    found_catalog_stream_ids = {catalog['tap_stream_id'] for catalog in found_catalogs}
    self.assertSetEqual(self.expected_check_stream_ids(), found_catalog_stream_ids)

    # Test by stream
    for stream in self.expected_check_streams():
        with self.subTest(stream=stream):

            # Verify the catalog is found for a given stream
            catalog = next(iter([catalog for catalog in found_catalogs
                                 if catalog["stream_name"] == stream]))
            self.assertTrue(isinstance(catalog, dict))

            # collecting expected values
            expected_primary_keys = self.expected_primary_keys()[stream]
            expected_replication_keys = set()
            expected_unsupported_fields = self.expected_unsupported_fields()
            expected_fields_to_datatypes = self.expected_schema_types()
            expected_row_count = len(self.recs)

            # collecting actual values...
            schema_and_metadata = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
            stream_metadata = schema_and_metadata["metadata"]
            top_level_metadata = [item for item in stream_metadata
                                  if item.get("breadcrumb") == []]
            stream_properties = top_level_metadata[0]['metadata']
            actual_primary_keys = set(stream_properties.get(self.PRIMARY_KEYS, []))
            actual_replication_keys = set(stream_properties.get(self.REPLICATION_KEYS, []))
            actual_replication_method = stream_properties.get(self.REPLICATION_METHOD)
            actual_automatic_fields = set(
                item.get("breadcrumb", ["properties", None])[1]
                for item in stream_metadata
                if item.get("metadata").get("inclusion") == "automatic")
            actual_unsupported_fields = set(
                item.get("breadcrumb", ["properties", None])[1]
                for item in stream_metadata
                if item.get("metadata").get("inclusion") == "unsupported")
            actual_fields_to_datatypes = {
                item['breadcrumb'][1]: item['metadata'].get('sql-datatype')
                for item in stream_metadata[1:]}

            # Verify there is only 1 top level breadcrumb in metadata
            self.assertEqual(1, len(top_level_metadata))

            # Verify replication key(s) match expectations
            self.assertSetEqual(expected_replication_keys, actual_replication_keys)

            # NB | We expect primary keys and replication keys to have inclusion automatic for
            #      key-based incremental replication. But that is only true for primary keys here.
            #      This BUG should not be carried over into hp-postgres, but will not be fixed for this tap.

            # Verify primary key(s) match expectations
            self.assertSetEqual(expected_primary_keys, actual_primary_keys)

            # Verify the replication method matches our expectations
            self.assertIsNone(actual_replication_method)

            # Verify that only primary keys
            # are given the inclusion of automatic in metadata.
            self.assertSetEqual(expected_primary_keys, actual_automatic_fields)

            # DOCS_BUG_1 ? | The following types were converted and selected, but docs say unsupported.
            #                Still need to investigate how the tap handles values of these datatypes
            #                during sync.
            KNOWN_MISSING = {
                'invalid_bigserial',    # BIGSERIAL -> bigint
                'invalid_serial',       # SERIAL -> integer
                'invalid_smallserial',  # SMALLSERIAL -> smallint
            }

            # Verify expected unsupported fields
            # are given the inclusion of unsupported in metadata.
            self.assertSetEqual(expected_unsupported_fields,
                                actual_unsupported_fields | KNOWN_MISSING)

            # Verify that all other fields have inclusion of available
            # This assumes there are no unsupported fields for SaaS sources
            self.assertTrue(
                all({item.get("metadata").get("inclusion") == "available"
                     for item in stream_metadata
                     if item.get("breadcrumb", []) != []
                     and item.get("breadcrumb", ["properties", None])[1] not in actual_automatic_fields
                     and item.get("breadcrumb", ["properties", None])[1] not in actual_unsupported_fields}),
                msg="Not all non key properties are set to available in metadata")

            # Verify row-count metadata matches expectations
            self.assertEqual(expected_row_count, stream_properties['row-count'])

            # Verify selected metadata is None for all streams
            self.assertNotIn('selected', stream_properties.keys())

            # Verify is-view metadata is False
            self.assertFalse(stream_properties['is-view'])

            # Verify no forced-replication-method is present in metadata
            self.assertNotIn(self.REPLICATION_METHOD, stream_properties.keys())

            # Verify schema and db match expectations
            self.assertEqual(test_schema_name, stream_properties['schema-name'])
            self.assertEqual(test_db, stream_properties['database-name'])

            # Verify schema types match expectations
            self.assertDictEqual(expected_fields_to_datatypes, actual_fields_to_datatypes)
def test_run(self): """ Verify that for each stream you can get data when no fields are selected and only the automatic fields are replicated. """ print("\n\nRUNNING {}\n\n".format(self.name())) # Initialize start_date state to capture ad_reports records self.START_DATE = self.timedelta_formatted(self.REPORTS_START_DATE, -1) self.END_DATE = self.REPORTS_END_DATE print( "INCREMENTAL STREAMS RELY ON A STATIC DATA SET. SO WE TEST WITH:\n" + " START DATE 1 | {}\n".format(self.START_DATE) + " END DATE 2 | {}".format(self.END_DATE)) # ensure data exists for sync streams and set expectations expected_records_all = {x: [] for x in self.expected_streams() } # all fields selected expected_records_auto = {x: [] for x in self.expected_streams() } # no fields selected for stream in self.testable_streams(): start_date = self.parse_date(self.START_DATE) end_date = self.parse_date(self.END_DATE) existing_objects = self.client.get_all(stream, start_date, end_date) assert existing_objects, "Test data is not properly set for {}, test will fail.".format( stream) print("Data exists for stream: {}".format(stream)) for obj in existing_objects: expected_records_all[stream].append(obj) expected_records_auto[stream].append({ field: obj.get(field) for field in self.expected_automatic_fields().get(stream) }) # format expected data to match expected output of tap self.format_expected_data(expected_records_all) # Instantiate connection with default start/end dates conn_id = connections.ensure_connection(self, original_properties=False) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are OK") ########################################################################## ### ALL FIELDS SYNC ########################################################################## # Select all available fields from all streams exclude_streams = self.expected_streams().difference( self.testable_streams()) self.select_all_streams_and_fields(conn_id=conn_id, catalogs=found_catalogs, select_all_fields=True, exclude_streams=exclude_streams) catalogs = menagerie.get_catalogs(conn_id) # Ensure our selection worked for cat in found_catalogs: catalog_entry = menagerie.get_annotated_schema( conn_id, cat['stream_id']) # Verify only testable streams are selected selected = catalog_entry.get('annotated-schema').get('selected') print("Validating selection on {}: {}".format( cat['stream_name'], selected)) if not cat['stream_name'] in self.testable_streams(): # None expected for {'inclusion':'available'} happens when menagerie "deselects" stream self.assertTrue(selected is None, msg="Stream is selected, but shouldn't be.") continue self.assertTrue(selected, msg="Stream not selected.") # Verify all fields within each selected stream are selected for field, field_props in catalog_entry.get( 'annotated-schema').get('properties').items(): field_selected = field_props.get('selected') print("\tValidating selection on {}.{}: {}".format( cat['stream_name'], 
field, field_selected)) self.assertTrue(field_selected, msg="Field not selected.") #clear state menagerie.set_state(conn_id, {}) # run sync with all fields selected sync_job_name_all = runner.run_sync_mode(self, conn_id) # Verify tap exit codes exit_status_all = menagerie.get_exit_status(conn_id, sync_job_name_all) menagerie.verify_sync_exit_status(self, exit_status_all, sync_job_name_all) # read target output record_count_by_stream_all = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys()) replicated_row_count_all = reduce(lambda accum, c: accum + c, record_count_by_stream_all.values()) synced_records_all = runner.get_records_from_target_output() # Verify target has records for all synced streams for stream, count in record_count_by_stream_all.items(): assert stream in self.expected_streams() if stream in self.testable_streams(): self.assertGreater( count, 0, msg="failed to replicate any data for: {}".format(stream)) print( "total replicated row count: {}".format(replicated_row_count_all)) ########################################################################## ### AUTOMATIC FIELDS SYNC ########################################################################## # Instantiate connection with default start/end dates conn_id = connections.ensure_connection(self, original_properties=False) # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) diff = self.expected_check_streams().symmetric_difference( found_catalog_names) self.assertEqual( len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) print("discovered schemas are OK") # Select no available fields (only automatic) for all testable streams self.select_all_streams_and_fields(conn_id=conn_id, catalogs=found_catalogs, select_all_fields=False, exclude_streams=exclude_streams) catalogs = menagerie.get_catalogs(conn_id) # Ensure our selection worked for cat in found_catalogs: catalog_entry = menagerie.get_annotated_schema( conn_id, cat['stream_id']) # Verify all streams are selected selected = catalog_entry.get('annotated-schema').get('selected') print("Validating selection on {}: {}".format( cat['stream_name'], selected)) if not cat['stream_name'] in self.testable_streams(): self.assertTrue(selected is None, msg="Stream is selected, but shouldn't be.") continue self.assertTrue(selected, msg="Stream not selected.") # Verify only automatic fields are selected for field, field_props in catalog_entry.get( 'annotated-schema').get('properties').items(): field_selected = field_props.get('selected') print("\tValidating selection on {}.{}: {}".format( cat['stream_name'], field, field_selected)) if field in self.expected_automatic_fields().get( cat['stream_name']): # NOTE: AUTOMATIC FIELDS IGNORE THE SELECTED md {'selected': None} print( "NOTE: selection for {} is ignored by the Transformer " .format(field) + " so long as 'inlcusion' = 'automatic'") else: self.assertFalse( field_selected, msg="Field is selected but not automatic.") # run sync with no fields selected (only automatic) sync_job_name_auto = runner.run_sync_mode(self, conn_id) # Verify tap exit codes 
exit_status_auto = menagerie.get_exit_status(conn_id, sync_job_name_auto) menagerie.verify_sync_exit_status(self, exit_status_auto, sync_job_name_auto) # read target output record_count_by_stream_auto = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys()) replicated_row_count_auto = reduce( lambda accum, c: accum + c, record_count_by_stream_auto.values()) synced_records_auto = runner.get_records_from_target_output() # Verify target has records for all synced streams for stream, count in record_count_by_stream_auto.items(): assert stream in self.expected_streams() if stream in self.testable_streams(): self.assertGreater( count, 0, msg="failed to replicate any data for: {}".format(stream)) print( "total replicated row count: {}".format(replicated_row_count_auto)) # Test by Stream for stream in self.testable_streams(): with self.subTest(stream=stream): ########################################################################## ### TESTING ALL FIELDS ########################################################################## data = synced_records_all.get(stream) record_messages_keys = [ set(row['data'].keys()) for row in data['messages'] ] expected_keys = expected_records_all.get(stream)[0].keys() # Verify schema covers all fields schema_keys = set(self.expected_schema_keys(stream)) self.assertEqual( set(), set(expected_keys).difference(schema_keys), msg="\nFields missing from schema: {}\n".format( set(expected_keys).difference(schema_keys))) # not a test, just logging the fields that are included in the schema but not in the expectations if schema_keys.difference(set(expected_keys)): print( "WARNING Fields missing from expectations: {}".format( schema_keys.difference(set(expected_keys)))) # Verify that all fields are sent to the target for actual_keys in record_messages_keys: self.assertEqual( actual_keys.symmetric_difference(schema_keys), set(), msg="Expected all fields, as defined by schemas/{}.json" .format(stream)) actual_records = [row['data'] for row in data['messages']] expected_records = expected_records_all.get(stream) # Verify the number of records match expectations self.assertEqual(len(expected_records), len(actual_records), msg="Number of actual records do match expectations. " +\ "Check expectations, check for duplicate records in Target.") # verify there are no dup records in the target already_tracked = [] for actual_record in actual_records: if actual_record in already_tracked: continue already_tracked.append(actual_record) self.assertEqual(len(already_tracked), len(actual_records), msg="DUPLICATES PRESENT") # verify by values, that we replicated the expected records for actual_record in actual_records: self.assertTrue( actual_record in expected_records, msg="Actual record missing from expectations\n" + "Actual Record: {}".format(actual_record)) for expected_record in expected_records: self.assertTrue( expected_record in actual_records, msg="Expected record missing from target." 
+ "Expected Record: {}".format(expected_record)) ########################################################################## ### TESTING AUTOMATIC FIELDS ########################################################################## data = synced_records_auto.get(stream) record_messages_keys = [ set(row['data'].keys()) for row in data['messages'] ] expected_keys = self.expected_automatic_fields().get(stream) # Verify that only the automatic fields are sent to the target for actual_keys in record_messages_keys: self.assertEqual( actual_keys.symmetric_difference(expected_keys), set(), msg="Expected automatic fields and nothing else.") actual_records = [row['data'] for row in data['messages']] expected_records = expected_records_auto.get(stream) #Verify the number of records match expectations self.assertEqual(len(expected_records), len(actual_records), msg="Number of actual records do match expectations. " +\ "We probably have duplicate records.") # verify there are no dup records in the target already_tracked = [] for actual_record in actual_records: if actual_record in already_tracked: continue already_tracked.append(actual_record) self.assertEqual(len(already_tracked), len(actual_records), msg="DUPLICATES PRESENT") # verify by values, that we replicated the expected records for actual_record in actual_records: self.assertTrue( actual_record in expected_records, msg="Actual record missing from expectations\n" + "Actual Record: {}".format(actual_record)) for expected_record in expected_records: self.assertTrue( expected_record in actual_records, msg="Expected record missing from target." + "Expected Record: {}".format(expected_record))
def test_run(self): """ Verify that a full sync can send capture all data and send it in the correct format for integer and boolean (bit) data. Verify that the fist sync sends an activate immediately. Verify that the table version is incremented up """ print("running test {}".format(self.name())) conn_id = self.create_connection() # run in check mode check_job_name = runner.run_check_mode(self, conn_id) # verify check exit codes exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # get the catalog information of discovery found_catalogs = menagerie.get_catalogs(conn_id) additional_md = [{ "breadcrumb": [], "metadata": { 'replication-method': 'LOG_BASED' } }] BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md) # clear state menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # verify record counts of streams record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } # self.assertEqual(record_count_by_stream, expected_count) # verify records match on the first sync records_by_stream = runner.get_records_from_target_output() table_version = dict() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] table_version[stream] = records_by_stream[stream][ 'table_version'] # verify on the first sync you get # activate version message before and after all data for the full table # and before the logical replication part if records_by_stream[stream]['messages'][-1].get("data"): last_row_data = True else: last_row_data = False self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertEqual( records_by_stream[stream]['messages'][-2]['action'], 'activate_version') if last_row_data: self.assertEqual( records_by_stream[stream]['messages'][-3]['action'], 'activate_version') else: self.assertEqual( records_by_stream[stream]['messages'][-1]['action'], 'activate_version') self.assertEqual( len([ m for m in records_by_stream[stream]['messages'][1:] if m["action"] == "activate_version" ]), 2, msg= "Expect 2 more activate version messages for end of full table and beginning of log based" ) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # Verify all data is correct for the full table part if last_row_data: final_row = -3 else: final_row = -2 for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream] ['messages'][1:final_row])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") self.assertEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds 
self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, time): # sql server time has second resolution only self.assertEqual( expected_value.replace( microsecond=0).isoformat().replace( '+00:00', ''), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, date): # sql server time has second resolution only self.assertEqual( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value, actual_row["data"][column_name])) # Verify all data is correct for the log replication part if sent if records_by_stream[stream]['messages'][-1].get("data"): for column_name, expected_value in expected_messages[-1][ "data"].items(): if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace('+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, time): # sql server time has second resolution only self.assertEqual( expected_value.replace( microsecond=0).isoformat().replace( '+00:00', ''), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, date): # sql server time has second resolution only self.assertEqual( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value, actual_row["data"][column_name])) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") inital_log_version = bookmark['current_log_version'] self.assertEqual( bookmark['version'], table_version[stream], msg="expected bookmark for stream to match version") expected_schemas = self.expected_metadata()[stream]['schema'] self.assertEqual(records_by_stream[stream]['schema'], expected_schemas, msg="expected: {} != actual: {}".format( expected_schemas, records_by_stream[stream]['schema'])) # ---------------------------------------------------------------------- # invoke the sync job AGAIN and after insert, update, delete or rows # 
---------------------------------------------------------------------- database_name = "data_types_database" schema_name = "dbo" table_name = "dates_and_times" column_name = [ "pk", "just_a_date", "date_and_time", "bigger_range_and_precision_datetime", "datetime_with_timezones", "datetime_no_seconds", "its_time" ] new_date_value = datetime(2019, 7, 22, 21, 11, 40, 573000) insert_value = [ (6, new_date_value.date(), new_date_value, datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc), datetime(5749, 4, 3, 1, 47, 47, 110809, tzinfo=timezone(timedelta(hours=10, minutes=5))).isoformat(), datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc), time(21, 9, 56, 0, tzinfo=timezone.utc)) ] update_value = [ (2, new_date_value.date(), new_date_value, datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc), datetime(5749, 4, 3, 1, 47, 47, 110809, tzinfo=timezone(timedelta(hours=10, minutes=5))).isoformat(), datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc), time(21, 9, 56, 0, tzinfo=timezone.utc)) ] delete_value = [(3, )] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend( delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend( update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [ (6, new_date_value.date(), new_date_value, datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc), datetime(5749, 4, 3, 1, 47, 47, 110809, tzinfo=timezone(timedelta( hours=10, minutes=5))).astimezone(timezone.utc), datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc), time(21, 9, 56, 0, tzinfo=timezone.utc)) ] update_value = [ (2, new_date_value.date(), new_date_value, datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc), datetime(5749, 4, 3, 1, 47, 47, 110809, tzinfo=timezone(timedelta( hours=10, minutes=5))).astimezone(timezone.utc), datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc), time(21, 9, 56, 0, tzinfo=timezone.utc)) ] insert_value = [insert_value[0] + (None, )] update_value = [update_value[0] + (None, )] delete_value = [(3, None, None, None, None, None, None, datetime.utcnow())] self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"]["values"] = \ [self.expected_metadata()["data_types_database_dbo_dates_and_times"]["values"][-1]] + \ insert_value + delete_value + update_value self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][ "fields"].append({ "_sdc_deleted_at": { 'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic' } }) sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = { k: len(v['values']) for k, v in self.expected_metadata().items() } self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in self.expected_streams(): with self.subTest(stream=stream): stream_expected_data = self.expected_metadata()[stream] new_table_version = records_by_stream[stream]['table_version'] # verify on a subsequent sync you get activate version message only after all data self.assertEqual( records_by_stream[stream]['messages'][0]['action'], 'activate_version') self.assertTrue( all([ 
message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:] ])) column_names = [ list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS] ] expected_messages = [{ "action": "upsert", "data": { column: value for column, value in list( zip(column_names, stream_expected_data[self.VALUES] [row])) } } for row in range(len(stream_expected_data[self.VALUES]))] # remove sequences from actual values for comparison [ message.pop("sequence") for message in records_by_stream[stream]['messages'][1:] ] # Verify all data is correct for expected_row, actual_row in list( zip(expected_messages, records_by_stream[stream]['messages'][1:])): with self.subTest(expected_row=expected_row): self.assertEqual(actual_row["action"], "upsert") # we only send the _sdc_deleted_at column for deleted rows self.assertGreaterEqual( len(expected_row["data"].keys()), len(actual_row["data"].keys()), msg="there are not the same number of columns") for column_name, expected_value in expected_row[ "data"].items(): if column_name != "_sdc_deleted_at": if isinstance(expected_value, datetime): # sql server only keeps milliseconds not microseconds self.assertEqual( expected_value.isoformat().replace( '000+00:00', 'Z').replace( '+00:00', 'Z').replace('000', 'Z'), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '000+00:00', 'Z').replace( '+00:00', 'Z').replace('000', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, time): # sql server time has second resolution only self.assertEqual( expected_value.replace( microsecond=0).isoformat().replace( '+00:00', ''), actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat().replace( '+00:00', 'Z'), actual_row["data"][column_name])) elif isinstance(expected_value, date): # sql server time has second resolution only self.assertEqual( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value.isoformat() + 'T00:00:00+00:00', actual_row["data"][column_name])) else: self.assertEqual( expected_value, actual_row["data"][column_name], msg="expected: {} != actual {}".format( expected_value, actual_row["data"][column_name])) elif expected_value: # we have an expected value for a deleted row try: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%S.%fZ") except ValueError: actual_value = datetime.strptime( actual_row["data"][column_name], "%Y-%m-%dT%H:%M:%SZ") self.assertGreaterEqual( actual_value, expected_value - timedelta(seconds=15)) self.assertLessEqual( actual_value, expected_value + timedelta(seconds=15)) else: # the row wasn't deleted so we can either not pass the column or it can be None self.assertIsNone( actual_row["data"].get(column_name)) print("records are correct for stream {}".format(stream)) # verify state and bookmarks state = menagerie.get_state(conn_id) bookmark = state['bookmarks'][stream] self.assertIsNone( state.get('currently_syncing'), msg="expected state's currently_syncing to be None") self.assertIsNotNone( bookmark.get('current_log_version'), msg= "expected bookmark to have current_log_version because we are using log replication" ) self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete") new_log_version = bookmark['current_log_version'] self.assertGreater(new_log_version, inital_log_version, msg='expected log version to increase') self.assertEqual( 
            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(
                state.get('currently_syncing'),
                msg="expected state's currently_syncing to be None")
            self.assertIsNotNone(
                bookmark.get('current_log_version'),
                msg="expected bookmark to have current_log_version because we are using log replication")
            self.assertTrue(bookmark['initial_full_table_complete'],
                            msg="expected full table to be complete")

            new_log_version = bookmark['current_log_version']
            self.assertGreater(new_log_version, inital_log_version,
                               msg='expected log version to increase')

            self.assertEqual(bookmark['version'], table_version[stream],
                             msg="expected bookmark for stream to match version")
            self.assertEqual(bookmark['version'], new_table_version,
                             msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas, records_by_stream[stream]['schema']))
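
# ----------------------------------------------------------------------
# The tests above build their T-SQL through helpers (insert, update_by_pk,
# delete_by_pk, mssql_cursor_context_manager) defined elsewhere in the suite.
# The sketch below only illustrates the kind of statements delete_by_pk is
# assumed to produce; the real helper's signature and SQL generation may differ.
def sketch_delete_by_pk(database, schema, table, rows, pk_columns):
    """Build one DELETE statement per row, matching on the primary-key columns."""
    statements = []
    for row in rows:
        predicate = " AND ".join(
            "{} = '{}'".format(column, value)
            for column, value in zip(pk_columns, row))
        statements.append("DELETE FROM {}.{}.{} WHERE {}".format(
            database, schema, table, predicate))
    return statements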
def test_run(self):
    """Verify that a full sync captures all data and sends it in the correct
    format for integer and boolean (bit) data.

    Verify that the first sync sends an activate version message immediately.
    Verify that the table version is incremented.
    """
    print("running test {}".format(self.name()))

    conn_id = self.create_connection()

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # get the catalog information of discovery
    found_catalogs = menagerie.get_catalogs(conn_id)
    additional_md = [{"breadcrumb": [],
                      "metadata": {'replication-method': 'INCREMENTAL',
                                   'replication-key': 'replication_key_column'}}]
    non_selected_properties = []
    BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs,
                                              additional_md=additional_md)

    # clear state
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify record counts of streams
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
    expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
    self.assertEqual(record_count_by_stream, expected_count)

    # verify records match on the first sync
    records_by_stream = runner.get_records_from_target_output()

    table_version = dict()
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            table_version[stream] = records_by_stream[stream]['table_version']

            # verify on the first sync you get an activate version message
            # before and after all the data
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'], 'activate_version')
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'], 'activate_version')
            self.assertTrue(
                all([m["action"] == "upsert"
                     for m in records_by_stream[stream]['messages'][1:-1]]),
                msg="Expect all but the first and last messages to be upserts")
            self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]),
                             len(stream_expected_data[self.VALUES]),
                             msg="incorrect number of upserts")

            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]
            replication_column = column_names.index("replication_key_column")
            expected_messages = [
                {
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names, row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(
                    stream_expected_data[self.VALUES],
                    key=lambda row: (row[replication_column] is not None,
                                     row[replication_column]))
            ]

            # Verify all data is correct for incremental
            for expected_row, actual_row in list(
                    zip(expected_messages, records_by_stream[stream]['messages'][1:-1])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")
                    self.assertEqual(len(expected_row["data"].keys()),
                                     len(actual_row["data"].keys()),
                                     msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row["data"].items():
                        self.assertEqual(expected_value, actual_row["data"][column_name],
                                         msg="expected: {} != actual {}".format(
                                             expected_row, actual_row))

            print("records are correct for stream {}".format(stream))
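            # For incremental replication the bookmark checked below has no log
            # version or full-table flag; its expected shape is roughly
            # (illustrative, inferred from the assertions that follow):
            #   {'replication_key_value': <max replication key>,
            #    'version': <table version>}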
            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(state.get('currently_syncing'),
                              msg="expected state's currently_syncing to be None")
            self.assertIsNone(bookmark.get('current_log_version'),
                              msg="no log_version for incremental")
            self.assertIsNone(bookmark.get('initial_full_table_complete'),
                              msg="no full table for incremental")

            # find the max value of the replication key
            self.assertEqual(bookmark['replication_key_value'],
                             max([row[replication_column]
                                  for row in stream_expected_data[self.VALUES]
                                  if row[replication_column] is not None]))
            # self.assertEqual(bookmark['replication_key'], 'replication_key_column')

            self.assertEqual(bookmark['version'], table_version[stream],
                             msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas, records_by_stream[stream]['schema']))

    # ----------------------------------------------------------------------
    # invoke the sync job AGAIN after inserting, updating, and deleting rows
    # ----------------------------------------------------------------------
    database_name = "constraints_database"
    schema_name = "dbo"
    table_name = "no_constraints"
    column_name = ["replication_key_column"]
    insert_value = [(49, )]
    update_value = [(3, )]
    delete_value = [(0, )]
    query_list = (insert(database_name, schema_name, table_name, insert_value))
    query_list.extend(delete_by_pk(database_name, schema_name, table_name,
                                   delete_value, column_name))
    query_list.extend([
        "UPDATE constraints_database.dbo.no_constraints "
        "SET replication_key_column = 3 "
        "WHERE replication_key_column = 1"])
    mssql_cursor_context_manager(*query_list)
    self.EXPECTED_METADATA["constraints_database_dbo_no_constraints"]["values"] = \
        [(2, )] + insert_value + update_value

    database_name = "constraints_database"
    schema_name = "dbo"
    table_name = "multiple_column_pk"
    column_name = ["first_name", "last_name", "replication_key_column"]
    insert_value = [("Brian", "Lampkin", 72)]
    update_value = [("Sergey", "Brin", 65)]
    delete_value = [("Larry", "Page")]
    query_list = (insert(database_name, schema_name, table_name, insert_value))
    query_list.extend(delete_by_pk(database_name, schema_name, table_name,
                                   delete_value, column_name[:2]))
    query_list.extend(update_by_pk(database_name, schema_name, table_name,
                                   update_value, column_name))
    mssql_cursor_context_manager(*query_list)
    self.EXPECTED_METADATA["constraints_database_dbo_multiple_column_pk"]["values"] = \
        [("Tim", "Berners-Lee", 64)] + insert_value + update_value

    # duplicative of other testing
    # table_name = "single_column_pk"
    # column_name = ["pk", "replication_key_column"]
    # insert_value = [(3, 49)]
    # update_value = [(1, 65)]
    # delete_value = [(0,)]
    # query_list = (insert(database_name, schema_name, table_name, insert_value))
    # query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
    # query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
    # mssql_cursor_context_manager(*query_list)
    # insert_value = [insert_value[0] + (None,)]
    # update_value = [update_value[0] + (None,)]
    # delete_value = [delete_value[0] + (None, datetime.utcnow())]
    # self.EXPECTED_METADATA["constraints_database_dbo_single_column_pk"]["values"] = \
    #     insert_value + delete_value + update_value
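    # Note: the pk_with_fk insert below includes a row with a NULL replication
    # key, (6, None); it is dropped from the expected values (insert_value[:-1])
    # because an incremental sync only picks up rows whose replication key is
    # populated and at or above the bookmarked value.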
"replication_key_column"] insert_value = [(5, 2), (6, None)] delete_value = [(1,), (2,)] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_pk_with_fk"]["values"] = \ [(0, 1), (3, 1)] + insert_value[:-1] table_name = "pk_with_unique_not_null" column_name = ["pk", "replication_key_column"] insert_value = [(3, 49)] update_value = [(1, 65)] delete_value = [(0,)] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_pk_with_unique_not_null"]["values"] = \ [(2, 5)] + insert_value + update_value # update expected datafor VIEW_WITH_JOIN view self.EXPECTED_METADATA["constraints_database_dbo_view_with_join"]["values"] = \ [(None, None, 4), (2, 5, 5), (None, None, 6)] table_name = "default_column" column_name = ["pk", "replication_key_column"] insert_value = [(3, 49), (4, None), (5, )] update_value = [(1, 65)] query_list = (insert(database_name, schema_name, table_name, insert_value[:2])) query_list.extend(insert(database_name, schema_name, table_name, insert_value[-1:], column_names=column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_default_column"]["values"] = [ (0, -1)] + [(3, 49), (5, -1)] + update_value table_name = "check_constraint" column_name = ["pk", "replication_key_column"] insert_value = [(3, 49)] update_value = [(1, 65)] query_list = (insert(database_name, schema_name, table_name, insert_value)) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) self.EXPECTED_METADATA["constraints_database_dbo_check_constraint"]["values"] = \ [(0, 37)] + insert_value + update_value table_name = "even_identity" column_name = ["pk", "replication_key_column"] insert_value = [(3,)] update_value = [(2,)] delete_value = [(1,)] query_list = (insert(database_name, schema_name, table_name, insert_value, column_names=column_name[:1])) query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1])) query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name)) mssql_cursor_context_manager(*query_list) insert_value = [insert_value[0] + (6, )] update_value = [update_value[0] + (4, )] self.EXPECTED_METADATA["constraints_database_dbo_even_identity"]["values"] = \ insert_value + update_value sync_job_name = runner.run_sync_mode(self, conn_id) # verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id()) expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()} self.assertEqual(record_count_by_stream, expected_count) records_by_stream = runner.get_records_from_target_output() for stream in 
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            stream_expected_data = self.expected_metadata()[stream]
            new_table_version = records_by_stream[stream]['table_version']

            # verify on a subsequent sync you get the activate version message only after all data
            self.assertEqual(records_by_stream[stream]['messages'][0]['action'], 'activate_version')
            self.assertEqual(records_by_stream[stream]['messages'][-1]['action'], 'activate_version')
            self.assertTrue(all(
                [message["action"] == "upsert"
                 for message in records_by_stream[stream]['messages'][1:-1]]
            ))
            self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]),
                             len(stream_expected_data[self.VALUES]),
                             msg="incorrect number of upserts")

            column_names = [
                list(field_data.keys())[0]
                for field_data in stream_expected_data[self.FIELDS]
            ]
            replication_column = column_names.index("replication_key_column")
            expected_messages = [
                {
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names, row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(
                    stream_expected_data[self.VALUES],
                    key=lambda row: (row[replication_column] is not None,
                                     row[replication_column]))
            ]

            # remove sequences from actual values for comparison
            [message.pop("sequence")
             for message in records_by_stream[stream]['messages'][1:-1]]

            # Verify all data is correct
            for expected_row, actual_row in list(
                    zip(expected_messages, records_by_stream[stream]['messages'][1:-1])):
                with self.subTest(expected_row=expected_row):
                    self.assertEqual(actual_row["action"], "upsert")
                    self.assertEqual(len(expected_row["data"].keys()),
                                     len(actual_row["data"].keys()),
                                     msg="there are not the same number of columns")
                    for column_name, expected_value in expected_row["data"].items():
                        self.assertEqual(expected_value, actual_row["data"][column_name],
                                         msg="expected: {} != actual {}".format(
                                             expected_row, actual_row))

            print("records are correct for stream {}".format(stream))

            # verify state and bookmarks
            state = menagerie.get_state(conn_id)
            bookmark = state['bookmarks'][stream]

            self.assertIsNone(state.get('currently_syncing'),
                              msg="expected state's currently_syncing to be None")
            self.assertIsNone(bookmark.get('current_log_version'),
                              msg="no log_version for incremental")
            self.assertIsNone(bookmark.get('initial_full_table_complete'),
                              msg="no full table for incremental")

            # find the max value of the replication key
            self.assertEqual(bookmark['replication_key_value'],
                             max([row[replication_column]
                                  for row in stream_expected_data[self.VALUES]
                                  if row[replication_column] is not None]))
            # self.assertEqual(bookmark['replication_key'], 'replication_key_column')

            self.assertEqual(bookmark['version'], table_version[stream],
                             msg="expected bookmark for stream to match version")
            self.assertEqual(bookmark['version'], new_table_version,
                             msg="expected bookmark for stream to match version")

            expected_schemas = self.expected_metadata()[stream]['schema']
            self.assertEqual(records_by_stream[stream]['schema'],
                             expected_schemas,
                             msg="expected: {} != actual: {}".format(
                                 expected_schemas, records_by_stream[stream]['schema']))
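
# ----------------------------------------------------------------------
# The expected_messages built above order rows NULLs-first with a two-part
# sort key. A minimal, standalone illustration of that trick (plain Python,
# the row values are made up for the example):
rows = [(3, 49), (6, None), (0, 1)]
replication_key_index = 1
ordered = sorted(rows, key=lambda row: (row[replication_key_index] is not None,
                                        row[replication_key_index]))
# ordered == [(6, None), (0, 1), (3, 49)]: False sorts before True, so rows
# with a NULL replication key come first and the rest sort by key value.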
def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are correct")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        for k in self.expected_pks()[c['stream_name']]:
            mdata = next((m for m in catalog_entry['metadata']
                          if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None)
            print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
            self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')
        connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()
    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        for record_keys in record_messages:
            # the symmetric difference with the expected automatic fields should be empty
            self.assertEqual(
                record_keys,
                self.expected_automatic_fields().get(stream_name, set()))
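
# ----------------------------------------------------------------------
# Sketch only: one way a test class could derive the automatic-field sets the
# assertions above rely on, by reading the discovered catalog metadata. The
# helper name and return shape are assumptions for illustration, not the
# suite's actual expected_automatic_fields() implementation.
def sketch_automatic_fields(catalog_entry):
    """Return the set of field names whose metadata marks inclusion 'automatic'."""
    fields = set()
    for entry in catalog_entry.get('metadata', []):
        breadcrumb = entry.get('breadcrumb', [])
        if len(breadcrumb) == 2 and entry.get('metadata', {}).get('inclusion') == 'automatic':
            fields.add(breadcrumb[1])
    return fields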