def setUp(self):
    """Reset the database and seed the indexed collections for this test.

    After this runs, ``simple_db`` holds ``simple_coll_1`` (50 documents)
    and ``simple_coll_2`` (100 documents), each with an ascending index on
    ``date_field``, plus one 50-document ``simple_coll_<key_name>``
    collection per entry of ``self.key_names()``, indexed on that key.
    """
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # Drop all dbs/collections
        drop_all_collections(client)

        simple_db = client["simple_db"]
        coll_1 = simple_db["simple_coll_1"]
        coll_2 = simple_db["simple_coll_2"]

        # Add simple collections: 50 documents and 100 documents
        coll_1.insert_many(generate_simple_coll_docs(50))
        coll_2.insert_many(generate_simple_coll_docs(100))

        # Add index on date_field
        for collection in (coll_1, coll_2):
            collection.create_index([("date_field", pymongo.ASCENDING)])

        # Add one indexed collection per key type
        for key_name in self.key_names():
            keyed_coll = simple_db["simple_coll_{}".format(key_name)]
            keyed_coll.insert_many(generate_simple_coll_docs(50))
            keyed_coll.create_index([(key_name, pymongo.ASCENDING)])
def setUp(self):
    """Reset the database and seed ``simple_db.simple_coll_1`` with 50 documents."""
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # Drop all dbs/collections
        drop_all_collections(client)

        # simple_coll_1 has 50 documents
        seed_docs = generate_simple_coll_docs(50)
        client["simple_db"]["simple_coll_1"].insert_many(seed_docs)
def setUp(self):
    """Reset the database and seed two 50-document collections.

    ``simple_coll_1`` is filled from ``generate_simple_coll_docs`` and
    ``simple_coll_2`` from ``generate_simple_binary_coll_docs``.
    """
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # Drop all dbs/collections
        drop_all_collections(client)

        simple_db = client["simple_db"]

        # simple_coll_1 has 50 documents
        # NOTE(review): the original comment says the id is an integer
        # instead of an ObjectId - confirm against generate_simple_coll_docs.
        simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

        # simple_coll_2 also has 50 documents, built by the binary generator
        # (presumably with binary ids - confirm against the generator).
        simple_db["simple_coll_2"].insert_many(
            generate_simple_binary_coll_docs(50))
def setUp(self):
    """Reset the database and seed ``simple_coll_1`` (50 docs) and ``simple_coll_2`` (100 docs)."""
    ensure_environment_variables_set()

    with get_test_connection() as client:
        drop_all_collections(client)

        simple_db = client["simple_db"]

        # Collection name -> number of documents, inserted in this order.
        for coll_name, doc_count in (("simple_coll_1", 50),
                                     ("simple_coll_2", 100)):
            simple_db[coll_name].insert_many(
                generate_simple_coll_docs(doc_count))
def setUp(self):
    """Reset the database and build the fixtures used by the full-table test.

    Creates, in order:
      * ``simple_db.simple_coll_1`` with 50 documents
      * ``simple_db.simple_view_1``, a view on ``simple_coll_1`` with an
        empty pipeline
      * ``simple_db.simple_coll_2`` with 100 documents
      * ``admin.admin_coll_1`` with 50 documents
      * ``simple_db.simple_coll_3``, an empty collection
      * ``simple_db.simple_coll_4`` with five documents: three with
        non-ASCII keys/values, one nested 100 levels deep, and one with
        1600 top-level fields
    """
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # drop all dbs/collections
        drop_all_collections(client)

        simple_db = client["simple_db"]

        # simple_coll_1 has 50 documents
        simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

        # create view on simple_coll_1
        create_view_cmd = bson.son.SON([
            ("create", "simple_view_1"),
            ("viewOn", "simple_coll_1"),
            ("pipeline", []),
        ])
        simple_db.command(create_view_cmd)

        # simple_coll_2 has 100 documents
        simple_db["simple_coll_2"].insert_many(generate_simple_coll_docs(100))

        # admin_coll_1 has 50 documents
        client["admin"]["admin_coll_1"].insert_many(
            generate_simple_coll_docs(50))

        # simple_coll_3 is an empty collection
        simple_db.create_collection("simple_coll_3")

        # simple_coll_4 has documents with special chars and a lot of nesting
        coll_4 = simple_db["simple_coll_4"]
        special_char_docs = (
            {"hebrew_ישרא": "hebrew_ישרא"},
            {"hebrew_ישרא": 2},
            {"another_hebrew_ישראל": "another_hebrew_ישרא"},
        )
        for special_doc in special_char_docs:
            coll_4.insert_one(special_doc)

        # Build field0 -> field1 -> ... -> field100, where the innermost
        # field100 finally holds a string instead of another dict.
        nested_doc = {"field0": {}}
        cursor = nested_doc
        for depth in range(1, 101):
            parent_key = "field{}".format(depth - 1)
            cursor[parent_key]["field{}".format(depth)] = {}
            cursor = cursor[parent_key]
        cursor["field100"] = "some_value"
        coll_4.insert_one(nested_doc)

        # One very wide document: col_0 .. col_1599
        max_col_doc = {'col_{}'.format(x): x for x in range(1600)}
        coll_4.insert_one(max_col_doc)
def test_run(self):
    """Verify two consecutive FULL_TABLE syncs of every discovered stream.

    Steps:
      1. Run discovery and check the discovered stream ids.
      2. Select every stream with ``replication-method: FULL_TABLE`` and
         sync; expect ``activate_version`` as first and last message and a
         completed, versioned bookmark per stream.
      3. Update one document and insert two more in three collections and
         sync again; expect the new row counts, ``activate_version`` only
         as the last message, and a strictly newer version per stream that
         matches the ``table_version`` seen in the target output.
    """
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # ----------- Discovery ----------
    # -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    # -------------------------------------------
    # ----------- First full Table Sync ---------
    # -------------------------------------------

    # Select every discovered stream and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{"breadcrumb": [],
                          "metadata": {'replication-method': 'FULL_TABLE'}}]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # run full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # check exit status
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # streams that we synced are the ones that we expect to see
    records_by_stream = runner.get_records_from_target_output()
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # assert that we get the correct number of records for each stream
    self.assertEqual(self.expected_row_counts(), record_count_by_stream)

    # assert that an activate_version message is the first and last message
    # sent for each stream
    for stream_name in self.expected_sync_streams():
        messages = records_by_stream[stream_name]['messages']
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('activate_version', messages[-1]['action'])

    state = menagerie.get_state(conn_id)

    first_versions = {}
    for tap_stream_id in self.expected_check_streams():
        bookmark = state['bookmarks'][tap_stream_id]

        # state has an initial_full_table_complete == True
        self.assertTrue(bookmark['initial_full_table_complete'])

        # there is a version bookmark in state
        first_versions[tap_stream_id] = bookmark['version']
        self.assertIsNotNone(first_versions[tap_stream_id])

    # -------------------------------------------
    # ----------- Second full Table Sync ---------
    # -------------------------------------------
    with get_test_connection() as client:
        touched_collections = (
            (client["simple_db"]["simple_coll_1"], 999),
            (client["simple_db"]["simple_coll_2"], 888),
            (client["admin"]["admin_coll_1"], 777),
        )

        # update an existing document in each collection to make sure we
        # get the updates as well in the next sync
        for collection, new_int_value in touched_collections:
            doc_to_update = collection.find_one()
            collection.find_one_and_update(
                {"_id": doc_to_update["_id"]},
                {"$set": {"int_field": new_int_value}})

        # add 2 rows and run full table again, make sure we get
        # initial number + 2
        for collection, _ in touched_collections:
            collection.insert_many(generate_simple_coll_docs(2))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # check exit status
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # collect the messages written to the target by the second sync
    records_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we
    # expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    state = menagerie.get_state(conn_id)

    # Verify that no stream is marked as currently syncing: the key is
    # read directly, so it must be present with a value of None
    self.assertIsNone(state['currently_syncing'])

    # Verify that menagerie state does not include a key for oplog based
    # syncing
    self.assertNotIn('oplog', state)

    # assert that we have the correct number of records (including the two
    # new records and the update, which is resynced). simple_coll_3 and
    # simple_coll_4 were not modified, so their counts are pinned.
    untouched = ['simple_db_simple_coll_3', 'simple_db_simple_coll_4']
    new_expected_row_counts = {
        k: v + 2
        for k, v in self.expected_row_counts().items()
        if k not in untouched
    }
    new_expected_row_counts['simple_db_simple_coll_3'] = 0
    new_expected_row_counts['simple_db_simple_coll_4'] = 5
    self.assertEqual(new_expected_row_counts, record_count_by_stream)

    # assert that we only have an ActivateVersionMessage as the last
    # message and not the first
    for stream_name in self.expected_sync_streams():
        messages = records_by_stream[stream_name]['messages']
        failure_msg = stream_name + " failed"
        if len(messages) > 1:
            self.assertNotEqual('activate_version', messages[0]['action'],
                                failure_msg)
            self.assertEqual('upsert', messages[0]['action'], failure_msg)
        self.assertEqual('activate_version', messages[-1]['action'],
                         failure_msg)

    second_versions = {}
    stream_name_by_id = self.tap_stream_id_to_stream()
    for tap_stream_id in self.expected_check_streams():
        bookmark = state['bookmarks'][tap_stream_id]

        # state has an initial_full_table_complete == True
        self.assertTrue(bookmark['initial_full_table_complete'])

        # version bookmark
        second_versions[tap_stream_id] = bookmark['version']
        self.assertIsNotNone(second_versions[tap_stream_id])

        # version in this state is different than that of the previous
        # state
        self.assertNotEqual(first_versions[tap_stream_id],
                            second_versions[tap_stream_id])

        # version which is larger than the previous target version
        self.assertGreater(second_versions[tap_stream_id],
                           first_versions[tap_stream_id])

        # verify that menagerie state does include the version which
        # matches the target version
        target_stream = stream_name_by_id[tap_stream_id]
        self.assertEqual(records_by_stream[target_stream]['table_version'],
                         second_versions[tap_stream_id])
def test_run(self):
    """Verify LOG_BASED sync of ``simple_coll_1`` ignores other collections.

    Runs discovery, selects only ``simple_db-simple_coll_1`` with
    ``replication-method: LOG_BASED`` and performs the initial sync. It then
    inserts one document into ``simple_coll_1`` and syncs again, changes
    only the unselected ``simple_coll_2`` and syncs a final time. The last
    sync must emit exactly one ``simple_coll_1`` record and leave the
    oplog bookmark equal to the newest oplog entry.
    """
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # ----------- Discovery ----------
    # -------------------------------

    # run in discovery mode and verify its exit codes
    check_job_name = runner.run_check_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    discovered_ids = {c['tap_stream_id'] for c in found_catalogs}
    self.assertEqual(self.expected_check_streams(), discovered_ids)

    for tap_stream_id in self.expected_check_streams():
        found_stream = [
            c for c in found_catalogs
            if c['tap_stream_id'] == tap_stream_id
        ][0]
        stream_name = found_stream['stream_name']

        # assert that the pks are correct
        discovered_pks = set(
            found_stream.get('metadata', {}).get('table-key-properties'))
        self.assertEqual(self.expected_pks()[stream_name], discovered_pks)

        # assert that the row counts are correct
        discovered_row_count = found_stream.get('metadata',
                                                {}).get('row-count')
        self.assertEqual(self.expected_row_counts()[stream_name],
                         discovered_row_count)

    # -----------------------------------
    # ----------- Initial Full Table ---------
    # -----------------------------------

    # Select simple_coll_1 and add replication method metadata
    additional_md = [{
        "breadcrumb": [],
        "metadata": {'replication-method': 'LOG_BASED'},
    }]
    for stream_catalog in found_catalogs:
        if stream_catalog['tap_stream_id'] != 'simple_db-simple_coll_1':
            continue
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # collect what the sync wrote to the target
    records_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we
    # expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # Verify that the full table was synced
    tap_stream_id = 'simple_db-simple_coll_1'
    self.assertGreaterEqual(record_count_by_stream['simple_coll_1'],
                            self.expected_row_counts()['simple_coll_1'])

    # Verify that we have 'initial_full_table_complete' bookmark
    state = menagerie.get_state(conn_id)
    first_versions = {}
    coll_1_bookmark = state['bookmarks'][tap_stream_id]

    # assert that the state has an initial_full_table_complete == True
    self.assertTrue(coll_1_bookmark['initial_full_table_complete'])

    # assert that there is a version bookmark in state
    first_versions[tap_stream_id] = coll_1_bookmark['version']
    self.assertIsNotNone(first_versions[tap_stream_id])

    # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark
    for oplog_key in ('oplog_ts_time', 'oplog_ts_inc'):
        self.assertIsNotNone(state['bookmarks'][tap_stream_id][oplog_key])

    # Insert records to coll_1 to get the bookmark to be a ts on coll_1
    with get_test_connection() as client:
        client["simple_db"]["simple_coll_1"].insert_one({
            "int_field": 101,
            "string_field": random_string_generator()
        })
    sync_job_name = runner.run_sync_mode(self, conn_id)

    changed_ids = set()
    with get_test_connection() as client:
        # Make changes to not selected collection
        coll_2 = client['simple_db']['simple_coll_2']

        # delete two documents, remembering their ids first
        for int_value in (0, 1):
            changed_ids.add(coll_2.find({'int_field': int_value})[0]['_id'])
            coll_2.delete_one({'int_field': int_value})

        # update two documents, remembering their ids first
        for int_value in (98, 99):
            changed_ids.add(coll_2.find({'int_field': int_value})[0]['_id'])
            coll_2.update_one({'int_field': int_value},
                              {'$set': {'int_field': -1}})

        # insert two documents, then look up the ids they were given
        for int_value in (100, 101):
            coll_2.insert_one({
                "int_field": int_value,
                "string_field": random_string_generator()
            })
            changed_ids.add(coll_2.find({'int_field': int_value})[0]['_id'])

    # -----------------------------------
    # ----------- Subsequent Oplog Sync ---------
    # -----------------------------------

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # keep only the upsert messages emitted for simple_coll_1
    messages_by_stream = runner.get_records_from_target_output()
    coll_1_upserts = []
    for message in messages_by_stream['simple_coll_1']['messages']:
        if message.get('action') == 'upsert':
            coll_1_upserts.append(message)
    records_by_stream = {'simple_coll_1': coll_1_upserts}

    # assert that each of the streams that we synced are the ones that we
    # expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # 1 record due to fencepost querying on oplog ts
    self.assertEqual(1, record_count_by_stream['simple_coll_1'])

    final_state = menagerie.get_state(conn_id)

    # the bookmark must point at the most recent oplog entry
    with get_test_connection() as client:
        row = client.local.oplog.rs.find_one(
            sort=[('$natural', pymongo.DESCENDING)])
        latest_oplog_ts = row.get('ts')

    final_bookmark = final_state['bookmarks']['simple_db-simple_coll_1']
    self.assertEqual(
        (latest_oplog_ts.time, latest_oplog_ts.inc),
        (final_bookmark['oplog_ts_time'], final_bookmark['oplog_ts_inc']))
def test_run(self):
    """Verify that an interrupted FULL_TABLE sync resumes from its bookmark.

    After discovery every stream is selected with
    ``replication-method: FULL_TABLE``. A state is then synthesized as if
    the sync had been interrupted halfway through each collection
    (``last_id_fetched`` is the middle ``_id``, ``max_id_value`` the last
    one). Documents before and after the resume point are updated, the
    state is installed and a sync is run. The test checks that:

      * only documents from the resume point through ``max_id_value`` are
        emitted, followed by a trailing ``activate_version`` message;
      * updates made before the resume point are not emitted, while
        updates made after it are;
      * the final state no longer carries ``last_id_fetched`` /
        ``max_id_value`` and every bookmark is marked complete.
    """
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # ----------- Discovery ----------
    # -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    # -----------------------------------
    # ----------- Full Table Sync ---------
    # -----------------------------------

    # Select every discovered stream and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {'replication-method': 'FULL_TABLE'},
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # Synthesize interrupted state
    interrupted_state = {
        'currently_syncing': 'simple_db-simple_coll_1',
        'bookmarks': {}
    }

    with get_test_connection() as client:
        simple_db = client['simple_db']

        for stream_name in self.expected_sync_streams():
            rows = list(simple_db[stream_name].find(
                sort=[("_id", pymongo.ASCENDING)]))

            # set last_id_fetched to middle point of table
            last_id_fetched = str(rows[len(rows) // 2]['_id'])
            max_id_value = str(rows[-1]['_id'])

            interrupted_state['bookmarks']['simple_db-' + stream_name] = {
                'max_id_value': max_id_value,
                'max_id_type': 'ObjectId',
                'initial_full_table_complete': False,
                'last_id_fetched': last_id_fetched,
                'last_id_fetched_type': 'ObjectId',
                'version': int(time.time() * 1000)
            }

        # These updates run once, after the loop: repeating them per stream
        # would fail because the int_field values they look up are
        # overwritten by the first pass.

        # Update documents before the resume point; they must NOT come up
        # in the sync. find_one() is expected to return the first document
        # of the collection.
        doc_to_update_1 = simple_db["simple_coll_1"].find_one()
        simple_db["simple_coll_1"].find_one_and_update(
            {"_id": doc_to_update_1["_id"]}, {"$set": {"int_field": 999}})

        doc_to_update_2 = simple_db["simple_coll_2"].find_one()
        simple_db["simple_coll_2"].find_one_and_update(
            {"_id": doc_to_update_2["_id"]}, {"$set": {"int_field": 888}})

        # Update documents after the resume point; they MUST come up in
        # the sync.
        doc_to_update_3 = simple_db["simple_coll_1"].find_one(
            {"int_field": 30})
        simple_db["simple_coll_1"].find_one_and_update(
            {"_id": doc_to_update_3["_id"]}, {"$set": {"int_field": 777}})

        doc_to_update_4 = simple_db["simple_coll_2"].find_one(
            {"int_field": 80})
        simple_db["simple_coll_2"].find_one_and_update(
            {"_id": doc_to_update_4["_id"]}, {"$set": {"int_field": 666}})

    menagerie.set_state(conn_id, interrupted_state)
    runner.run_sync_mode(self, conn_id)

    # streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # record counts
    records_by_stream = runner.get_records_from_target_output()
    self.assertEqual(self.expected_row_counts(), record_count_by_stream)

    # ActivateVersionMessage as the last message and not the first
    for stream_name in self.expected_sync_streams():
        messages = records_by_stream[stream_name]['messages']
        self.assertNotEqual('activate_version', messages[0]['action'])
        self.assertEqual('activate_version', messages[-1]['action'])

    coll_1_messages = records_by_stream['simple_coll_1']['messages']
    coll_2_messages = records_by_stream['simple_coll_2']['messages']
    coll_1_bookmark = interrupted_state['bookmarks']['simple_db-simple_coll_1']
    coll_2_bookmark = interrupted_state['bookmarks']['simple_db-simple_coll_2']

    # _id of the first record sync'd for each stream is the bookmarked
    # last_id_fetched from the interrupted_state passed to the tap
    self.assertEqual(coll_1_messages[0]['data']['_id'],
                     coll_1_bookmark['last_id_fetched'])
    self.assertEqual(coll_2_messages[0]['data']['_id'],
                     coll_2_bookmark['last_id_fetched'])

    # _id of the last record sync'd for each stream is the bookmarked
    # max_id_value from the interrupted_state passed to the tap
    # (index -2 because the last message is the activate_version)
    self.assertEqual(coll_1_messages[-2]['data']['_id'],
                     coll_1_bookmark['max_id_value'])
    self.assertEqual(coll_2_messages[-2]['data']['_id'],
                     coll_2_bookmark['max_id_value'])

    # verify we are not seeing any documents which were updated having
    # id < interrupted id value; first check just the first document
    self.assertNotEqual(999, coll_1_messages[0]['data']['int_field'])
    self.assertNotEqual(888, coll_2_messages[0]['data']['int_field'])

    # ... then every record of each stream. The trailing activate_version
    # message is skipped because it has no 'data'.
    self.assertFalse(any(
        int(message['data']['int_field']) == 999
        for message in coll_1_messages[:-1]))
    # The 888 update was made in simple_coll_2, so that stream's records
    # are the ones to scan.
    self.assertFalse(any(
        message['data']['int_field'] == 888
        for message in coll_2_messages[:-1]))

    # verify we are seeing the documents which were updated having
    # id > interrupted id value. The sync resumes at the middle document,
    # so the document with int_field 30 is at index 5 of simple_coll_1
    # (resume point 25) and the one with int_field 80 is at index 30 of
    # simple_coll_2 (resume point 50).
    self.assertEqual(777, coll_1_messages[5]['data']['int_field'])
    self.assertEqual(666, coll_2_messages[30]['data']['int_field'])

    # assert that final state has no last_id_fetched and max_id_value
    # bookmarks
    final_state = menagerie.get_state(conn_id)
    for tap_stream_id in self.expected_check_streams():
        final_bookmark = final_state['bookmarks'][tap_stream_id]
        self.assertIsNone(final_bookmark.get('last_id_fetched'))
        self.assertIsNone(final_bookmark.get('max_id_value'))

    # every bookmark in the final state is marked as fully synced
    for stream_bookmarks in final_state.get('bookmarks', {}).values():
        self.assertTrue(
            stream_bookmarks.get('initial_full_table_complete', False))
def test_run(self):
    """Verify resuming an interrupted FULL_TABLE sync with int and bytes ids.

    After discovery every stream is selected with
    ``replication-method: FULL_TABLE``. A hand-written interrupted state
    resumes ``simple_coll_1`` at integer id 25 (max 49) and
    ``simple_coll_2`` at the base64 of ``"test 25"`` (max ``"test 49"``).
    Documents before and after the resume point are updated, the state is
    installed and a sync is run. The test checks that the sync starts at
    the bookmarked id, ends at the max id, skips updates made before the
    resume point, includes updates made after it, and clears the
    ``last_id_fetched`` / ``max_id_value`` bookmarks.
    """
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # ----------- Discovery ----------
    # -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    for tap_stream_id in self.expected_check_streams():
        found_stream = [c for c in found_catalogs
                        if c['tap_stream_id'] == tap_stream_id][0]
        stream_name = found_stream['stream_name']
        stream_metadata = found_stream.get('metadata', {})

        # assert that the pks are correct
        self.assertEqual(self.expected_pks()[stream_name],
                         set(stream_metadata.get('table-key-properties')))

        # assert that the row counts are correct
        self.assertEqual(self.expected_row_counts()[stream_name],
                         stream_metadata.get('row-count'))

    # -----------------------------------
    # ----------- Full Table Sync ---------
    # -----------------------------------

    # select every discovered stream and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{"breadcrumb": [],
                          "metadata": {'replication-method': 'FULL_TABLE'}}]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # synthesize interrupted state
    interrupted_state = {
        'currently_syncing': 'simple_db-simple_coll_1',
        'bookmarks': {
            'simple_db-simple_coll_1': {
                'max_id_value': 49,
                'max_id_type': 'int',
                'initial_full_table_complete': False,
                'last_id_fetched': 25,
                'last_id_fetched_type': 'int',
                'version': int(time.time() * 1000)
            },
            'simple_db-simple_coll_2': {
                'max_id_value': base64.b64encode("test {}".format(49).encode()),
                'max_id_type': 'bytes',
                'initial_full_table_complete': False,
                'last_id_fetched': base64.b64encode("test {}".format(25).encode()),
                'last_id_fetched_type': 'bytes',
                'version': int(time.time() * 1000)
            }
        }
    }

    with get_test_connection() as client:
        simple_db = client["simple_db"]

        # Update documents before the resume point (id < 25); they must
        # NOT come up in the sync. find_one() is expected to return the
        # first document of the collection.
        doc_to_update_1 = simple_db["simple_coll_1"].find_one()
        simple_db["simple_coll_1"].find_one_and_update(
            {"_id": doc_to_update_1["_id"]}, {"$set": {"int_field": 999}})

        doc_to_update_2 = simple_db["simple_coll_2"].find_one()
        simple_db["simple_coll_2"].find_one_and_update(
            {"_id": doc_to_update_2["_id"]}, {"$set": {"int_field": 888}})

        # Update documents after the resume point (id > 25); they MUST
        # come up in the sync.
        doc_to_update_3 = simple_db["simple_coll_1"].find_one(
            {"int_field": 30})
        simple_db["simple_coll_1"].find_one_and_update(
            {"_id": doc_to_update_3["_id"]}, {"$set": {"int_field": 777}})

        doc_to_update_4 = simple_db["simple_coll_2"].find_one(
            {"int_field": 40})
        simple_db["simple_coll_2"].find_one_and_update(
            {"_id": doc_to_update_4["_id"]}, {"$set": {"int_field": 666}})

    menagerie.set_state(conn_id, interrupted_state)
    runner.run_sync_mode(self, conn_id)

    # streams that we synced are the ones that we expect to see
    records_by_stream = runner.get_records_from_target_output()
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # ActivateVersionMessage as the last message and not the first
    for stream_name in self.expected_sync_streams():
        messages = records_by_stream[stream_name]['messages']
        self.assertNotEqual('activate_version', messages[0]['action'])
        self.assertEqual('activate_version', messages[-1]['action'])

    coll_1_messages = records_by_stream['simple_coll_1']['messages']
    coll_2_messages = records_by_stream['simple_coll_2']['messages']
    coll_1_bookmark = interrupted_state['bookmarks']['simple_db-simple_coll_1']

    # _id of the first record sync'd is the bookmarked last_id_fetched
    # from the interrupted_state passed to the tap
    self.assertEqual(coll_1_messages[0]['data']['_id'],
                     int(coll_1_bookmark['last_id_fetched']))

    # _id of the last record sync'd is the bookmarked max_id_value from
    # the interrupted_state passed to the tap (index -2 because the last
    # message is the activate_version)
    self.assertEqual(coll_1_messages[-2]['data']['_id'],
                     int(coll_1_bookmark['max_id_value']))

    # verify we are not seeing any documents which were updated having
    # id < 25; first check just the first document
    self.assertNotEqual(999, coll_1_messages[0]['data']['int_field'])
    self.assertNotEqual(888, coll_2_messages[0]['data']['int_field'])

    # ... then every record of each stream. The trailing activate_version
    # message is skipped because it has no 'data'.
    self.assertFalse(any(
        int(message['data']['int_field']) == 999
        for message in coll_1_messages[:-1]))
    # The 888 update was made in simple_coll_2, so that stream's records
    # are the ones to scan.
    self.assertFalse(any(
        message['data']['int_field'] == 888
        for message in coll_2_messages[:-1]))

    # verify we are seeing the documents which were updated having id > 25.
    # The sync resumes at document 25, so the documents with int_field 30
    # and 40 sit at indexes 5 and 15 of the emitted messages.
    self.assertEqual(777, coll_1_messages[5]['data']['int_field'])
    self.assertEqual(666, coll_2_messages[15]['data']['int_field'])

    # assert that final state has no last_id_fetched and max_id_value
    # bookmarks
    final_state = menagerie.get_state(conn_id)
    for tap_stream_id in self.expected_check_streams():
        final_bookmark = final_state['bookmarks'][tap_stream_id]
        self.assertIsNone(final_bookmark.get('last_id_fetched'))
        self.assertIsNone(final_bookmark.get('max_id_value'))
def setUp(self):
    """Reset the database and insert one document covering many BSON types.

    ``datatype_db.datatype_coll_1`` receives a single document with one
    field per BSON datatype under test. A second collection,
    ``invalid_datatype_coll``, is filled through ``run_mongodb_javascript``
    with a date of year 0000.
    """
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # Drop all dbs/collections
        drop_all_collections(client)

        # Build the BSON regex from a compiled Python pattern, then toggle
        # the re.UNICODE bit on the resulting flags.
        regex = bson.Regex.from_native(re.compile('.*'))
        regex.flags ^= re.UNICODE

        nested_object = {
            "obj_field_1_key": "obj_field_1_val",
            "obj_field_2_key": "obj_field_2_val"
        }
        string_array = ["array_item_1", "array_item_2", "array_item_3"]
        scoped_js = bson.code.Code("function incrementX() { x++; }",
                                   scope={"x": 1})
        db_reference = bson.dbref.DBRef(
            "some_collection",
            bson.objectid.ObjectId(b'123456789123'),
            database='some_database')

        # Add datatype collections (field order is kept as-is)
        datatype_doc = {
            "double_field": 4.3,
            "string_field": "a sample string",
            "object_field": nested_object,
            "array_field": string_array,
            "binary_data_field": bson.Binary(b"a binary string"),
            "object_id_field": bson.objectid.ObjectId(b'123456789123'),
            "boolean_field": True,
            "date_field": datetime.datetime(2019, 8, 15, 19, 29, 14, 578000),
            "null_field": None,
            "regex_field": regex,
            "32_bit_integer_field": 32,
            "timestamp_field": bson.timestamp.Timestamp(1565897157, 1),
            "64_bit_integer_field": 34359738368,
            "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
            "javaScript_field": bson.code.Code("var x, y, z;"),
            "javaScript_with_scope_field": scoped_js,
            "min_key_field": bson.min_key.MinKey,
            "max_key_field": bson.max_key.MaxKey,
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec72820c4'),
            "dbref_field": db_reference
        }

        client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc)

        # NB: Insert an invalid datetime to confirm that works correctly
        invalid_date_script = "db.invalid_datatype_coll.insert({ \"date_field\": new ISODate(\"0000-01-01T00:00:00.000Z\") });"
        run_mongodb_javascript("datatype_db", invalid_date_script)
def test_run(self):
    """Verify a FULL_TABLE sync serializes every BSON datatype as expected.

    Runs discovery and checks primary keys and row counts, selects every
    stream with ``replication-method: FULL_TABLE`` and syncs. It then
    checks the record counts, the ``activate_version`` messages at both
    ends of each stream and the resulting bookmarks. Finally the single
    ``datatype_coll_1`` record emitted to the target is compared
    field-by-field with the expected serialized form.
    """
    conn_id = connections.ensure_connection(self)

    # -------------------------------
    # ----------- Discovery ----------
    # -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    for tap_stream_id in self.expected_check_streams():
        found_stream = [
            c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id
        ][0]
        stream_name = found_stream['stream_name']
        stream_metadata = found_stream.get('metadata', {})

        # assert that the pks are correct
        self.assertEqual(self.expected_pks()[stream_name],
                         set(stream_metadata.get('table-key-properties')))

        # assert that the row counts are correct
        self.assertEqual(self.expected_row_counts()[stream_name],
                         stream_metadata.get('row-count'))

    # -----------------------------------
    # ----------- Full Table Sync ---------
    # -----------------------------------

    # Select every discovered stream and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {'replication-method': 'FULL_TABLE'},
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # run full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # collect the messages written to the target
    records_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we
    # expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # assert that we get the correct number of records for each stream
    self.assertEqual(self.expected_row_counts(), record_count_by_stream)

    # assert that an activate_version message is the first and last message
    # sent for each stream
    for stream_name in self.expected_sync_streams():
        messages = records_by_stream[stream_name]['messages']
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('activate_version', messages[-1]['action'])

    state = menagerie.get_state(conn_id)

    first_versions = {}
    for tap_stream_id in self.expected_check_streams():
        bookmark = state['bookmarks'][tap_stream_id]

        # assert that the state has an initial_full_table_complete == True
        self.assertTrue(bookmark['initial_full_table_complete'])

        # assert that there is a version bookmark in state
        first_versions[tap_stream_id] = bookmark['version']
        self.assertIsNotNone(first_versions[tap_stream_id])

    # The _id is generated by MongoDB on insert, so read it back to build
    # the expected record.
    with get_test_connection() as client:
        stored_docs = list(client['datatype_db']['datatype_coll_1'].find())
        record_id = str(stored_docs[0]['_id'])

    expected_record = {
        "javaScript_field": "var x, y, z;",
        "timestamp_field": "2019-08-15T19:25:57.000000Z",
        "_id": record_id,
        "date_field": "2019-08-15T19:29:14.578000Z",
        "string_field": "a sample string",
        "object_field": {
            "obj_field_2_key": "obj_field_2_val",
            "obj_field_1_key": "obj_field_1_val"
        },
        "null_field": None,
        "regex_field": {
            "flags": 0,
            "pattern": ".*"
        },
        "object_id_field": "313233343536373839313233",
        "64_bit_integer_field": 34359738368,
        "32_bit_integer_field": 32,
        "array_field": ["array_item_1", "array_item_2", "array_item_3"],
        "binary_data_field": "YSBiaW5hcnkgc3RyaW5n",
        "javaScript_with_scope_field": {
            "scope": "{'x': 1}",
            "value": "function incrementX() { x++; }"
        },
        "double_field": decimal.Decimal('4.3'),
        "boolean_field": True,
        "decimal_field": decimal.Decimal('1.34'),
        'uuid_field': "3e139ff5-d622-45c6-bf9e-1dfec72820c4",
        "dbref_field": {
            "id": "313233343536373839313233",
            "database": "some_database",
            "collection": "some_collection"
        }
    }

    # messages[0] is the activate_version message (asserted above), so the
    # record itself is at index 1. assertEqual replaces the deprecated
    # assertEquals alias, which no longer exists on Python 3.12+.
    self.assertEqual(
        expected_record,
        records_by_stream['datatype_coll_1']['messages'][1]['data'])
def modify_database(self):
    """Apply a fixed set of deletes, updates and inserts to the simple collections.

    For both ``simple_coll_1`` and ``simple_coll_2`` in ``simple_db`` this
    deletes the documents whose ``int_field`` is 0 and 1, sets ``int_field``
    to -1 on two existing documents, and inserts two new documents.
    All deletes run first, then all updates, then all inserts.
    """
    # (collection name, int_field values to update, int_field values to insert)
    change_plan = (
        ("simple_coll_1", (48, 49), (50, 51)),
        ("simple_coll_2", (98, 99), (100, 101)),
    )

    with get_test_connection() as client:
        simple_db = client["simple_db"]

        # Delete two documents for each collection
        for coll_name, _, _ in change_plan:
            for doomed_value in (0, 1):
                simple_db[coll_name].delete_one({'int_field': doomed_value})

        # Update two documents for each collection
        for coll_name, values_to_update, _ in change_plan:
            for current_value in values_to_update:
                simple_db[coll_name].update_one(
                    {'int_field': current_value},
                    {'$set': {'int_field': -1}},
                )

        # Insert two documents for each collection
        for coll_name, _, values_to_insert in change_plan:
            for new_value in values_to_insert:
                simple_db[coll_name].insert_one({
                    "int_field": new_value,
                    "string_field": random_string_generator(),
                })
def test_run(self):
    """Validate key-based INCREMENTAL replication across discovery and four syncs.

    Flow:
      1. Discovery: check the discovered streams and, per stream, the
         primary keys, row count and valid replication keys.
      2. Initial sync: select every stream as INCREMENTAL, then check the
         persisted schema, the row counts and the bookmark contents.
      3. Second sync: push replication-key values forward and insert new
         documents, then expect 4 records per stream.
      4. Sync after updating a non-replication-key field: expect 1 record
         per stream and that the updated document was not emitted.
      5. Final sync: change the replication method / key and compare the
         row counts with ``expected_last_sync_row_counts()``.

    Streams keyed on uuid and string are excluded from the record-count and
    id checks of steps 3 and 4 (see the TODO notes below).
    """
    # Workaround for not including collections for uuid and string,
    # TODO : look for a solution to implement string and uuid as replication_key
    unsupported_key_streams = ('simple_coll_uuid_field', 'simple_coll_string_field')

    conn_id = connections.ensure_connection(self)

    #  -------------------------------
    # -----------  Discovery ----------
    #  -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalog(conn_id)
    found_catalogs = menagerie.get_catalogs(conn_id)
    found_streams = {entry['tap_stream_id'] for entry in catalog['streams']}
    self.assertSetEqual(self.expected_check_streams(), found_streams)

    # verify the tap discovered stream metadata is consistent with the source database
    for tap_stream_id in self.expected_check_streams():
        with self.subTest(stream=tap_stream_id):

            # gather expectations
            stream = tap_stream_id.split('-')[1]
            expected_primary_key = self.expected_pks()[stream]
            expected_row_count = self.expected_row_counts()[stream]
            expected_replication_keys = self.expected_valid_replication_keys()[stream]

            # gather results
            found_stream = [entry for entry in catalog['streams']
                            if entry['tap_stream_id'] == tap_stream_id][0]
            # the stream-level metadata entry is the one with an empty breadcrumb
            stream_metadata = [entry['metadata'] for entry in found_stream['metadata']
                               if entry['breadcrumb'] == []][0]
            primary_key = set(stream_metadata.get('table-key-properties'))
            row_count = stream_metadata.get('row-count')
            replication_key = set(stream_metadata.get('valid-replication-keys'))

            # assert that the pks are correct
            self.assertSetEqual(expected_primary_key, primary_key)

            # assert that the row counts are correct
            self.assertEqual(expected_row_count, row_count)

            # assert that valid replication keys are correct
            self.assertSetEqual(replication_key, expected_replication_keys)

    # -----------------------------------
    # ----------- Initial Sync ---------
    # -----------------------------------

    # Select every stream and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])

        # Default to date_field; a stream whose name contains one of the key
        # names replicates on that key. There is deliberately no `break`:
        # when several key names are substrings of the stream name, the
        # last one returned by key_names() wins.
        rep_key = 'date_field'
        for key in self.key_names():
            if key in stream_catalog['stream_name']:
                rep_key = key

        additional_md = [{
            "breadcrumb": [],
            "metadata": {'replication-method': 'INCREMENTAL', 'replication-key': rep_key},
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # gather expectations
    expected_schema = {'type': 'object'}

    for tap_stream_id in self.expected_sync_streams():
        with self.subTest(stream=tap_stream_id):

            # gather results
            persisted_schema = messages_by_stream[tap_stream_id]['schema']

            # assert the schema is an object
            self.assertDictEqual(expected_schema, persisted_schema)

    # verify that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # verify that the entire collection was synced by comparing row counts against the source
    for tap_stream_id in self.expected_sync_streams():
        with self.subTest(stream=tap_stream_id):
            expected_row_count = self.expected_row_counts()[tap_stream_id]
            row_count = record_count_by_stream[tap_stream_id]
            self.assertEqual(expected_row_count, row_count)

    # verify state is saved in the proper format for all streams
    state = menagerie.get_state(conn_id)
    expected_state_keys = {
        'last_replication_method',
        'replication_key_name',
        'replication_key_type',
        'replication_key_value',
        'version',
    }
    for tap_stream_id in self.expected_check_streams():
        with self.subTest(stream=tap_stream_id):
            bookmark = state['bookmarks'][tap_stream_id]

            # gather expectations
            stream = tap_stream_id.split('-')[1]
            expected_replication_keys = self.expected_valid_replication_keys()[stream]

            # gather results
            replication_key = bookmark['replication_key_name']
            replication_key_type = bookmark['replication_key_type']

            # assert that all expected bookmark keys are present
            self.assertSetEqual(expected_state_keys, set(bookmark.keys()))

            # assert all bookmark keys have values
            for key in expected_state_keys:
                self.assertIsNotNone(bookmark[key])

            # assert incremental sync was performed
            self.assertEqual('INCREMENTAL', bookmark['last_replication_method'])

            # assert the replication key was used to save state
            self.assertIn(replication_key, expected_replication_keys)

            # assert the replication key type is a valid datatype
            self.assertIn(replication_key_type, VALID_REPLICATION_TYPES)

    self.assertIsNone(state['currently_syncing'])

    # -----------------------------------
    # ------------ Second Sync ----------
    # -----------------------------------

    # Perform data manipulations
    with get_test_connection() as client:
        simple_db = client["simple_db"]

        # update 1 document in each of the collection
        for coll_name in ("simple_coll_1", "simple_coll_2"):
            doc_to_touch = simple_db[coll_name].find_one()
            simple_db[coll_name].find_one_and_update(
                {"_id": doc_to_touch["_id"]},
                {"$set": {"date_field": datetime(2020, 1, 1, 19, 29, 14, 578000)}})

        for key_name in self.key_names():
            keyed_coll = simple_db["simple_coll_{}".format(key_name)]

            if key_name in ('int_field', 'double_field', '64_bit_int_field'):
                # numeric keys: take the document holding the current maximum
                # (descending sort) and go 3 beyond it
                max_doc = keyed_coll.find_one(sort=[(key_name, -1)])
                new_value = max_doc[key_name] + 3
            elif key_name == 'date_field':
                new_value = datetime(2021, 1, 1, 15, 30, 14, 222000)
            elif key_name == 'timestamp_field':
                new_value = bson.timestamp.Timestamp(1565897157+99, 1)
            else:
                # TODO : figure out how to update collections with replication key = string, uuid
                continue

            # the new value is written onto the document find_one() returns by default
            doc_to_touch = keyed_coll.find_one()
            keyed_coll.find_one_and_update(
                {"_id": doc_to_touch["_id"]},
                {"$set": {key_name: new_value}})

        # insert two documents with date_field > bookmark for next sync
        simple_db["simple_coll_1"].insert_one({
            "int_field": 50,
            "string_field": z_string_generator(),
            "date_field": datetime(2018, 9, 13, 19, 29, 14, 578000),
            "double_field": 51.001,
            "timestamp_field": bson.timestamp.Timestamp(1565897157+50, 1),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282050'),
            "64_bit_int_field": 34359738368 + 50
        })
        simple_db["simple_coll_1"].insert_one({
            "int_field": 51,
            "string_field": z_string_generator(),
            "date_field": datetime(2018, 9, 18, 19, 29, 14, 578000),
            "double_field": 52.001,
            "timestamp_field": bson.timestamp.Timestamp(1565897157+51, 1),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282051'),
            "64_bit_int_field": 34359738368 + 51
        })

        simple_db["simple_coll_2"].insert_one({
            "int_field": 100,
            "string_field": z_string_generator(),
            "date_field": datetime(2019, 5, 21, 19, 29, 14, 578000),
            "double_field": 101.001,
            "timestamp_field": bson.timestamp.Timestamp(1565897157+100, 1),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282100'),
            "64_bit_int_field": 34359738368 + 100
        })
        simple_db["simple_coll_2"].insert_one({
            "int_field": 101,
            "string_field": z_string_generator(),
            "date_field": datetime(2019, 5, 26, 19, 29, 14, 578000),
            "double_field": 102.001,
            "timestamp_field": bson.timestamp.Timestamp(1565897157+101, 1),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282101'),
            "64_bit_int_field": 34359738368 + 101
        })

        for key_name in self.key_names():
            keyed_coll = simple_db["simple_coll_{}".format(key_name)]
            keyed_coll.insert_one({
                "int_field": 50,
                "string_field": z_string_generator(50),
                "date_field": datetime(2018, 9, 13, 19, 29, 15, 578000),
                "double_field": 51.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+50, 1),
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282050'),
                "64_bit_int_field": 34359738368 + 50
            })
            keyed_coll.insert_one({
                "int_field": 51,
                "string_field": z_string_generator(51),
                "date_field": datetime(2018, 9, 18, 19, 29, 16, 578000),
                "double_field": 52.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+51, 1),
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282051'),
                "64_bit_int_field": 34359738368 + 51
            })

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # keep only the upsert messages of every synced stream
    messages_by_stream = runner.get_records_from_target_output()
    records_by_stream = {}
    for stream_name in self.expected_sync_streams():
        records_by_stream[stream_name] = [
            x for x in messages_by_stream[stream_name]['messages']
            if x.get('action') == 'upsert'
        ]

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # Verify that we got 4 records for each stream (2 because of the new records,
    # 1 because of update and 1 because of greater than equal [for key based
    # incremental there will always be an overlap on the bookmark value])
    for stream_name, record_count in record_count_by_stream.items():
        if stream_name not in unsupported_key_streams:
            self.assertEqual(4, record_count)

    # Verify that the int_field values of the records sent are the same set
    # as the ones expected for the documents changed
    for stream_name in self.expected_sync_streams():
        if stream_name not in unsupported_key_streams:
            actual = {x['data']['int_field'] for x in records_by_stream[stream_name]}
            self.assertEqual(self.expected_incremental_int_fields()[stream_name], actual)

    ##############################################################################
    # Verify that data is not replicated when non replication key is updated
    ##############################################################################

    # Use a dedicated connection for this step so it does not depend on a
    # client handle from an earlier `with` block that has already exited.
    with get_test_connection() as client:
        # Sampling a document from a collection which we know it exists because of the data set up
        no_rep_doc_coll_1 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 20})
        client["simple_db"]["simple_coll_1"].find_one_and_update(
            {"_id": no_rep_doc_coll_1["_id"]},
            {"$set": {"string_field": 'No_replication'}})

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    messages_by_stream = runner.get_records_from_target_output()
    upserts_simple_coll_1 = [
        x for x in messages_by_stream['simple_coll_1']['messages']
        if x.get('action') == 'upsert'
    ]

    # Verify that no record emitted for simple_coll_1 corresponds to the
    # document we updated. Both sides are normalised to str so the check
    # stays meaningful even if the emitted `_id` is serialised differently
    # from the value pymongo returns (comparing the raw values could then
    # never be equal, making the assertion vacuous).
    untouched_id = str(no_rep_doc_coll_1["_id"])
    emitted_ids = {str(x['data']['_id']) for x in upserts_simple_coll_1}
    self.assertNotIn(untouched_id, emitted_ids)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # Verify that we got 1 record for each stream (1 because of greater than equal
    # [for key based incremental there will always be an overlap on the bookmark value])
    for stream_name, record_count in record_count_by_stream.items():
        if stream_name not in unsupported_key_streams:
            self.assertEqual(1, record_count)

    # -----------------------------------
    # ------------ Third Sync -----------
    # -----------------------------------
    # Change the replication method for simple_coll_1
    # Change the replication key for simple_coll_2
    # Make sure both do full resync
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])

        if stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_1':
            stream_md = {'replication-method': 'LOG_BASED'}
        elif stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_2':
            stream_md = {'replication-method': 'INCREMENTAL',
                         'replication-key': 'timestamp_field'}
        else:
            # per-key collections keep replicating on the key embedded in their name
            stream_md = {'replication-method': 'INCREMENTAL',
                         'replication-key': stream_catalog['stream_name'].replace('simple_coll_', '')}

        additional_md = [{"breadcrumb": [], "metadata": stream_md}]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertDictEqual(record_count_by_stream, self.expected_last_sync_row_counts())
def setUp(self):
    """Reset the test server and load the fixtures used by this test case.

    After dropping everything, this creates:
      * generated documents in ``simple_db``, ``admin``, ``simple_db_2``
        and ``special_db`` collections,
      * the view ``simple_view_1`` on ``simple_db.simple_coll_1``,
      * one document covering many BSON types in ``datatype_coll_1`` and
        ``datatype_coll_2`` of ``datatype_db``, plus four single-field
        ascending indexes on ``datatype_coll_2``.
    """
    ensure_environment_variables_set()

    with get_test_connection() as client:

        def seed(db_name, coll_name, doc_count):
            # fill one collection with `doc_count` generated documents
            client[db_name][coll_name].insert_many(generate_simple_coll_docs(doc_count))

        # drop all dbs/collections
        drop_all_collections(client)

        seed("simple_db", "simple_coll_1", 50)
        seed("simple_db", "simple_coll_2", 100)
        seed("admin", "admin_coll_1", 50)

        # create view on simple_coll_1
        view_command = bson.son.SON([
            ("create", "simple_view_1"),
            ("viewOn", "simple_coll_1"),
            ("pipeline", []),
        ])
        client["simple_db"].command(view_command)

        # collections with same names as others in different dbs
        seed("simple_db_2", "simple_coll_1", 50)
        seed("simple_db_2", "SIMPLE_COLL_1", 50)

        # collections with special characters in names
        seed("special_db", "hebrew_ישראל", 50)
        seed("special_db", "hello!world?", 50)

        # Add datatype collections
        regex = bson.Regex.from_native(re.compile('.*'))
        # toggle the UNICODE bit on the flags produced by from_native
        regex.flags ^= re.UNICODE

        datatype_doc = {
            "double_field": 4.3,
            "string_field": "a sample string",
            "object_field": {
                "obj_field_1_key": "obj_field_1_val",
                "obj_field_2_key": "obj_field_2_val",
            },
            "array_field": [
                "array_item_1",
                "array_item_2",
                "array_item_3",
            ],
            "binary_data_field": b"a binary string",
            "object_id_field": bson.objectid.ObjectId(b'123456789123'),
            "boolean_field": True,
            "date_field": datetime.datetime.now(),
            "null_field": None,
            "regex_field": regex,
            "32_bit_integer_field": 32,
            "timestamp_field": bson.timestamp.Timestamp(int(time.time()), 1),
            "64_bit_integer_field": 34359738368,
            "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
            "javaScript_field": bson.code.Code("var x, y, z;"),
            "javaScript_with_scope_field": bson.code.Code(
                "function incrementX() { x++; }", scope={"x": 1}),
            "min_key_field": bson.min_key.MinKey,
            "max_key_field": bson.max_key.MaxKey,
        }

        datatype_db = client["datatype_db"]

        # the very same dict object is inserted into both collections
        for coll_name in ("datatype_coll_1", "datatype_coll_2"):
            datatype_db[coll_name].insert_one(datatype_doc)

        # one ascending single-field index per candidate key on datatype_coll_2
        for indexed_field in ("date_field",
                              "timestamp_field",
                              "32_bit_integer_field",
                              "64_bit_integer_field"):
            datatype_db["datatype_coll_2"].create_index([(indexed_field, pymongo.ASCENDING)])
def test_run(self):
    """Validate INCREMENTAL replication around uncommitted and committed transactions.

    Flow:
      1. Session 1: load ``simple_coll_1`` (10 docs) and ``simple_coll_2``
         (20 docs) and commit.
      2. Session 2: update one document in each of those collections and
         insert 5 documents into the new ``simple_coll_3`` inside a
         transaction that is left open; run discovery and sync 1 and check
         that the uncommitted changes are not replicated.
      3. Commit session 2, run sync 2 and check that those changes are now
         replicated.
      4. Session 3: delete the bookmarked document of every stream, commit,
         run sync 3 and check that the expected row counts match and that
         the state is unchanged by that sync.
    """
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # drop all dbs/collections
        drop_all_collections(client)

        #################
        # Session 1
        #################
        # Create session 1 and insert docs to simple_coll_1 & simple_coll_2
        session1 = client.start_session()
        session1.start_transaction()

        # simple_coll_1 has 10 documents
        client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(10))

        # simple_coll_2 has 20 documents
        client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(20))

        session1.commit_transaction()

        ################
        # Session 2
        ################
        # create empty collection
        # update documents in simple_coll_1 & simple_coll_2 and tie to session 2
        # insert documents in simple_coll_3 and tie to session 2
        # execute the sync with uncommitted changes
        # validate that the uncommitted changes are not replicated by the sync
        session2 = client.start_session()
        session2.start_transaction()

        # simple_coll_3 is an empty collection
        client["simple_db"].create_collection("simple_coll_3")

        # update document from coll 1 and coll 2
        client["simple_db"]["simple_coll_1"].update_one(
            {"int_field": 5}, {"$set": {"int_field": 11}}, session=session2)
        client["simple_db"]["simple_coll_2"].update_one(
            {"int_field": 10}, {"$set": {"int_field": 21}}, session=session2)

        # insert document to coll 3
        client["simple_db"]["simple_coll_3"].insert_many(
            generate_simple_coll_docs(5), session=session2)

        # deletes do not matter in incremental replication, invalid scenario to test

        conn_id = connections.ensure_connection(self)

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams which includes all collections
        # which are part of session1 and session2
        self.assertEqual(self.expected_check_streams_sync_1(),
                         {c['tap_stream_id'] for c in found_catalogs})

        # Select streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'INCREMENTAL',
                    # hyphenated, matching 'replication-method' above; the
                    # underscore spelling is a different metadata key
                    'replication-key': 'int_field'
                }
            }]
            connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        # run first sync
        sync_1 = runner.run_sync_mode(self, conn_id)

        # check exit status
        exit_status = menagerie.get_exit_status(conn_id, sync_1)
        menagerie.verify_sync_exit_status(self, exit_status, sync_1)

        # streams that we synced are the ones that we expect to see
        records_by_stream = runner.get_records_from_target_output()
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams_1(), self.expected_pks_1())

        # validate the record count in collections which are part of session1 and
        # session2, should not read updates on coll 1 and coll 2 and insert on
        # coll 3. Because the transaction is not committed
        self.assertEqual(self.expected_row_counts_sync_1(), record_count_by_stream)

        # validate there are no duplicates replicated as part of sync1:
        # collect the int_field of every upsert, per stream, in emission order
        pk_dict_2 = {
            stream: [
                message['data']['int_field']
                for message in records_by_stream[stream]['messages']
                if message.get('action') == 'upsert'
            ]
            for stream in self.expected_sync_streams_1()
        }
        self.assertEqual(self.expected_pk_values_2(), pk_dict_2)

        session2.commit_transaction()

        ################
        # Session 3
        ################
        # Execute another sync
        # Validate that the documents committed as part of session 2 should
        # now be replicated in sync_2
        session3 = client.start_session()
        session3.start_transaction()

        # Run 2nd sync
        sync_2 = runner.run_sync_mode(self, conn_id)
        exit_status_2 = menagerie.get_exit_status(conn_id, sync_2)
        menagerie.verify_sync_exit_status(self, exit_status_2, sync_2)

        records_by_stream_2 = runner.get_records_from_target_output()
        record_count_by_stream_2 = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams_2(), self.expected_pks_2())

        # validate that we see the updates to coll 1 and coll 2 and insert to coll 3 in the 2nd sync
        # we see 2 records for coll 1 and coll 2, 1 record for update and the other
        # record for the bookmarked record
        self.assertEqual(self.expected_row_counts_sync_2(), record_count_by_stream_2)

        # validate there are no duplicates replicated as part of sync 2
        pk_dict_3 = {
            stream: [
                message['data']['int_field']
                for message in records_by_stream_2[stream]['messages']
                if message.get('action') == 'upsert'
            ]
            for stream in self.expected_sync_streams_1()
        }
        self.assertEqual(self.expected_pk_values_3(), pk_dict_3)

        # Test case to validate tap behaviour when we delete bookmarked document and run sync
        state_2 = menagerie.get_state(conn_id)

        for stream in self.expected_check_streams_sync_1():
            rep_key_value = state_2['bookmarks'][stream]['replication_key_value']

            # tap_stream_id has the form "<database>-<collection>"
            database, _, collection = stream.partition('-')

            client[database][collection].delete_one(
                {"int_field": int(rep_key_value)}, session=session3)

        session3.commit_transaction()

        # Execute the sync, after the commit on session 3
        # Session 3 commits includes deleting the bookmarked value in each of the collection
        # Validate the state does not change after deleting the bookmarked value
        # Validate that the sync does not replicate any documents
        sync_3 = runner.run_sync_mode(self, conn_id)
        exit_status_3 = menagerie.get_exit_status(conn_id, sync_3)
        menagerie.verify_sync_exit_status(self, exit_status_3, sync_3)

        record_count_by_stream_3 = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams_2(), self.expected_pks_2())

        # validate the record counts after deleting the bookmarked value on each of the collection
        self.assertEqual(self.expected_row_counts_sync_3(), record_count_by_stream_3)

        # validate that the state value has not changed after deleting the bookmarked
        # value in each collection; the state must be read AFTER sync 3, otherwise
        # it is compared with itself
        state_3 = menagerie.get_state(conn_id)
        self.assertEqual(state_2, state_3)
def test_run(self):
    """Validate LOG_BASED replication: an initial full-table sync, then an oplog sync.

    Flow:
      1. Discovery: check the discovered streams, their primary keys and
         row counts.
      2. Initial sync with every stream selected as LOG_BASED: check the
         row counts and that each bookmark carries
         ``initial_full_table_complete``, ``version``, ``oplog_ts_time``
         and ``oplog_ts_inc``.
      3. Delete, update and insert two documents in each of
         ``simple_coll_1`` and ``simple_coll_2``, remembering the ``_id``
         of every document touched.
      4. Second sync: expect at least 6 records per stream, exactly 2
         records per collection flagged with ``_sdc_deleted_at``, and the
         emitted ``_id`` set to equal the set of touched ``_id`` values.
    """
    conn_id = connections.ensure_connection(self)

    #  -------------------------------
    # -----------  Discovery ----------
    #  -------------------------------

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = menagerie.get_catalogs(conn_id)

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(),
                     {c['tap_stream_id'] for c in found_catalogs})

    for tap_stream_id in self.expected_check_streams():
        found_stream = [c for c in found_catalogs
                        if c['tap_stream_id'] == tap_stream_id][0]
        stream_name = found_stream['stream_name']
        discovered_md = found_stream.get('metadata', {})

        # assert that the pks are correct
        self.assertEqual(self.expected_pks()[stream_name],
                         set(discovered_md.get('table-key-properties')))

        # assert that the row counts are correct
        self.assertEqual(self.expected_row_counts()[stream_name],
                         discovered_md.get('row-count'))

    #  -----------------------------------
    # -----------  Initial Full Table ---------
    #  -----------------------------------
    # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
    for stream_catalog in found_catalogs:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {'replication-method': 'LOG_BASED'}
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema, additional_md)

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # Verify that the full table was synced
    for tap_stream_id in self.expected_sync_streams():
        self.assertGreaterEqual(record_count_by_stream[tap_stream_id],
                                self.expected_row_counts()[tap_stream_id])

    # Verify that we have 'initial_full_table_complete' bookmark
    state = menagerie.get_state(conn_id)
    for tap_stream_id in self.expected_check_streams():
        bookmark = state['bookmarks'][tap_stream_id]

        # assert that the state has an initial_full_table_complete == True
        self.assertTrue(bookmark['initial_full_table_complete'])

        # assert that there is a version bookmark in state
        self.assertIsNotNone(bookmark['version'])

        # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark
        self.assertIsNotNone(bookmark['oplog_ts_time'])
        self.assertIsNotNone(bookmark['oplog_ts_inc'])

    # (collection name, int_field values to update, int_field values to insert)
    change_plan = (
        ("simple_coll_1", (48, 49), (50, 51)),
        ("simple_coll_2", (98, 99), (100, 101)),
    )
    tracked_collections = tuple(coll_name for coll_name, _, _ in change_plan)

    # _id of every document deleted, updated or inserted below
    changed_ids = set()

    with get_test_connection() as client:
        simple_db = client['simple_db']

        # Delete two documents for each collection
        for coll_name in tracked_collections:
            for doomed_value in (0, 1):
                # remember the _id before the document disappears
                changed_ids.add(
                    simple_db[coll_name].find_one({'int_field': doomed_value})['_id'])
                simple_db[coll_name].delete_one({'int_field': doomed_value})

        # Update two documents for each collection
        for coll_name, values_to_update, _ in change_plan:
            for current_value in values_to_update:
                # look the document up first: after the update its
                # int_field is -1 and no longer identifies it
                changed_ids.add(
                    simple_db[coll_name].find_one({'int_field': current_value})['_id'])
                simple_db[coll_name].update_one(
                    {'int_field': current_value},
                    {'$set': {'int_field': -1}})

        # Insert two documents for each collection
        for coll_name, _, values_to_insert in change_plan:
            for new_value in values_to_insert:
                insert_result = simple_db[coll_name].insert_one({
                    "int_field": new_value,
                    "string_field": random_string_generator()
                })
                # the insert result already carries the new _id; no need to query it back
                changed_ids.add(insert_result.inserted_id)

    #  -----------------------------------
    # -----------  Subsequent Oplog Sync ---------
    #  -----------------------------------

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # keep only the upsert messages of every synced stream
    messages_by_stream = runner.get_records_from_target_output()
    records_by_stream = {
        stream_name: [
            x for x in messages_by_stream[stream_name]['messages']
            if x.get('action') == 'upsert'
        ]
        for stream_name in self.expected_sync_streams()
    }

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())

    # Verify that we got at least 6 records due to changes
    # (could be more due to overlap in gte oplog clause)
    for record_count in record_count_by_stream.values():
        self.assertGreaterEqual(record_count, 6)

    # Verify that we got 2 records with _SDC_DELETED_AT
    for coll_name in tracked_collections:
        deleted_records = [
            x['data'] for x in records_by_stream[coll_name]
            if x['data'].get('_sdc_deleted_at')
        ]
        self.assertEqual(2, len(deleted_records))

    # Verify that the _id of the records sent are the same set as the
    # _ids of the documents changed
    actual = {
        ObjectId(x['data']['_id'])
        for coll_name in tracked_collections
        for x in records_by_stream[coll_name]
    }
    self.assertEqual(changed_ids, actual)