def setUp(self):
    ensure_environment_variables_set()

    with get_test_connection() as client:
        ############# Drop all dbs/collections #############
        drop_all_collections(client)

        ############# Add simple collections #############
        # simple_coll_1 has 50 documents
        client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

        # simple_coll_2 has 100 documents
        client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100))

        ############# Add index on date_field #############
        client["simple_db"]["simple_coll_1"].create_index([("date_field", pymongo.ASCENDING)])
        client["simple_db"]["simple_coll_2"].create_index([("date_field", pymongo.ASCENDING)])

        # add a simple_coll per key type, with an index on the key field
        for key_name in self.key_names():
            client["simple_db"]["simple_coll_{}".format(key_name)].insert_many(generate_simple_coll_docs(50))
            client["simple_db"]["simple_coll_{}".format(key_name)].create_index([(key_name, pymongo.ASCENDING)])
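# ---------------------------------------------------------------------------
# Note: generate_simple_coll_docs is provided by the shared test harness and
# is not defined in this section. A minimal sketch of the assumed shape — a
# sequential int_field (used as the replication key in the incremental tests
# below) plus a random string_field; the exact field set is an assumption,
# not the harness's actual implementation:
import random
import string

def _generate_simple_coll_docs_sketch(num_docs):
    """Illustrative stand-in for the harness's generate_simple_coll_docs."""
    docs = []
    for int_value in range(num_docs):
        docs.append({
            "int_field": int_value,  # assumed replication/primary-key field
            "string_field": ''.join(random.choices(string.ascii_letters, k=8)),
        })
    return docs
# ---------------------------------------------------------------------------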
def setUp(self):
    ensure_environment_variables_set()

    with get_test_connection() as client:
        ############# Drop all dbs/collections #############
        drop_all_collections(client)

        ############# Add simple collections #############
        # simple_coll_1 has 50 documents
        client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50))
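# ---------------------------------------------------------------------------
# Note: get_test_connection and ensure_environment_variables_set are harness
# helpers not shown in this section. A minimal sketch, assuming the connection
# details are supplied via TAP_MONGODB_* environment variables (the variable
# names are an assumption based on the tap's naming convention):
import os
import pymongo

def _get_test_connection_sketch():
    """Illustrative stand-in; pymongo.MongoClient supports use as a context manager."""
    return pymongo.MongoClient(
        host=os.getenv('TAP_MONGODB_HOST'),
        port=int(os.getenv('TAP_MONGODB_PORT', '27017')),
        username=os.getenv('TAP_MONGODB_USER'),
        password=os.getenv('TAP_MONGODB_PASSWORD'),
    )

def _ensure_environment_variables_set_sketch():
    """Illustrative stand-in: fail fast if required variables are missing."""
    missing = [v for v in ('TAP_MONGODB_HOST', 'TAP_MONGODB_PORT',
                           'TAP_MONGODB_USER', 'TAP_MONGODB_PASSWORD')
               if os.getenv(v) is None]
    if missing:
        raise Exception("Missing environment variables: {}".format(missing))
# ---------------------------------------------------------------------------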
def setUp(self):
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # drop all dbs/collections
        drop_all_collections(client)

        # simple_coll_1 has 50 documents, id is an integer instead of ObjectId
        client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

        # simple_coll_2 has 50 documents with binary data, id is an integer instead of ObjectId
        client["simple_db"]["simple_coll_2"].insert_many(generate_simple_binary_coll_docs(50))
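# ---------------------------------------------------------------------------
# Note: generate_simple_binary_coll_docs is another harness helper not shown
# here. A minimal sketch, assuming it mirrors generate_simple_coll_docs but
# adds a bson.Binary payload; the comment above suggests these docs carry an
# integer _id, but that detail lives in the harness and both the field set
# and the _id handling here are assumptions:
import bson

def _generate_simple_binary_coll_docs_sketch(num_docs):
    """Illustrative stand-in for generate_simple_binary_coll_docs."""
    docs = []
    for int_value in range(num_docs):
        docs.append({
            "int_field": int_value,
            "binary_field": bson.Binary(b"binary data %d" % int_value),
        })
    return docs
# ---------------------------------------------------------------------------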
def setUp(self):
    ensure_environment_variables_set()

    with get_test_connection() as client:
        drop_all_collections(client)

        # simple_coll_1 has 50 documents
        client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

        # simple_coll_2 has 100 documents
        client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100))
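# ---------------------------------------------------------------------------
# Note: drop_all_collections is a harness helper not shown in this section.
# A minimal sketch, assuming it resets state by dropping every non-system
# database, plus user collections seeded in the admin db (which MongoDB will
# not let you drop wholesale); both behaviours are assumptions:
def _drop_all_collections_sketch(client):
    """Illustrative stand-in for the harness's drop_all_collections."""
    for db_name in client.list_database_names():
        if db_name not in ('admin', 'config', 'local'):
            client.drop_database(db_name)
    # the tests also seed collections such as admin_coll_1 in the admin db,
    # so drop its user collections individually
    for coll_name in client['admin'].list_collection_names():
        if not coll_name.startswith('system.'):
            client['admin'][coll_name].drop()
# ---------------------------------------------------------------------------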
def setUp(self):
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # drop all dbs/collections
        drop_all_collections(client)

        # simple_coll_1 has 50 documents
        client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

        # create view on simple_coll_1
        client["simple_db"].command(bson.son.SON([("create", "simple_view_1"),
                                                  ("viewOn", "simple_coll_1"),
                                                  ("pipeline", [])]))

        # simple_coll_2 has 100 documents
        client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100))

        # admin_coll_1 has 50 documents
        client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50))

        # simple_coll_3 is an empty collection
        client["simple_db"].create_collection("simple_coll_3")

        # simple_coll_4 has documents with special chars and a lot of nesting
        client["simple_db"]["simple_coll_4"].insert_one({"hebrew_ישרא": "hebrew_ישרא"})
        client["simple_db"]["simple_coll_4"].insert_one({"hebrew_ישרא": 2})
        client["simple_db"]["simple_coll_4"].insert_one({"another_hebrew_ישראל": "another_hebrew_ישרא"})

        # build a document nested 100 levels deep (field0.field1. ... .field99),
        # with a string value at the innermost level
        nested_doc = {"field0": {}}
        current_doc = nested_doc
        for i in range(1, 101):
            current_doc["field{}".format(i - 1)]["field{}".format(i)] = {}
            current_doc = current_doc["field{}".format(i - 1)]
        current_doc["field100"] = "some_value"
        client["simple_db"]["simple_coll_4"].insert_one(nested_doc)

        # build a document with 1600 top-level fields
        max_col_doc = {}
        for x in range(1600):
            max_col_doc['col_{}'.format(x)] = x
        client["simple_db"]["simple_coll_4"].insert_one(max_col_doc)
def setUp(self):
    ensure_environment_variables_set()

    with get_test_connection() as client:
        ############# Drop all dbs/collections #############
        drop_all_collections(client)

        ############# Add datatype collections #############
        pattern = re.compile('.*')
        regex = bson.Regex.from_native(pattern)
        regex.flags ^= re.UNICODE

        datatype_doc = {
            "double_field": 4.3,
            "string_field": "a sample string",
            "object_field": {
                "obj_field_1_key": "obj_field_1_val",
                "obj_field_2_key": "obj_field_2_val"
            },
            "array_field": ["array_item_1", "array_item_2", "array_item_3"],
            "binary_data_field": bson.Binary(b"a binary string"),
            "object_id_field": bson.objectid.ObjectId(b'123456789123'),
            "boolean_field": True,
            "date_field": datetime.datetime(2019, 8, 15, 19, 29, 14, 578000),
            "null_field": None,
            "regex_field": regex,
            "32_bit_integer_field": 32,
            "timestamp_field": bson.timestamp.Timestamp(1565897157, 1),
            "64_bit_integer_field": 34359738368,
            "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
            "javaScript_field": bson.code.Code("var x, y, z;"),
            "javaScript_with_scope_field": bson.code.Code("function incrementX() { x++; }", scope={"x": 1}),
            # MinKey/MaxKey must be encoded as instances, not the classes themselves
            "min_key_field": bson.min_key.MinKey(),
            "max_key_field": bson.max_key.MaxKey(),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec72820c4'),
            "dbref_field": bson.dbref.DBRef("some_collection",
                                            bson.objectid.ObjectId(b'123456789123'),
                                            database='some_database')
        }

        client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc)

        # NB: Insert an invalid datetime to confirm it is handled correctly
        run_mongodb_javascript(
            "datatype_db",
            "db.invalid_datatype_coll.insert({ \"date_field\": new ISODate(\"0000-01-01T00:00:00.000Z\") });"
        )
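# ---------------------------------------------------------------------------
# Note: run_mongodb_javascript is a harness helper not shown in this section.
# It exists because PyMongo cannot construct some values from Python (e.g. the
# year-0 datetime above). A minimal sketch, assuming the legacy `mongo` shell
# is on PATH and unauthenticated access to the test host; both are assumptions:
import os
import subprocess

def _run_mongodb_javascript_sketch(database, js):
    """Illustrative stand-in: evaluate raw javascript against a database."""
    subprocess.run(
        ["mongo", database,
         "--host", os.getenv('TAP_MONGODB_HOST', 'localhost'),
         "--eval", js],
        check=True,
    )
# ---------------------------------------------------------------------------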
def setUp(self):
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # drop all dbs/collections
        drop_all_collections(client)

        # simple_coll_1 has 50 documents
        client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

        # simple_coll_2 has 100 documents
        client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(100))

        # admin_coll_1 has 50 documents
        client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50))

        # create view on simple_coll_1
        client["simple_db"].command(bson.son.SON([("create", "simple_view_1"),
                                                  ("viewOn", "simple_coll_1"),
                                                  ("pipeline", [])]))

        # collections with the same names as others in different dbs
        client["simple_db_2"]["simple_coll_1"].insert_many(generate_simple_coll_docs(50))
        client["simple_db_2"]["SIMPLE_COLL_1"].insert_many(generate_simple_coll_docs(50))

        # collections with special characters in names
        client["special_db"]["hebrew_ישראל"].insert_many(generate_simple_coll_docs(50))
        client["special_db"]["hello!world?"].insert_many(generate_simple_coll_docs(50))

        # add datatype collections
        pattern = re.compile('.*')
        regex = bson.Regex.from_native(pattern)
        regex.flags ^= re.UNICODE

        datatype_doc = {
            "double_field": 4.3,
            "string_field": "a sample string",
            "object_field": {
                "obj_field_1_key": "obj_field_1_val",
                "obj_field_2_key": "obj_field_2_val"
            },
            "array_field": ["array_item_1", "array_item_2", "array_item_3"],
            "binary_data_field": b"a binary string",
            "object_id_field": bson.objectid.ObjectId(b'123456789123'),
            "boolean_field": True,
            "date_field": datetime.datetime.now(),
            "null_field": None,
            "regex_field": regex,
            "32_bit_integer_field": 32,
            "timestamp_field": bson.timestamp.Timestamp(int(time.time()), 1),
            "64_bit_integer_field": 34359738368,
            "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
            "javaScript_field": bson.code.Code("var x, y, z;"),
            "javaScript_with_scope_field": bson.code.Code("function incrementX() { x++; }", scope={"x": 1}),
            # MinKey/MaxKey must be encoded as instances, not the classes themselves
            "min_key_field": bson.min_key.MinKey(),
            "max_key_field": bson.max_key.MaxKey()
        }

        client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc)
        client["datatype_db"]["datatype_coll_2"].insert_one(datatype_doc)

        # index the replication-key candidates on datatype_coll_2
        client["datatype_db"]["datatype_coll_2"].create_index([("date_field", pymongo.ASCENDING)])
        client["datatype_db"]["datatype_coll_2"].create_index([("timestamp_field", pymongo.ASCENDING)])
        client["datatype_db"]["datatype_coll_2"].create_index([("32_bit_integer_field", pymongo.ASCENDING)])
        client["datatype_db"]["datatype_coll_2"].create_index([("64_bit_integer_field", pymongo.ASCENDING)])
def test_run(self):
    ensure_environment_variables_set()

    with get_test_connection() as client:
        # drop all dbs/collections
        drop_all_collections(client)

        #################
        # Session 1
        #################
        # Create session 1 and insert docs into simple_coll_1 & simple_coll_2
        session1 = client.start_session()
        session1.start_transaction()

        # simple_coll_1 has 10 documents
        client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(10))

        # simple_coll_2 has 20 documents
        client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(20))

        session1.commit_transaction()

        ################
        # Session 2
        ################
        '''
        create an empty collection
        update documents in simple_coll_1 & simple_coll_2 and tie the updates to session 2
        insert documents into simple_coll_3 and tie the inserts to session 2
        execute the sync with uncommitted changes
        validate that the uncommitted changes are not replicated by the sync
        '''
        session2 = client.start_session()
        session2.start_transaction()

        # simple_coll_3 is an empty collection
        client["simple_db"].create_collection("simple_coll_3")

        # update a document in coll 1 and coll 2
        client["simple_db"]["simple_coll_1"].update_one(
            {"int_field": 5}, {"$set": {"int_field": 11}}, session=session2)
        client["simple_db"]["simple_coll_2"].update_one(
            {"int_field": 10}, {"$set": {"int_field": 21}}, session=session2)

        # insert documents into coll 3
        client["simple_db"]["simple_coll_3"].insert_many(
            generate_simple_coll_docs(5), session=session2)

        # deletes do not matter in incremental replication; not a valid scenario to test

        conn_id = connections.ensure_connection(self)

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams, which include all collections
        # that are part of session 1 and session 2
        self.assertEqual(self.expected_check_streams_sync_1(),
                         {c['tap_stream_id'] for c in found_catalogs})

        # select streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'INCREMENTAL',
                    'replication_key': 'int_field'
                }
            }]
            connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        # run the first sync
        sync_1 = runner.run_sync_mode(self, conn_id)

        # check exit status
        exit_status = menagerie.get_exit_status(conn_id, sync_1)
        menagerie.verify_sync_exit_status(self, exit_status, sync_1)

        # verify the streams we synced are the ones we expect to see
        records_by_stream = runner.get_records_from_target_output()
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams_1(), self.expected_pks_1())

        # validate the record count in the collections that are part of session 1
        # and session 2; because session 2's transaction is not committed, the sync
        # must not read the updates on coll 1 and coll 2 or the inserts on coll 3
        self.assertEqual(self.expected_row_counts_sync_1(), record_count_by_stream)

        # validate there are no duplicates replicated as part of sync 1
        records_2 = {}
        pk_dict_2 = {}
        for stream in self.expected_sync_streams_1():
            records_2[stream] = [x for x in records_by_stream[stream]['messages']
                                 if x.get('action') == 'upsert']
            pk_dict_2[stream] = [record['data']['int_field'] for record in records_2[stream]]

        self.assertEqual(self.expected_pk_values_2(), pk_dict_2)

        session2.commit_transaction()

        ################
        # Session 3
        ################
        '''
        execute another sync
        validate that the documents committed as part of session 2 are now
        replicated in sync_2
        '''
        session3 = client.start_session()
        session3.start_transaction()

        # run the 2nd sync
        sync_2 = runner.run_sync_mode(self, conn_id)

        exit_status_2 = menagerie.get_exit_status(conn_id, sync_2)
        menagerie.verify_sync_exit_status(self, exit_status_2, sync_2)

        records_by_stream_2 = runner.get_records_from_target_output()
        record_count_by_stream_2 = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams_2(), self.expected_pks_2())

        # validate that the 2nd sync sees the updates to coll 1 and coll 2 and the
        # inserts to coll 3; coll 1 and coll 2 each return 2 records, one for the
        # update and one for the bookmarked record
        self.assertEqual(self.expected_row_counts_sync_2(), record_count_by_stream_2)

        # validate there are no duplicates replicated as part of sync 2
        records_3 = {}
        pk_dict_3 = {}
        for stream in self.expected_sync_streams_1():
            records_3[stream] = [x for x in records_by_stream_2[stream]['messages']
                                 if x.get('action') == 'upsert']
            pk_dict_3[stream] = [record['data']['int_field'] for record in records_3[stream]]

        self.assertEqual(self.expected_pk_values_3(), pk_dict_3)

        # validate tap behaviour when the bookmarked document is deleted before a sync
        state_2 = menagerie.get_state(conn_id)
        for stream in self.expected_check_streams_sync_1():
            rep_key_value = state_2['bookmarks'][stream]['replication_key_value']
            # tap_stream_id has the form '<db>-<collection>', e.g. 'simple_db-simple_coll_1'
            collection = stream.split('-', 1)[1]
            client["simple_db"][collection].delete_one(
                {"int_field": int(rep_key_value)}, session=session3)

        session3.commit_transaction()

        '''
        execute the sync after the commit on session 3
        the session 3 commit deletes the bookmarked value in each collection
        validate the state does not change after deleting the bookmarked value
        validate that the sync does not replicate any documents
        '''
        state_3 = menagerie.get_state(conn_id)

        sync_3 = runner.run_sync_mode(self, conn_id)

        exit_status_3 = menagerie.get_exit_status(conn_id, sync_3)
        menagerie.verify_sync_exit_status(self, exit_status_3, sync_3)

        records_by_stream_3 = runner.get_records_from_target_output()
        record_count_by_stream_3 = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams_2(), self.expected_pks_2())

        # validate that 0 records are replicated because the bookmarked value was
        # deleted in each collection
        self.assertEqual(self.expected_row_counts_sync_3(), record_count_by_stream_3)

        # validate that the state value has not changed after deleting the
        # bookmarked value in each collection
        self.assertEqual(state_2, state_3)
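# ---------------------------------------------------------------------------
# Note: the expected_* helpers are defined elsewhere on this test class. As an
# illustration of what sync 1 asserts above (session 2's uncommitted changes
# are invisible), a row-count expectation would be shaped like this; the
# values are inferred from the inserts above, not copied from the harness:
def _expected_row_counts_sync_1_sketch():
    """Illustrative stand-in for the class's expected_row_counts_sync_1."""
    return {
        'simple_coll_1': 10,  # session 1 inserts only; session 2 update uncommitted
        'simple_coll_2': 20,  # session 1 inserts only; session 2 update uncommitted
        'simple_coll_3': 0,   # session 2 inserts uncommitted at sync time
    }
# ---------------------------------------------------------------------------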