Esempio n. 1
0
    def setUp(self):
        """Reset MongoDB and seed ``simple_db`` with collections and indexes.

        After dropping everything, two collections (50 and 100 generated
        documents) are created and indexed on ``date_field``; then one extra
        collection per entry of ``self.key_names()`` is created with 50
        documents and an ascending index on that key.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            simple_db = client["simple_db"]
            base_collections = (("simple_coll_1", 50), ("simple_coll_2", 100))

            # Seed the two base collections first ...
            for coll_name, doc_count in base_collections:
                simple_db[coll_name].insert_many(generate_simple_coll_docs(doc_count))

            # ... then index both of them on date_field.
            for coll_name, _ in base_collections:
                simple_db[coll_name].create_index([("date_field", pymongo.ASCENDING)])

            # One collection per key type, indexed on that key.
            for key_name in self.key_names():
                keyed_coll = simple_db["simple_coll_{}".format(key_name)]
                keyed_coll.insert_many(generate_simple_coll_docs(50))
                keyed_coll.create_index([(key_name, pymongo.ASCENDING)])
Esempio n. 2
0
    def setUp(self):
        """Reset MongoDB and seed ``simple_db.simple_coll_1`` with 50 documents."""
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            # The only fixture: 50 generated documents in simple_coll_1.
            target_coll = client["simple_db"]["simple_coll_1"]
            target_coll.insert_many(generate_simple_coll_docs(50))
Esempio n. 3
0
    def setUp(self):
        """Reset MongoDB and seed two collections in ``simple_db``.

        ``simple_coll_1`` receives 50 documents from
        ``generate_simple_coll_docs`` and ``simple_coll_2`` receives 50
        documents from ``generate_simple_binary_coll_docs``.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            simple_db = client["simple_db"]

            # 50 plain documents.
            # NOTE(review): the previous comment said ids are integers rather
            # than ObjectIds; that depends on the generator -- confirm there.
            simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

            # 50 documents from the binary generator (the previous comment
            # said 100, but the call has always requested 50).
            simple_db["simple_coll_2"].insert_many(generate_simple_binary_coll_docs(50))
Esempio n. 4
0
    def setUp(self):
        """Reset MongoDB and seed ``simple_db`` with two generated collections.

        ``simple_coll_1`` gets 50 documents and ``simple_coll_2`` gets 100.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            simple_db = client["simple_db"]
            for coll_name, doc_count in (("simple_coll_1", 50), ("simple_coll_2", 100)):
                simple_db[coll_name].insert_many(generate_simple_coll_docs(doc_count))
Esempio n. 5
0
    def setUp(self):
        """Reset MongoDB and build the fixture for this test.

        Creates, in order: ``simple_coll_1`` (50 docs), a view
        ``simple_view_1`` over it, ``simple_coll_2`` (100 docs),
        ``admin.admin_coll_1`` (50 docs), an empty ``simple_coll_3`` and
        ``simple_coll_4`` holding documents with non-ASCII keys, a document
        nested 101 levels deep and a document with 1600 top-level fields.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            simple_db = client["simple_db"]

            # 50 generated documents.
            simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

            # Pass-through view (empty pipeline) on simple_coll_1.
            create_view_cmd = bson.son.SON([
                ("create", "simple_view_1"),
                ("viewOn", "simple_coll_1"),
                ("pipeline", []),
            ])
            simple_db.command(create_view_cmd)

            # 100 generated documents.
            simple_db["simple_coll_2"].insert_many(generate_simple_coll_docs(100))

            # 50 generated documents in the admin database.
            client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50))

            # Explicitly created, left empty.
            simple_db.create_collection("simple_coll_3")

            # Documents whose keys/values contain Hebrew characters.
            special_coll = simple_db["simple_coll_4"]
            for special_doc in (
                {"hebrew_ישרא": "hebrew_ישרא"},
                {"hebrew_ישרא": 2},
                {"another_hebrew_ישראל": "another_hebrew_ישרא"},
            ):
                special_coll.insert_one(special_doc)

            # Deeply nested document, built from the innermost value outwards:
            # {"field0": {"field1": ... {"field100": "some_value"}}}
            deep_doc = "some_value"
            for level in reversed(range(101)):
                deep_doc = {"field{}".format(level): deep_doc}
            special_coll.insert_one(deep_doc)

            # Very wide document: col_0 .. col_1599, each holding its index.
            wide_doc = {'col_{}'.format(n): n for n in range(1600)}
            special_coll.insert_one(wide_doc)
Esempio n. 6
0
    def setUp(self):
        """Reset MongoDB and insert one document covering many BSON types.

        The document goes into ``datatype_db.datatype_coll_1``. A second
        collection, ``invalid_datatype_coll``, is populated through
        ``run_mongodb_javascript`` with a year-0000 date.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            # BSON regex derived from a compiled Python pattern, with the
            # UNICODE flag bit toggled.
            bson_regex = bson.Regex.from_native(re.compile('.*'))
            bson_regex.flags = bson_regex.flags ^ re.UNICODE

            # One field per data type; key order is kept as-is because it is
            # the field order of the stored document.
            all_types_doc = {
                "double_field": 4.3,
                "string_field": "a sample string",
                "object_field": {
                    "obj_field_1_key": "obj_field_1_val",
                    "obj_field_2_key": "obj_field_2_val",
                },
                "array_field": ["array_item_1", "array_item_2", "array_item_3"],
                "binary_data_field": bson.Binary(b"a binary string"),
                "object_id_field": bson.objectid.ObjectId(b'123456789123'),
                "boolean_field": True,
                "date_field": datetime.datetime(2019, 8, 15, 19, 29, 14, 578000),
                "null_field": None,
                "regex_field": bson_regex,
                "32_bit_integer_field": 32,
                "timestamp_field": bson.timestamp.Timestamp(1565897157, 1),
                "64_bit_integer_field": 34359738368,
                "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
                "javaScript_field": bson.code.Code("var x, y, z;"),
                "javaScript_with_scope_field": bson.code.Code(
                    "function incrementX() { x++; }", scope={"x": 1}),
                "min_key_field": bson.min_key.MinKey,
                "max_key_field": bson.max_key.MaxKey,
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec72820c4'),
                "dbref_field": bson.dbref.DBRef(
                    "some_collection",
                    bson.objectid.ObjectId(b'123456789123'),
                    database='some_database'),
            }
            client["datatype_db"]["datatype_coll_1"].insert_one(all_types_doc)

            # NB: a year-0000 date is inserted via the mongo shell helper so
            # the test can confirm such a value is handled correctly.
            invalid_date_js = (
                'db.invalid_datatype_coll.insert('
                '{ "date_field": new ISODate("0000-01-01T00:00:00.000Z") });'
            )
            run_mongodb_javascript("datatype_db", invalid_date_js)
Esempio n. 7
0
    def setUp(self):
        """Reset MongoDB and build a multi-database fixture.

        Seeds ``simple_db`` (two collections plus a view), ``admin``,
        ``simple_db_2`` (names that collide with ``simple_db`` and differ only
        by case), ``special_db`` (collection names with special characters)
        and ``datatype_db`` (one multi-type document in two collections, the
        second of which gets four single-field ascending indexes).
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            simple_db = client["simple_db"]

            # 50 and 100 generated documents respectively.
            simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))
            simple_db["simple_coll_2"].insert_many(generate_simple_coll_docs(100))

            # 50 generated documents in the admin database.
            client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50))

            # Pass-through view (empty pipeline) on simple_coll_1.
            create_view_cmd = bson.son.SON([
                ("create", "simple_view_1"),
                ("viewOn", "simple_coll_1"),
                ("pipeline", []),
            ])
            simple_db.command(create_view_cmd)

            # Same collection name as in simple_db, plus an upper-case twin.
            for coll_name in ("simple_coll_1", "SIMPLE_COLL_1"):
                client["simple_db_2"][coll_name].insert_many(generate_simple_coll_docs(50))

            # Collection names containing non-ASCII and punctuation characters.
            for coll_name in ("hebrew_ישראל", 'hello!world?'):
                client["special_db"][coll_name].insert_many(generate_simple_coll_docs(50))

            # BSON regex derived from a compiled Python pattern, with the
            # UNICODE flag bit toggled.
            bson_regex = bson.Regex.from_native(re.compile('.*'))
            bson_regex.flags = bson_regex.flags ^ re.UNICODE

            # One field per data type; date and timestamp are taken from the
            # current clock, so their values differ between runs.
            all_types_doc = {
                "double_field": 4.3,
                "string_field": "a sample string",
                "object_field": {
                    "obj_field_1_key": "obj_field_1_val",
                    "obj_field_2_key": "obj_field_2_val",
                },
                "array_field": ["array_item_1", "array_item_2", "array_item_3"],
                "binary_data_field": b"a binary string",
                "object_id_field": bson.objectid.ObjectId(b'123456789123'),
                "boolean_field": True,
                "date_field": datetime.datetime.now(),
                "null_field": None,
                "regex_field": bson_regex,
                "32_bit_integer_field": 32,
                "timestamp_field": bson.timestamp.Timestamp(int(time.time()), 1),
                "64_bit_integer_field": 34359738368,
                "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
                "javaScript_field": bson.code.Code("var x, y, z;"),
                "javaScript_with_scope_field": bson.code.Code(
                    "function incrementX() { x++; }", scope={"x": 1}),
                "min_key_field": bson.min_key.MinKey,
                "max_key_field": bson.max_key.MaxKey,
            }

            datatype_db = client["datatype_db"]
            datatype_db["datatype_coll_1"].insert_one(all_types_doc)

            # The very same dict object is inserted again into datatype_coll_2,
            # which then gets one ascending index per listed field.
            indexed_coll = datatype_db["datatype_coll_2"]
            indexed_coll.insert_one(all_types_doc)
            for indexed_field in ("date_field",
                                  "timestamp_field",
                                  "32_bit_integer_field",
                                  "64_bit_integer_field"):
                indexed_coll.create_index([(indexed_field, pymongo.ASCENDING)])
Esempio n. 8
0
    def test_run(self):
        """Exercise INCREMENTAL replication around open and committed sessions.

        Steps:
          1. Seed simple_coll_1 (10 docs) and simple_coll_2 (20 docs).
          2. Open session 2, update one doc in each of those collections and
             insert 5 docs into simple_coll_3 inside its transaction, then run
             discovery and sync 1 while that transaction is still open.
          3. Commit session 2 and run sync 2.
          4. In session 3, delete the bookmarked document of every stream,
             commit, run sync 3 and check the row counts and that the state
             after sync 3 equals the state after sync 2.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # drop all dbs/collections
            drop_all_collections(client)

            #################
            # Session 1: seed simple_coll_1 & simple_coll_2
            #################

            session1 = client.start_session()

            session1.start_transaction()

            # NOTE(review): these inserts are not given session=session1, so
            # they appear to run outside the session-1 transaction -- confirm
            # that is intended.
            # simple_coll_1 has 10 documents
            client["simple_db"]["simple_coll_1"].insert_many(
                generate_simple_coll_docs(10))

            # simple_coll_2 has 20 documents
            client["simple_db"]["simple_coll_2"].insert_many(
                generate_simple_coll_docs(20))

            session1.commit_transaction()

            ################
            # Session 2
            #   create empty collection
            #   update documents in simple_coll_1 & simple_coll_2 and tie to session 2
            #   insert documents in simple_coll_3 and tie to session 2
            #   execute the sync with uncommitted changes
            #   validate that the uncommitted changes are not replicated by the sync
            ################

            session2 = client.start_session()

            session2.start_transaction()

            # simple_coll_3 is an empty collection
            client["simple_db"].create_collection("simple_coll_3")

            # update document from coll 1 and coll 2
            client["simple_db"]["simple_coll_1"].update_one(
                {"int_field": 5}, {"$set": {
                    "int_field": 11
                }},
                session=session2)
            client["simple_db"]["simple_coll_2"].update_one(
                {"int_field": 10}, {"$set": {
                    "int_field": 21
                }},
                session=session2)

            # insert document to coll 3
            client["simple_db"]["simple_coll_3"].insert_many(
                generate_simple_coll_docs(5), session=session2)

            # deletes do not matter in incremental replication, invalid scenario to test

            conn_id = connections.ensure_connection(self)

            # run in discovery mode
            check_job_name = runner.run_check_mode(self, conn_id)

            # verify check exit codes
            exit_status = menagerie.get_exit_status(conn_id, check_job_name)
            menagerie.verify_check_exit_status(self, exit_status,
                                               check_job_name)

            # verify the tap discovered the right streams
            found_catalogs = menagerie.get_catalogs(conn_id)

            # assert we find the correct streams which includes all collections which are part of session1 and session2
            self.assertEqual(self.expected_check_streams_sync_1(),
                             {c['tap_stream_id']
                              for c in found_catalogs})

            # Select streams and add replication method metadata
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'INCREMENTAL',
                    'replication_key': 'int_field'
                }
            }]
            for stream_catalog in found_catalogs:
                annotated_schema = menagerie.get_annotated_schema(
                    conn_id, stream_catalog['stream_id'])
                connections.select_catalog_and_fields_via_metadata(
                    conn_id, stream_catalog, annotated_schema, additional_md)

            # run the first (incremental) sync
            sync_1 = runner.run_sync_mode(self, conn_id)

            # check exit status
            exit_status = menagerie.get_exit_status(conn_id, sync_1)
            menagerie.verify_sync_exit_status(self, exit_status, sync_1)

            # streams that we synced are the ones that we expect to see
            records_by_stream = runner.get_records_from_target_output()
            record_count_by_stream = runner.examine_target_output_file(
                self, conn_id, self.expected_sync_streams_1(),
                self.expected_pks_1())

            # validate the record count in collections which are part of session1 and session2, should not read updates on coll 1 and coll 2 and insert on coll 3. Because the transaction is not committed
            self.assertEqual(self.expected_row_counts_sync_1(),
                             record_count_by_stream)

            # validate there are no duplicates replicated as part of sync 1
            pk_dict_2 = {}
            for stream in self.expected_sync_streams_1():
                upserts = [
                    x for x in records_by_stream[stream]['messages']
                    if x.get('action') == 'upsert'
                ]
                pk_dict_2[stream] = [
                    message['data']['int_field'] for message in upserts
                ]

            self.assertEqual(self.expected_pk_values_2(), pk_dict_2)

            session2.commit_transaction()

            ################
            # Session 3
            #   Execute another sync
            #   Validate that the documents committed as part of session 2
            #   should now be replicated in sync_2
            ################

            session3 = client.start_session()

            session3.start_transaction()

            # Run 2nd sync
            sync_2 = runner.run_sync_mode(self, conn_id)
            exit_status_2 = menagerie.get_exit_status(conn_id, sync_2)
            menagerie.verify_sync_exit_status(self, exit_status_2, sync_2)

            records_by_stream_2 = runner.get_records_from_target_output()
            record_count_by_stream_2 = runner.examine_target_output_file(
                self, conn_id, self.expected_sync_streams_2(),
                self.expected_pks_2())
            # validate that we see the updates to coll 1 and coll 2 and insert to coll 3 in the 2nd sync
            # we see 2 records for coll 1 and coll 2, 1 record for update and the other record for the bookmarked record
            self.assertEqual(self.expected_row_counts_sync_2(),
                             record_count_by_stream_2)

            # validate there are no duplicates replicated as part of sync 2
            # NOTE(review): this iterates expected_sync_streams_1() while
            # reading sync-2 output -- confirm that is the intended stream set.
            pk_dict_3 = {}
            for stream in self.expected_sync_streams_1():
                upserts = [
                    x for x in records_by_stream_2[stream]['messages']
                    if x.get('action') == 'upsert'
                ]
                pk_dict_3[stream] = [
                    message['data']['int_field'] for message in upserts
                ]

            self.assertEqual(self.expected_pk_values_3(), pk_dict_3)

            # Test case to validate tap behaviour when we delete bookmarked document and run sync
            state_2 = menagerie.get_state(conn_id)

            # Explicit stream -> collection mapping; an unexpected stream now
            # fails with a KeyError instead of reusing a stale/unbound name.
            collection_by_stream = {
                'simple_db-simple_coll_1': 'simple_coll_1',
                'simple_db-simple_coll_2': 'simple_coll_2',
                'simple_db-simple_coll_3': 'simple_coll_3',
            }
            for stream in self.expected_check_streams_sync_1():
                rep_key_value = state_2['bookmarks'][stream][
                    'replication_key_value']
                collection = collection_by_stream[stream]
                client["simple_db"][collection].delete_one(
                    {"int_field": int(rep_key_value)}, session=session3)

            session3.commit_transaction()

            # Execute the sync, after the commit on session 3
            # Session 3 commits includes deleting the bookmarked value in each of the collection
            # Validate the state does not change after deleting the bookmarked value
            # Validate that the sync does not replicate any documents
            sync_3 = runner.run_sync_mode(self, conn_id)
            exit_status_3 = menagerie.get_exit_status(conn_id, sync_3)
            menagerie.verify_sync_exit_status(self, exit_status_3, sync_3)

            # Read the state only AFTER sync 3 has run; reading it before the
            # sync would compare two reads with no sync in between.
            state_3 = menagerie.get_state(conn_id)

            runner.get_records_from_target_output()
            record_count_by_stream_3 = runner.examine_target_output_file(
                self, conn_id, self.expected_sync_streams_2(),
                self.expected_pks_2())

            # validate that we see 0 records being replicated because we deleted the bookmark value on each of the collection
            self.assertEqual(self.expected_row_counts_sync_3(),
                             record_count_by_stream_3)

            # validate that the state value has not changed after deleting the bookmarked value in each collection
            self.assertEqual(state_2, state_3)