Example #1
0
    def setUp(self):
        """Reset MongoDB and seed ``simple_db`` with indexed collections.

        Inserts 50 generated documents into ``simple_coll_1`` and 100 into
        ``simple_coll_2``, adds an ascending index on ``date_field`` to both,
        then creates one 50-document ``simple_coll_<key_name>`` collection for
        every entry of ``self.key_names()`` with an ascending index on that key.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # drop all dbs/collections
            drop_all_collections(client)

            simple_db = client["simple_db"]

            # simple_coll_1 has 50 documents, simple_coll_2 has 100 documents
            simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))
            simple_db["simple_coll_2"].insert_many(generate_simple_coll_docs(100))

            # ascending index on date_field for both fixed collections
            for coll_name in ("simple_coll_1", "simple_coll_2"):
                simple_db[coll_name].create_index([("date_field", pymongo.ASCENDING)])

            # one collection per key type, indexed on that key
            for key_name in self.key_names():
                keyed_coll = simple_db["simple_coll_{}".format(key_name)]
                keyed_coll.insert_many(generate_simple_coll_docs(50))
                keyed_coll.create_index([(key_name, pymongo.ASCENDING)])
Example #2
0
    def setUp(self):
        """Reset MongoDB and load 50 generated documents into ``simple_db.simple_coll_1``."""
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # drop all dbs/collections
            drop_all_collections(client)

            # simple_coll_1 has 50 documents
            simple_coll_1 = client["simple_db"]["simple_coll_1"]
            simple_coll_1.insert_many(generate_simple_coll_docs(50))
Example #3
0
    def setUp(self):
        """Reset MongoDB and seed two 50-document collections in ``simple_db``.

        ``simple_coll_1`` is filled from ``generate_simple_coll_docs`` and
        ``simple_coll_2`` from ``generate_simple_binary_coll_docs``.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # drop all dbs/collections
            drop_all_collections(client)

            simple_db = client["simple_db"]

            # simple_coll_1 has 50 documents, id is an integer instead of ObjectId
            simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

            # simple_coll_2 has 50 documents from the "binary" generator
            # (presumably binary _id values -- confirm against the helper)
            simple_db["simple_coll_2"].insert_many(generate_simple_binary_coll_docs(50))
Example #4
0
    def setUp(self):
        """Reset MongoDB and seed ``simple_coll_1`` (50 docs) and ``simple_coll_2`` (100 docs)."""
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # drop all dbs/collections
            drop_all_collections(client)

            simple_db = client["simple_db"]

            # simple_coll_1 has 50 documents
            simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

            # simple_coll_2 has 100 documents
            simple_db["simple_coll_2"].insert_many(generate_simple_coll_docs(100))
Example #5
0
    def setUp(self):
        """Reset MongoDB and build the fixture used by the full-table test.

        Creates, in order: ``simple_coll_1`` (50 docs), the view
        ``simple_view_1`` on top of it, ``simple_coll_2`` (100 docs),
        ``admin.admin_coll_1`` (50 docs), the empty ``simple_coll_3`` and
        ``simple_coll_4`` holding five hand-built documents (three with
        non-ASCII keys, one nested 101 keys deep, one with 1600 fields).
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # drop all dbs/collections
            drop_all_collections(client)

            simple_db = client["simple_db"]

            # simple_coll_1 has 50 documents
            simple_db["simple_coll_1"].insert_many(generate_simple_coll_docs(50))

            # create view on simple_coll_1 (empty pipeline)
            create_view_cmd = bson.son.SON([
                ("create", "simple_view_1"),
                ("viewOn", "simple_coll_1"),
                ("pipeline", []),
            ])
            simple_db.command(create_view_cmd)

            # simple_coll_2 has 100 documents
            simple_db["simple_coll_2"].insert_many(generate_simple_coll_docs(100))

            # admin_coll_1 has 50 documents
            client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(50))

            # simple_coll_3 is an empty collection
            simple_db.create_collection("simple_coll_3")

            # simple_coll_4 has documents with special chars and a lot of nesting
            special_coll = simple_db["simple_coll_4"]
            for special_doc in (
                {"hebrew_ישרא": "hebrew_ישרא"},
                {"hebrew_ישרא": 2},
                {"another_hebrew_ישראל": "another_hebrew_ישרא"},
            ):
                special_coll.insert_one(special_doc)

            # field0 -> field1 -> ... -> field100 == "some_value",
            # assembled from the innermost value outwards
            nested_doc = "some_value"
            for depth in range(100, -1, -1):
                nested_doc = {"field{}".format(depth): nested_doc}
            special_coll.insert_one(nested_doc)

            # one document with 1600 top-level fields col_0 .. col_1599
            max_col_doc = {'col_{}'.format(x): x for x in range(1600)}
            special_coll.insert_one(max_col_doc)
Example #6
0
    def test_run(self):
        """Run discovery followed by two FULL_TABLE syncs and compare the results.

        After the first sync the test checks record counts, that every stream
        starts and ends with an ``activate_version`` message, and that state
        holds a version bookmark. It then updates one document and inserts two
        more into ``simple_coll_1``, ``simple_coll_2`` and ``admin_coll_1``,
        syncs again, and checks the new counts, the message ordering and that
        each stream's version bookmark grew and matches the target's
        ``table_version``.
        """

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id'] for c in found_catalogs})

        #  -------------------------------------------
        #  ----------- First full Table Sync ---------
        #  -------------------------------------------
        # Select every discovered stream and set its replication method to FULL_TABLE
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}]
            # NOTE(review): the return value is never read; only the call itself matters here
            selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id,
                                                                                    stream_catalog,
                                                                                    annotated_schema,
                                                                                    additional_md)

        # run full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # check exit status
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # collect the messages the target received and the per-stream record counts
        records_by_stream = runner.get_records_from_target_output()
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # assert that we get the correct number of records for each stream
        self.assertEqual(self.expected_row_counts(),record_count_by_stream)

        # assert that an activate_version_message is first and last message sent for each stream
        for stream_name in self.expected_sync_streams():
            self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action'])
            self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action'])

        state = menagerie.get_state(conn_id)

        # version bookmark of every stream after the first sync, keyed by tap_stream_id
        first_versions = {}

        for tap_stream_id in self.expected_check_streams():

            # state has an initial_full_table_complete == True
            self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete'])

            # there is a version bookmark in state
            first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version']
            self.assertIsNotNone(first_versions[tap_stream_id])

        #  -------------------------------------------
        #  ----------- Second full Table Sync ---------
        #  -------------------------------------------
        with get_test_connection() as client:
            # update existing documents in the collection to make sure we get the updates as well in the next sync
            doc_to_update = client["simple_db"]["simple_coll_1"].find_one()
            client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 999}})

            doc_to_update = client["simple_db"]["simple_coll_2"].find_one()
            client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 888}})

            doc_to_update = client["admin"]["admin_coll_1"].find_one()
            client["admin"]["admin_coll_1"].find_one_and_update({"_id": doc_to_update["_id"]}, {"$set": {"int_field": 777}})

            # add 2 rows and run full table again, make sure we get initial number + 2
            client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(2))

            client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(2))

            client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(2))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # check exit status
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # collect the messages the target received during the second sync
        records_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        state = menagerie.get_state(conn_id)

        # Verify that 'currently_syncing' is None in the menagerie state
        # (the key itself must be present, otherwise this lookup raises KeyError)
        self.assertIsNone(state['currently_syncing'])

        # Verify that menagerie state does not include a key for oplog based syncing
        self.assertNotIn('oplog', state)

        # assert that we have correct number of records (including the two new records and the update which is to be resynced)
        # simple_coll_3 and simple_coll_4 are not modified above, so they are pinned to fixed
        # counts instead of +2 (0 and 5 presumably mirror the setUp fixture -- confirm there)
        new_expected_row_counts = {k: v+2 for k, v in self.expected_row_counts().items() if k not in ['simple_db_simple_coll_3',
                                                                                                    'simple_db_simple_coll_4']}
        new_expected_row_counts['simple_db_simple_coll_3']=0
        new_expected_row_counts['simple_db_simple_coll_4']=5
        self.assertEqual(new_expected_row_counts, record_count_by_stream)

        # assert that we only have an ActivateVersionMessage as the last message and not the first
        # (streams with a single message are only checked for the trailing activate_version)
        # NOTE(review): the failure message is stream_name + "failed" with no separating space
        for stream_name in self.expected_sync_streams():
            if len(records_by_stream[stream_name]['messages']) > 1:
                self.assertNotEqual('activate_version', records_by_stream[stream_name]['messages'][0]['action'], stream_name + "failed")
                self.assertEqual('upsert', records_by_stream[stream_name]['messages'][0]['action'], stream_name + "failed")
            self.assertEqual('activate_version', records_by_stream[stream_name]['messages'][-1]['action'], stream_name + "failed")

        # version bookmark of every stream after the second sync, keyed by tap_stream_id
        second_versions = {}
        for tap_stream_id in self.expected_check_streams():
            # NOTE(review): found_stream is never read afterwards; the lookup only raises
            # IndexError when the stream is missing from found_catalogs
            found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0]

            # state has an initial_full_table_complete == True
            self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete'])

            # version bookmark
            second_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version']
            self.assertIsNotNone(second_versions[tap_stream_id])

            # version in this state is different than that of the previous state
            self.assertNotEqual(first_versions[tap_stream_id], second_versions[tap_stream_id])

            # version which is larger than the previous target version
            self.assertGreater(second_versions[tap_stream_id], first_versions[tap_stream_id])

            # verify that menagerie state does include the version which matches the target version
            self.assertEqual(records_by_stream[self.tap_stream_id_to_stream()[tap_stream_id]]['table_version'], second_versions[tap_stream_id])
Example #7
0
    def test_run(self):
        """Check that LOG_BASED replication ignores changes to unselected collections.

        Runs discovery, selects only ``simple_db-simple_coll_1`` as LOG_BASED,
        performs the initial sync and checks its bookmarks. It then inserts one
        document into ``simple_coll_1`` and syncs, modifies only
        ``simple_coll_2`` (deletes, updates, inserts), syncs again, and asserts
        that a single ``simple_coll_1`` record is emitted and that the oplog
        bookmark equals the ``ts`` of the newest oplog entry.
        """

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id']
                          for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            # catalog entry for this stream; IndexError here means it was not discovered
            found_stream = [
                c for c in found_catalogs
                if c['tap_stream_id'] == tap_stream_id
            ][0]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[found_stream['stream_name']],
                set(
                    found_stream.get('metadata',
                                     {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(
                self.expected_row_counts()[found_stream['stream_name']],
                found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Initial Full Table ---------
        #  -----------------------------------
        # Select simple_coll_1 and add replication method metadata
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        # only simple_db-simple_coll_1 is selected; every other stream stays unselected
        for stream_catalog in found_catalogs:
            if stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_1':
                annotated_schema = menagerie.get_annotated_schema(
                    conn_id, stream_catalog['stream_id'])
                # NOTE(review): the return value is never read afterwards
                selected_metadata = connections.select_catalog_and_fields_via_metadata(
                    conn_id, stream_catalog, annotated_schema, additional_md)

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # collect the messages the target received
        records_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that the full table was synced (at least the expected number of rows)
        tap_stream_id = 'simple_db-simple_coll_1'
        self.assertGreaterEqual(record_count_by_stream['simple_coll_1'],
                                self.expected_row_counts()['simple_coll_1'])

        # Verify that we have 'initial_full_table_complete' bookmark
        state = menagerie.get_state(conn_id)
        first_versions = {}

        # assert that the state has an initial_full_table_complete == True
        self.assertTrue(
            state['bookmarks'][tap_stream_id]['initial_full_table_complete'])
        # assert that there is a version bookmark in state
        first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id][
            'version']
        self.assertIsNotNone(first_versions[tap_stream_id])
        # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark
        self.assertIsNotNone(
            state['bookmarks'][tap_stream_id]['oplog_ts_time'])
        self.assertIsNotNone(state['bookmarks'][tap_stream_id]['oplog_ts_inc'])

        # Insert records to coll_1 to get the bookmark to be a ts on coll_1
        with get_test_connection() as client:
            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field":
                101,
                "string_field":
                random_string_generator()
            })
        # NOTE(review): the exit status of this intermediate sync is not verified
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # _ids of the simple_coll_2 documents touched below
        # NOTE(review): changed_ids is filled but never read later in this method
        changed_ids = set()
        with get_test_connection() as client:
            # Make changes to not selected collection
            # each _id is looked up before its document is deleted or updated
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 0})[0]['_id'])
            client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 1})[0]['_id'])
            client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 98})[0]['_id'])
            client["simple_db"]["simple_coll_2"].update_one(
                {'int_field': 98}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 99})[0]['_id'])
            client["simple_db"]["simple_coll_2"].update_one(
                {'int_field': 99}, {'$set': {
                    'int_field': -1
                }})

            # for inserts the _id only exists afterwards, so it is looked up after insert_one
            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field":
                100,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 100})[0]['_id'])

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field":
                101,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 101})[0]['_id'])

        #  -----------------------------------
        # ----------- Subsequent Oplog Sync ---------
        #  -----------------------------------

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # collect the target messages and keep only the upserts for simple_coll_1
        # NOTE(review): records_by_stream is not read again after this assignment
        messages_by_stream = runner.get_records_from_target_output()
        records_by_stream = {
            'simple_coll_1': [
                x for x in messages_by_stream['simple_coll_1']['messages']
                if x.get('action') == 'upsert'
            ]
        }

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # 1 record due to fencepost querying on oplog ts
        self.assertEqual(1, record_count_by_stream['simple_coll_1'])

        final_state = menagerie.get_state(conn_id)

        # newest oplog entry: local.oplog.rs sorted by $natural descending
        with get_test_connection() as client:
            row = client.local.oplog.rs.find_one(sort=[('$natural',
                                                        pymongo.DESCENDING)])
            latest_oplog_ts = row.get('ts')

        # the bookmark must equal that entry's (time, inc) pair even though the
        # newest changes were made to the unselected simple_coll_2
        self.assertEqual((latest_oplog_ts.time, latest_oplog_ts.inc),
                         (final_state['bookmarks']['simple_db-simple_coll_1']
                          ['oplog_ts_time'], final_state['bookmarks']
                          ['simple_db-simple_coll_1']['oplog_ts_inc']))
Example #8
0
    def test_run(self):
        """Resume an interrupted FULL_TABLE sync and check what gets replicated.

        Steps:
          1. Run discovery and select every discovered stream as FULL_TABLE.
          2. Build a synthetic interrupted state per expected stream whose
             ``last_id_fetched`` is the middle ``_id`` (index ``len // 2`` in
             ascending ``_id`` order) and whose ``max_id_value`` is the last
             ``_id``.
          3. Update one document before and one after the resume point in
             ``simple_coll_1`` and ``simple_coll_2``.
          4. Inject the state, run a sync and assert that the emitted records
             run from ``last_id_fetched`` to ``max_id_value``, that updates
             before the resume point are absent, that updates after it are
             present, and that the resume bookmarks are cleared afterwards.
        """
        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id'] for c in found_catalogs})

        #  -----------------------------------
        # ----------- Full Table Sync ---------
        #  -----------------------------------
        # Select every discovered stream and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'FULL_TABLE'
                }
            }]
            connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        # Synthesize interrupted state
        interrupted_state = {
            'currently_syncing': 'simple_db-simple_coll_1',
            'bookmarks': {}
        }

        with get_test_connection() as client:
            for stream_name in self.expected_sync_streams():
                rows = list(client['simple_db'][stream_name].find(
                    sort=[("_id", pymongo.ASCENDING)]))

                interrupted_state['bookmarks']['simple_db-' + stream_name] = {
                    'max_id_value': str(rows[-1]['_id']),
                    'max_id_type': 'ObjectId',
                    'initial_full_table_complete': False,
                    # resume from the middle point of the table
                    'last_id_fetched': str(rows[len(rows) // 2]['_id']),
                    'last_id_fetched_type': 'ObjectId',
                    'version': int(time.time() * 1000)
                }

            # Update one document before the resume point in each collection
            # (must NOT show up in the sync) and one after it (must show up).
            # The connection is still open here: these statements belong to
            # the `with` block above.

            # find_one() retrieves the first document in the collection
            doc_to_update_1 = client["simple_db"]["simple_coll_1"].find_one()
            client["simple_db"]["simple_coll_1"].find_one_and_update(
                {"_id": doc_to_update_1["_id"]}, {"$set": {"int_field": 999}})

            doc_to_update_2 = client["simple_db"]["simple_coll_2"].find_one()
            client["simple_db"]["simple_coll_2"].find_one_and_update(
                {"_id": doc_to_update_2["_id"]}, {"$set": {"int_field": 888}})

            doc_to_update_3 = client["simple_db"]["simple_coll_1"].find_one(
                {"int_field": 30})
            client["simple_db"]["simple_coll_1"].find_one_and_update(
                {"_id": doc_to_update_3["_id"]}, {"$set": {"int_field": 777}})

            doc_to_update_4 = client["simple_db"]["simple_coll_2"].find_one(
                {"int_field": 80})
            client["simple_db"]["simple_coll_2"].find_one_and_update(
                {"_id": doc_to_update_4["_id"]}, {"$set": {"int_field": 666}})

        menagerie.set_state(conn_id, interrupted_state)

        runner.run_sync_mode(self, conn_id)

        # streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # record counts
        records_by_stream = runner.get_records_from_target_output()
        self.assertEqual(self.expected_row_counts(), record_count_by_stream)

        coll_1_messages = records_by_stream['simple_coll_1']['messages']
        coll_2_messages = records_by_stream['simple_coll_2']['messages']
        coll_1_bookmark = interrupted_state['bookmarks']['simple_db-simple_coll_1']
        coll_2_bookmark = interrupted_state['bookmarks']['simple_db-simple_coll_2']

        # ActivateVersionMessage as the last message and not the first
        for stream_name in self.expected_sync_streams():
            stream_messages = records_by_stream[stream_name]['messages']
            self.assertNotEqual('activate_version', stream_messages[0]['action'])
            self.assertEqual('activate_version', stream_messages[-1]['action'])

        # _id of the first record sync'd for each stream is the bookmarked
        # last_id_fetched from the interrupted_state passed to the tap
        self.assertEqual(coll_1_messages[0]['data']['_id'],
                         coll_1_bookmark['last_id_fetched'])
        self.assertEqual(coll_2_messages[0]['data']['_id'],
                         coll_2_bookmark['last_id_fetched'])

        # _id of the last record sync'd for each stream is the bookmarked
        # max_id_value from the interrupted_state passed to the tap
        # (index -2 because the final message is the activate_version)
        self.assertEqual(coll_1_messages[-2]['data']['_id'],
                         coll_1_bookmark['max_id_value'])
        self.assertEqual(coll_2_messages[-2]['data']['_id'],
                         coll_2_bookmark['max_id_value'])

        # verify we are not seeing any documents which were updated having id < interrupted id value
        # checking just the first document value
        self.assertNotEqual(999, coll_1_messages[0]['data']['int_field'])
        self.assertNotEqual(888, coll_2_messages[0]['data']['int_field'])

        # the update made before the resume point must not appear in ANY record
        # of simple_coll_1; the last message is skipped because it has no 'data'
        coll_1_int_values = [int(x['data']['int_field']) for x in coll_1_messages[:-1]]
        self.assertNotIn(999, coll_1_int_values)

        # same check for simple_coll_2 (this previously iterated simple_coll_1
        # by mistake, so the 888 update was never checked on the right stream)
        coll_2_int_values = [x['data']['int_field'] for x in coll_2_messages[:-1]]
        self.assertNotIn(888, coll_2_int_values)

        # verify we are seeing the documents which were updated having id > interrupted id value
        # The sync resumes at row len // 2. Assuming int_field follows _id order
        # and the collections hold 50 and 100 documents (TODO confirm against
        # setUp), the document that had int_field 30 is message 5 of
        # simple_coll_1 (resume row 25) and the one that had int_field 80 is
        # message 30 of simple_coll_2 (resume row 50).
        self.assertEqual(777, coll_1_messages[5]['data']['int_field'])
        self.assertEqual(666, coll_2_messages[30]['data']['int_field'])

        # assert that final state has no last_id_fetched and max_id_value bookmarks
        final_state = menagerie.get_state(conn_id)
        for tap_stream_id in self.expected_check_streams():
            self.assertIsNone(
                final_state['bookmarks'][tap_stream_id].get('last_id_fetched'))
            self.assertIsNone(
                final_state['bookmarks'][tap_stream_id].get('max_id_value'))

        # every bookmarked stream must be flagged as having completed its full table sync
        state = menagerie.get_state(conn_id)
        for tap_stream_id, stream_bookmarks in state.get('bookmarks',
                                                         {}).items():
            self.assertTrue(
                stream_bookmarks.get('initial_full_table_complete', False))
Example #9
0
    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id'] for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0]

            # assert that the pks are correct
            self.assertEqual(self.expected_pks()[found_stream['stream_name']],
                             set(found_stream.get('metadata', {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(self.expected_row_counts()[found_stream['stream_name']],
                             found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Full Table Sync ---------
        #  -----------------------------------
        # select simple_coll_1 stream and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id,
                                                                                    stream_catalog,
                                                                                    annotated_schema,
                                                                                    additional_md)
        # synthesize interrupted state
        interrupted_state = {
            'currently_syncing' : 'simple_db-simple_coll_1',
            'bookmarks' : {'simple_db-simple_coll_1': { 'max_id_value': 49,
                                                        'max_id_type': 'int',
                                                        'initial_full_table_complete': False,
                                                        'last_id_fetched': 25,
                                                        'last_id_fetched_type': 'int',
                                                        'version': int(time.time() * 1000)},
                           'simple_db-simple_coll_2': { 'max_id_value': base64.b64encode("test {}".format(49).encode()),
                                                        'max_id_type': 'bytes',
                                                        'initial_full_table_complete': False,
                                                        'last_id_fetched': base64.b64encode("test {}".format(25).encode()),
                                                        'last_id_fetched_type': 'bytes',
                                                        'version': int(time.time() * 1000)}}}

        # update existing documents in collection with int_field value less than 25, and verify they do not come up in the sync
        # update existing documents in collection with int_field value greater than 25, and verify they come up in the sync
        with get_test_connection() as client:
            # find_one() is going to retreive the first document in the collection
            doc_to_update_1 = client["simple_db"]["simple_coll_1"].find_one()
            client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_1["_id"]}, {"$set": {"int_field": 999}})

            doc_to_update_2 = client["simple_db"]["simple_coll_2"].find_one()
            client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_2["_id"]}, {"$set": {"int_field": 888}})

            doc_to_update_3 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 30})
            client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": doc_to_update_3["_id"]}, {"$set": {"int_field": 777}})

            doc_to_update_4 = client["simple_db"]["simple_coll_2"].find_one({"int_field": 40})
            client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": doc_to_update_4["_id"]}, {"$set": {"int_field": 666}})

        menagerie.set_state(conn_id, interrupted_state)
        runner.run_sync_mode(self, conn_id)

        # streams that we synced are the ones that we expect to see
        records_by_stream = runner.get_records_from_target_output()
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # ActivateVersionMessage as the last message and not the first
        for stream_name in self.expected_sync_streams():
            self.assertNotEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action'])
            self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action'])

        # _id of the first record sync'd for each stream is the bookmarked
        # last_id_fetched from the interrupted_state passed to the tap
        self.assertEqual(records_by_stream['simple_coll_1']['messages'][0]['data']['_id'],
                         int(interrupted_state['bookmarks']['simple_db-simple_coll_1']['last_id_fetched']))

        # _id of the last record sync'd for each stream is the bookmarked
        # max_id_value from the interrupted_state passed to the tap
        self.assertEqual(records_by_stream['simple_coll_1']['messages'][-2]['data']['_id'],
                         int(interrupted_state['bookmarks']['simple_db-simple_coll_1']['max_id_value']))

        # verify we are not seeing any documents which were updated having id < 25
        self.assertNotEqual(999, records_by_stream['simple_coll_1']['messages'][0]['data']['int_field'])
        self.assertNotEqual(888, records_by_stream['simple_coll_2']['messages'][0]['data']['int_field'])

        int_value = False
        for x in records_by_stream['simple_coll_1']['messages'][:-1]:
            # We are not considering the last element of this list because it does not have 'data'
            if int(x['data']['int_field']) == 999:
                int_value = True
        self.assertEqual(False, int_value)

        int_value2 = False
        for x in records_by_stream['simple_coll_1']['messages'][:-1]:
            if x['data']['int_field'] == 888:
                int_value2 = True
        self.assertEqual(False, int_value2)

        # verify we are seeing the documents which were updated having id > 25
        # we are picking the 5th and 15th element in the list because we updated the 30th and 40th document, (doc starting with 25)
        self.assertEqual(777, records_by_stream['simple_coll_1']['messages'][5]['data']['int_field'])
        self.assertEqual(666, records_by_stream['simple_coll_2']['messages'][15]['data']['int_field'])

        # assert that final state has no last_id_fetched and max_id_value bookmarks
        final_state = menagerie.get_state(conn_id)
        for tap_stream_id in self.expected_check_streams():
            self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('last_id_fetched'))
            self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('max_id_value'))
Example #10
0
    def setUp(self):
        """Reset the server and seed the documents used by the datatype test.

        Drops all collections, inserts a single document into
        ``datatype_db.datatype_coll_1`` holding one field per value kind
        listed below, then inserts a document with a year-0000 date into
        ``invalid_datatype_coll`` through the JavaScript helper.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            # Match-anything regex; the XOR toggles the re.UNICODE bit on the
            # flags of the BSON regex built from the compiled pattern.
            bson_regex = bson.Regex.from_native(re.compile('.*'))
            bson_regex.flags ^= re.UNICODE

            # The same 12-byte id is used for the plain ObjectId field and
            # for the DBRef target.
            fixed_object_id = bson.objectid.ObjectId(b'123456789123')

            # One field per datatype; insertion order of the keys is kept.
            client["datatype_db"]["datatype_coll_1"].insert_one({
                "double_field": 4.3,
                "string_field": "a sample string",
                "object_field": {
                    "obj_field_1_key": "obj_field_1_val",
                    "obj_field_2_key": "obj_field_2_val",
                },
                "array_field": ["array_item_1", "array_item_2", "array_item_3"],
                "binary_data_field": bson.Binary(b"a binary string"),
                "object_id_field": fixed_object_id,
                "boolean_field": True,
                "date_field": datetime.datetime(2019, 8, 15, 19, 29, 14, 578000),
                "null_field": None,
                "regex_field": bson_regex,
                "32_bit_integer_field": 32,
                "timestamp_field": bson.timestamp.Timestamp(1565897157, 1),
                "64_bit_integer_field": 34359738368,
                "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
                "javaScript_field": bson.code.Code("var x, y, z;"),
                "javaScript_with_scope_field": bson.code.Code(
                    "function incrementX() { x++; }", scope={"x": 1}),
                "min_key_field": bson.min_key.MinKey,
                "max_key_field": bson.max_key.MaxKey,
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec72820c4'),
                "dbref_field": bson.dbref.DBRef("some_collection",
                                                fixed_object_id,
                                                database='some_database'),
            })

            # NB: the year-0000 date is inserted via the JavaScript helper to
            # confirm an invalid datetime is handled correctly.
            invalid_date_insert = (
                'db.invalid_datatype_coll.insert('
                '{ "date_field": new ISODate("0000-01-01T00:00:00.000Z") });'
            )
            run_mongodb_javascript("datatype_db", invalid_date_insert)
Example #11
0
    def test_run(self):
        """Run discovery plus a FULL_TABLE sync and check the datatype record.

        Steps:
          1. Run check mode and verify the discovered streams, primary keys
             and row counts against this test's expectations.
          2. Select every discovered stream with FULL_TABLE replication.
          3. Run a sync and verify record counts, that activate_version is
             both the first and last message per stream, and that state
             carries ``initial_full_table_complete`` and a ``version``.
          4. Compare the first data record emitted for ``datatype_coll_1``
             with the expected serialized document.
        """
        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id']
                          for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in found_catalogs
                if c['tap_stream_id'] == tap_stream_id
            ][0]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[found_stream['stream_name']],
                set(
                    found_stream.get('metadata',
                                     {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(
                self.expected_row_counts()[found_stream['stream_name']],
                found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Full Table Sync ---------
        #  -----------------------------------
        # Select every discovered stream and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'FULL_TABLE'
                }
            }]
            # the return value is not needed; the call is made for its effect
            connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        # run full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # collect the messages the target received, keyed by stream
        records_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # assert that we get the correct number of records for each stream
        self.assertEqual(self.expected_row_counts(), record_count_by_stream)

        # assert that an activate_version_message is first and last message sent for each stream
        for stream_name in self.expected_sync_streams():
            self.assertEqual(
                'activate_version',
                records_by_stream[stream_name]['messages'][0]['action'])
            self.assertEqual(
                'activate_version',
                records_by_stream[stream_name]['messages'][-1]['action'])

        state = menagerie.get_state(conn_id)

        first_versions = {}

        for tap_stream_id in self.expected_check_streams():
            # assert that the state has an initial_full_table_complete == True
            self.assertTrue(state['bookmarks'][tap_stream_id]
                            ['initial_full_table_complete'])
            # assert that there is a version bookmark in state
            first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id][
                'version']
            self.assertIsNotNone(first_versions[tap_stream_id])

        # _id of the first document in the collection, as a string, so it can
        # be compared with the emitted record
        with get_test_connection() as client:
            record_id = str(
                client['datatype_db']['datatype_coll_1'].find_one()['_id'])

        expected_record = {
            "javaScript_field": "var x, y, z;",
            "timestamp_field": "2019-08-15T19:25:57.000000Z",
            "_id": record_id,
            "date_field": "2019-08-15T19:29:14.578000Z",
            "string_field": "a sample string",
            "object_field": {
                "obj_field_2_key": "obj_field_2_val",
                "obj_field_1_key": "obj_field_1_val"
            },
            "null_field": None,
            "regex_field": {
                "flags": 0,
                "pattern": ".*"
            },
            "object_id_field": "313233343536373839313233",
            "64_bit_integer_field": 34359738368,
            "32_bit_integer_field": 32,
            "array_field": ["array_item_1", "array_item_2", "array_item_3"],
            "binary_data_field": "YSBiaW5hcnkgc3RyaW5n",
            "javaScript_with_scope_field": {
                "scope": "{'x': 1}",
                "value": "function incrementX() { x++; }"
            },
            "double_field": decimal.Decimal('4.3'),
            "boolean_field": True,
            "decimal_field": decimal.Decimal('1.34'),
            'uuid_field': "3e139ff5-d622-45c6-bf9e-1dfec72820c4",
            "dbref_field": {
                "id": "313233343536373839313233",
                "database": "some_database",
                "collection": "some_collection"
            }
        }

        # messages[0] is the activate_version asserted above, so the first
        # data record sits at index 1.
        # assertEqual, not the deprecated assertEquals alias, which newer
        # Python versions no longer provide.
        self.assertEqual(
            expected_record,
            records_by_stream['datatype_coll_1']['messages'][1]['data'])
Example #12
0
    def modify_database(self):
        """Delete, update and insert two documents in each simple collection.

        For ``simple_coll_1`` and ``simple_coll_2`` in ``simple_db``:
        documents matched by the listed ``int_field`` values are deleted,
        two others get ``int_field`` set to -1, and two new documents with a
        random ``string_field`` are inserted.  All deletes run first, then
        all updates, then all inserts.
        """
        # (collection, int_field values to delete, to update, to insert)
        plan = (
            ("simple_coll_1", (0, 1), (48, 49), (50, 51)),
            ("simple_coll_2", (0, 1), (98, 99), (100, 101)),
        )

        with get_test_connection() as client:
            simple_db = client["simple_db"]

            # Delete two documents for each collection
            for coll_name, doomed_values, _, _ in plan:
                for int_value in doomed_values:
                    simple_db[coll_name].delete_one({'int_field': int_value})

            # Update two documents for each collection
            for coll_name, _, stale_values, _ in plan:
                for int_value in stale_values:
                    simple_db[coll_name].update_one(
                        {'int_field': int_value},
                        {'$set': {'int_field': -1}})

            # Insert two documents for each collection
            for coll_name, _, _, fresh_values in plan:
                for int_value in fresh_values:
                    simple_db[coll_name].insert_one({
                        "int_field": int_value,
                        "string_field": random_string_generator(),
                    })
Example #13
0
    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        catalog = menagerie.get_catalog(conn_id)
        found_catalogs = menagerie.get_catalogs(conn_id)
        found_streams = {entry['tap_stream_id'] for entry in catalog['streams']}
        self.assertSetEqual(self.expected_check_streams(), found_streams)

        # verify the tap discovered stream metadata is consistent with the source database
        for tap_stream_id in self.expected_check_streams():
            with self.subTest(stream=tap_stream_id):

                # gather expectations
                stream = tap_stream_id.split('-')[1]
                expected_primary_key = self.expected_pks()[stream]
                expected_row_count = self.expected_row_counts()[stream]
                expected_replication_keys = self.expected_valid_replication_keys()[stream]

                # gather results
                found_stream = [entry for entry in catalog['streams'] if entry['tap_stream_id'] == tap_stream_id][0]
                stream_metadata = [entry['metadata'] for entry in found_stream['metadata'] if entry['breadcrumb']==[]][0]
                primary_key = set(stream_metadata.get('table-key-properties'))
                row_count = stream_metadata.get('row-count')
                replication_key = set(stream_metadata.get('valid-replication-keys'))

                # assert that the pks are correct
                self.assertSetEqual(expected_primary_key, primary_key)

                # assert that the row counts are correct
                self.assertEqual(expected_row_count, row_count)

                # assert that valid replication keys are correct
                self.assertSetEqual(replication_key, expected_replication_keys)

        #  -----------------------------------
        # ----------- Initial Sync ---------
        #  -----------------------------------

        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            rep_key = 'date_field'
            for key in self.key_names():
                if key in stream_catalog['stream_name']:
                    rep_key = key
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL',
                                                                'replication-key': rep_key}}]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id,
                                                                                   stream_catalog,
                                                                                   annotated_schema,
                                                                                   additional_md)

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        # gather expectations
        expected_schema = {'type': 'object'}

        for tap_stream_id in self.expected_sync_streams():
            with self.subTest(stream=tap_stream_id):

                # gather results
                persisted_schema = messages_by_stream[tap_stream_id]['schema']

                # assert the schema is an object
                self.assertDictEqual(expected_schema, persisted_schema)

        # verify that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # verify that the entire collection was synced by comparing row counts against the source
        for tap_stream_id in self.expected_sync_streams():
            with self.subTest(stream=tap_stream_id):

                expected_row_count = self.expected_row_counts()[tap_stream_id]
                row_count = record_count_by_stream[tap_stream_id]

                self.assertEqual(expected_row_count, row_count)

        # verify state is saved in the proper format for all streams
        state = menagerie.get_state(conn_id)
        expected_state_keys = {
            'last_replication_method',
            'replication_key_name',
            'replication_key_type',
            'replication_key_value',
            'version',
        }
        for tap_stream_id in self.expected_check_streams():
            with self.subTest(stream=tap_stream_id):
                bookmark = state['bookmarks'][tap_stream_id]

                # gather expectations
                stream = tap_stream_id.split('-')[1]
                expected_replication_keys = self.expected_valid_replication_keys()[stream]

                # gather results
                replication_key = bookmark['replication_key_name']
                replication_key_type = bookmark['replication_key_type']

                # assert that all expected bookmark keys are present
                self.assertSetEqual(expected_state_keys, set(bookmark.keys()))

                # assert all bookmark keys have values
                for key in expected_state_keys:
                    self.assertIsNotNone(bookmark[key])

                # assert incremental sync was performed
                self.assertEqual('INCREMENTAL', bookmark['last_replication_method'])

                # assert the replication key was used to save state
                self.assertIn(replication_key, expected_replication_keys)

                # assert the replication key type is a valid datatype
                self.assertIn(replication_key_type, VALID_REPLICATION_TYPES)

                self.assertIsNone(state['currently_syncing'])

        # -----------------------------------
        # ------------ Second Sync ----------
        # -----------------------------------

        # Perform data manipulations
        with get_test_connection() as client:

            # update 1 document in each of the collection
            update_doc_coll_1 = client["simple_db"]["simple_coll_1"].find_one()
            client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": update_doc_coll_1["_id"]}, {"$set": {"date_field": datetime(2020, 1, 1, 19, 29, 14, 578000)}})

            update_doc_coll_2 = client["simple_db"]["simple_coll_2"].find_one()
            client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": update_doc_coll_2["_id"]}, {"$set": {"date_field": datetime(2020, 1, 1, 19, 29, 14, 578000)}})

            for key_name in self.key_names():
                if (key_name == 'int_field'):
                    # get the first document in the collection to update
                    doc_to_update = client["simple_db"]["simple_coll_{}".format(key_name)].find_one(sort=[("{}".format(key_name), -1)])
                    value = doc_to_update["{}".format(key_name)]
                    int_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                    client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": int_based_coll["_id"]}, {"$set": {"{}".format(key_name): value+3}})
                elif (key_name == 'double_field'):
                    doc_to_update = client["simple_db"]["simple_coll_{}".format(key_name)].find_one(sort=[("{}".format(key_name), -1)])
                    value = doc_to_update["{}".format(key_name)]
                    double_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                    client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": double_based_coll["_id"]}, {"$set": {"{}".format(key_name): value+3}})
                elif (key_name == '64_bit_int_field'):
                    doc_to_update = client["simple_db"]["simple_coll_{}".format(key_name)].find_one(sort=[("{}".format(key_name), -1)])
                    value = doc_to_update["{}".format(key_name)]
                    bit64_int_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                    client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": bit64_int_based_coll["_id"]}, {"$set": {"{}".format(key_name): value+3}})
                elif (key_name == 'date_field'):
                    date_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                    client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": date_based_coll["_id"]}, {"$set": {"{}".format(key_name): datetime(2021, 1, 1, 15, 30, 14, 222000)}})
                elif (key_name == 'timestamp_field'):
                    timestamp_based_coll = client["simple_db"]["simple_coll_{}".format(key_name)].find_one()
                    client["simple_db"]["simple_coll_{}".format(key_name)].find_one_and_update({"_id": timestamp_based_coll["_id"]}, {"$set": {"{}".format(key_name): bson.timestamp.Timestamp(1565897157+99, 1)}})

            # TODO : figure out how to update collections with replication key = string, uuid

            # insert two documents with date_field > bookmark for next sync
            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field": 50,
                "string_field": z_string_generator(),
                "date_field": datetime(2018, 9, 13, 19, 29, 14, 578000),
                "double_field": 51.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+50, 1),
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282050'),
                "64_bit_int_field": 34359738368 + 50
            })
            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field": 51,
                "string_field": z_string_generator(),
                "date_field": datetime(2018, 9, 18, 19, 29, 14, 578000),
                "double_field": 52.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+51, 1),
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282051'),
                "64_bit_int_field": 34359738368 + 51
            })

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field": 100,
                "string_field": z_string_generator(),
                "date_field": datetime(2019, 5, 21, 19, 29, 14, 578000),
                "double_field": 101.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+100, 1),
                "uuid_field":uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282100'),
                "64_bit_int_field": 34359738368 + 100
            })
            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field": 101,
                "string_field": z_string_generator(),
                "date_field": datetime(2019, 5, 26, 19, 29, 14, 578000),
                "double_field": 102.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+101, 1),
                "uuid_field":  uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282101'),
                "64_bit_int_field": 34359738368 + 101
            })

            for key_name in self.key_names():
                client["simple_db"]["simple_coll_{}".format(key_name)].insert_one({
                    "int_field": 50,
                    "string_field": z_string_generator(50),
                    "date_field": datetime(2018, 9, 13, 19, 29, 15, 578000),
                    "double_field": 51.001,
                    "timestamp_field": bson.timestamp.Timestamp(1565897157+50, 1),
                    "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282050'),
                    "64_bit_int_field": 34359738368 + 50
                })
                client["simple_db"]["simple_coll_{}".format(key_name)].insert_one({
                    "int_field": 51,
                    "string_field": z_string_generator(51),
                    "date_field": datetime(2018, 9, 18, 19, 29, 16, 578000),
                    "double_field": 52.001,
                    "timestamp_field": bson.timestamp.Timestamp(1565897157+51, 1),
                    "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282051'),
                    "64_bit_int_field": 34359738368 + 51
                })

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert']

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # Verify that we got 4 records for each stream (2 because of the new records, 1 because of update and 1 because of greater than equal [for key based incremental there will always be an overlap on the bookmark value])
        for k, v in record_count_by_stream.items():
            # Workaround for not including collections for uuid and string, TODO : look for a solution to implement string and uuid as replication_key
            if k not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                self.assertEqual(4, v)

        # Verify that the _id of the records sent are the same set as the
        # _ids of the documents changed
        for stream_name in self.expected_sync_streams():
            # Workaround for not including collections for uuid and string, TODO : look for a solution to implement string and uuid as replication_key
            if stream_name not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                actual = set([x['data']['int_field'] for x in records_by_stream[stream_name]])
                self.assertEqual(self.expected_incremental_int_fields()[stream_name], actual)

        ##############################################################################
        # Verify that data is not replicated when non replication key is updated
        ##############################################################################

        # Sampling a document from a collection which we know it exists because of the data set up
        no_rep_doc_coll_1 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 20})
        client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": no_rep_doc_coll_1["_id"]}, {"$set": {"string_field": 'No_replication'}})

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        messages_by_stream = runner.get_records_from_target_output()
        second_state = menagerie.get_state(conn_id)
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert']

        doc_from_simple_coll_1 = records_by_stream['simple_coll_1']

        # Verify the document from simple_coll_1 does not correspond to the document which we updated_data
        self.assertNotEqual(doc_from_simple_coll_1[0]['data']['_id'], no_rep_doc_coll_1["_id"])

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # Verify that we got 1 record for each stream (1 because of greater than equal [for key based incremental there will always be an overlap on the bookmark value])
        for k, v in record_count_by_stream.items():
            if k not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                self.assertEqual(1, v)

        # -----------------------------------
        # ------------ Third Sync -----------
        # -----------------------------------
        # Change the replication method for simple_coll_1
        # Change the replication key for simple_coll_2
        # Make sure both do full resync
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            additional_md = []
            if stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_1':
                additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}]
            elif stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_2':
                additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL',
                                                                    'replication-key': 'timestamp_field'}}]
            else:
                additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL',
                                                                    'replication-key': stream_catalog['stream_name'].replace('simple_coll_', '')}}]

            selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id,
                                                                                   stream_catalog,
                                                                                   annotated_schema,
                                                                                   additional_md)
        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        self.assertDictEqual(record_count_by_stream, self.expected_last_sync_row_counts())
Example #14
0
    def setUp(self):
        """Reset the test MongoDB instance and load the fixture data.

        After dropping every existing collection this inserts generated
        documents into several databases (including collections whose names
        collide across databases or contain special characters), creates a
        view on ``simple_coll_1`` and stores one document covering many BSON
        datatypes in two ``datatype_db`` collections, the second of which is
        also indexed on four fields.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # Start from an empty server.
            drop_all_collections(client)

            # (database, collection, number of generated documents)
            plain_fixtures = (
                ("simple_db", "simple_coll_1", 50),
                ("simple_db", "simple_coll_2", 100),
                ("admin", "admin_coll_1", 50),
            )
            for db_name, coll_name, doc_count in plain_fixtures:
                client[db_name][coll_name].insert_many(generate_simple_coll_docs(doc_count))

            # View over simple_coll_1 with an empty pipeline.
            view_command = bson.son.SON([
                ("create", "simple_view_1"),
                ("viewOn", "simple_coll_1"),
                ("pipeline", []),
            ])
            client["simple_db"].command(view_command)

            # Collections whose names duplicate ones in another database
            # (also differing only by case), followed by collections with
            # special characters in their names.
            unusual_fixtures = (
                ("simple_db_2", "simple_coll_1"),
                ("simple_db_2", "SIMPLE_COLL_1"),
                ("special_db", "hebrew_ישראל"),
                ("special_db", "hello!world?"),
            )
            for db_name, coll_name in unusual_fixtures:
                client[db_name][coll_name].insert_many(generate_simple_coll_docs(50))

            # Regex value for the datatype document; the UNICODE flag is
            # toggled on the converted bson regex.
            bson_regex = bson.Regex.from_native(re.compile('.*'))
            bson_regex.flags ^= re.UNICODE

            # One document with a field per datatype under test.
            datatype_doc = {
                "double_field": 4.3,
                "string_field": "a sample string",
                "object_field": {
                    "obj_field_1_key": "obj_field_1_val",
                    "obj_field_2_key": "obj_field_2_val",
                },
                "array_field": ["array_item_1", "array_item_2", "array_item_3"],
                "binary_data_field": b"a binary string",
                "object_id_field": bson.objectid.ObjectId(b'123456789123'),
                "boolean_field": True,
                "date_field": datetime.datetime.now(),
                "null_field": None,
                "regex_field": bson_regex,
                "32_bit_integer_field": 32,
                "timestamp_field": bson.timestamp.Timestamp(int(time.time()), 1),
                "64_bit_integer_field": 34359738368,
                "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
                "javaScript_field": bson.code.Code("var x, y, z;"),
                "javaScript_with_scope_field": bson.code.Code(
                    "function incrementX() { x++; }", scope={"x": 1}),
                "min_key_field": bson.min_key.MinKey,
                "max_key_field": bson.max_key.MaxKey,
            }

            # The same dict object is inserted into both collections.
            client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc)
            client["datatype_db"]["datatype_coll_2"].insert_one(datatype_doc)

            # datatype_coll_2 additionally gets an ascending index per field.
            indexed_fields = (
                "date_field",
                "timestamp_field",
                "32_bit_integer_field",
                "64_bit_integer_field",
            )
            for field_name in indexed_fields:
                client["datatype_db"]["datatype_coll_2"].create_index(
                    [(field_name, pymongo.ASCENDING)])
Example #15
0
    def test_run(self):
        """Check INCREMENTAL replication around open and committed transactions.

        Steps:
          1. Seed ``simple_coll_1`` / ``simple_coll_2`` while session 1 has a
             transaction open, then commit it.
          2. With session 2's transaction still open, update one document in
             each seeded collection and insert into ``simple_coll_3``; run
             discovery and the first sync and assert the uncommitted changes
             are not replicated.
          3. Commit session 2, run a second sync and assert the changes now
             show up.
          4. Through session 3, delete the bookmarked document of every
             stream, commit, run a third sync and assert that the row counts
             match the expectation and that the state is unchanged.
        """
        ensure_environment_variables_set()

        with get_test_connection() as client:
            # drop all dbs/collections
            drop_all_collections(client)

            #################
            # Session 1
            #################
            # Insert docs into simple_coll_1 & simple_coll_2.
            # NOTE(review): these inserts do not pass session=session1, so they
            # presumably run outside the transaction - confirm this is intended.
            session1 = client.start_session()
            session1.start_transaction()

            # simple_coll_1 has 10 documents
            client["simple_db"]["simple_coll_1"].insert_many(
                generate_simple_coll_docs(10))

            # simple_coll_2 has 20 documents
            client["simple_db"]["simple_coll_2"].insert_many(
                generate_simple_coll_docs(20))

            session1.commit_transaction()
            # Explicit sessions should be ended once they are no longer needed.
            session1.end_session()

            ################
            # Session 2
            ################
            #   create an empty collection
            #   update documents in simple_coll_1 & simple_coll_2 tied to session 2
            #   insert documents in simple_coll_3 tied to session 2
            #   execute the sync with the changes still uncommitted
            #   validate that the uncommitted changes are not replicated
            session2 = client.start_session()
            session2.start_transaction()

            # simple_coll_3 is an empty collection
            client["simple_db"].create_collection("simple_coll_3")

            # update one document in coll 1 and coll 2
            client["simple_db"]["simple_coll_1"].update_one(
                {"int_field": 5}, {"$set": {"int_field": 11}},
                session=session2)
            client["simple_db"]["simple_coll_2"].update_one(
                {"int_field": 10}, {"$set": {"int_field": 21}},
                session=session2)

            # insert documents into coll 3
            client["simple_db"]["simple_coll_3"].insert_many(
                generate_simple_coll_docs(5), session=session2)

            # deletes do not matter in incremental replication, invalid scenario to test

            conn_id = connections.ensure_connection(self)

            # run in discovery mode
            check_job_name = runner.run_check_mode(self, conn_id)

            # verify check exit codes
            exit_status = menagerie.get_exit_status(conn_id, check_job_name)
            menagerie.verify_check_exit_status(self, exit_status,
                                               check_job_name)

            # verify the tap discovered the right streams
            found_catalogs = menagerie.get_catalogs(conn_id)

            # assert we find the correct streams, which includes all collections
            # touched by session1 and session2
            self.assertEqual(self.expected_check_streams_sync_1(),
                             {c['tap_stream_id']
                              for c in found_catalogs})

            # Select streams and add replication method metadata
            for stream_catalog in found_catalogs:
                annotated_schema = menagerie.get_annotated_schema(
                    conn_id, stream_catalog['stream_id'])
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-method': 'INCREMENTAL',
                        # Hyphenated key, consistent with the other tests in
                        # this file that configure a replication key.
                        'replication-key': 'int_field'
                    }
                }]
                connections.select_catalog_and_fields_via_metadata(
                    conn_id, stream_catalog, annotated_schema, additional_md)

            # run the first (incremental) sync
            sync_1 = runner.run_sync_mode(self, conn_id)

            # check exit status
            exit_status = menagerie.get_exit_status(conn_id, sync_1)
            menagerie.verify_sync_exit_status(self, exit_status, sync_1)

            # streams that we synced are the ones that we expect to see
            records_by_stream = runner.get_records_from_target_output()
            record_count_by_stream = runner.examine_target_output_file(
                self, conn_id, self.expected_sync_streams_1(),
                self.expected_pks_1())

            # validate the record counts: the updates on coll 1 / coll 2 and the
            # insert on coll 3 must not be read because the transaction is not
            # committed
            self.assertEqual(self.expected_row_counts_sync_1(),
                             record_count_by_stream)

            # validate the int_field values replicated by sync 1 (no duplicates)
            pk_dict_2 = {
                stream: [
                    message['data']['int_field']
                    for message in records_by_stream[stream]['messages']
                    if message.get('action') == 'upsert'
                ]
                for stream in self.expected_sync_streams_1()
            }
            self.assertEqual(self.expected_pk_values_2(), pk_dict_2)

            session2.commit_transaction()
            session2.end_session()

            ################
            # Session 3
            ################
            #   execute another sync
            #   validate that the documents committed as part of session 2 are
            #   now replicated in sync_2
            session3 = client.start_session()
            session3.start_transaction()

            # Run 2nd sync
            sync_2 = runner.run_sync_mode(self, conn_id)
            exit_status_2 = menagerie.get_exit_status(conn_id, sync_2)
            menagerie.verify_sync_exit_status(self, exit_status_2, sync_2)

            records_by_stream_2 = runner.get_records_from_target_output()
            record_count_by_stream_2 = runner.examine_target_output_file(
                self, conn_id, self.expected_sync_streams_2(),
                self.expected_pks_2())
            # validate that we see the updates to coll 1 and coll 2 and the insert to coll 3 in the 2nd sync
            # we see 2 records for coll 1 and coll 2: 1 record for the update and 1 for the bookmarked record
            self.assertEqual(self.expected_row_counts_sync_2(),
                             record_count_by_stream_2)

            # validate the int_field values replicated by sync 2 (no duplicates)
            pk_dict_3 = {
                stream: [
                    message['data']['int_field']
                    for message in records_by_stream_2[stream]['messages']
                    if message.get('action') == 'upsert'
                ]
                for stream in self.expected_sync_streams_1()
            }
            self.assertEqual(self.expected_pk_values_3(), pk_dict_3)

            # Test case to validate tap behaviour when we delete the bookmarked document and run sync
            state_2 = menagerie.get_state(conn_id)

            # Explicit lookup so an unexpected stream fails with a KeyError
            # instead of silently reusing the previous iteration's collection.
            collection_by_stream = {
                'simple_db-simple_coll_1': 'simple_coll_1',
                'simple_db-simple_coll_2': 'simple_coll_2',
                'simple_db-simple_coll_3': 'simple_coll_3',
            }
            for stream in self.expected_check_streams_sync_1():
                rep_key_value = state_2['bookmarks'][stream][
                    'replication_key_value']
                collection = collection_by_stream[stream]
                client["simple_db"][collection].delete_one(
                    {"int_field": int(rep_key_value)}, session=session3)

            session3.commit_transaction()
            session3.end_session()

            # Execute the sync after the commit on session 3.
            # The session 3 commit deletes the bookmarked value in each collection.
            # Validate that the sync does not replicate any documents and that
            # the state does not change.
            sync_3 = runner.run_sync_mode(self, conn_id)
            exit_status_3 = menagerie.get_exit_status(conn_id, sync_3)
            menagerie.verify_sync_exit_status(self, exit_status_3, sync_3)
            records_by_stream_3 = runner.get_records_from_target_output()
            record_count_by_stream_3 = runner.examine_target_output_file(
                self, conn_id, self.expected_sync_streams_2(),
                self.expected_pks_2())

            # validate that we see 0 records being replicated because we deleted the bookmark value on each of the collection
            self.assertEqual(self.expected_row_counts_sync_3(),
                             record_count_by_stream_3)

            # Read the state only AFTER sync 3 has run; reading it before the
            # sync would make the comparison with state_2 trivially true.
            state_3 = menagerie.get_state(conn_id)

            # validate that the state value has not changed after deleting the bookmarked value in each collection
            self.assertEqual(state_2, state_3)
Example #16
0
    def test_run(self):
        """Exercise LOG_BASED replication end to end.

        Runs discovery and checks the discovered streams, primary keys and
        row counts; selects every stream with the LOG_BASED method and runs an
        initial sync, checking the resulting bookmarks; then deletes, updates
        and inserts two documents per collection, runs a second sync and
        checks that the replicated records correspond to exactly those
        changed documents.
        """
        conn_id = connections.ensure_connection(self)

        # ---------------------------------------------------------------
        # Discovery
        # ---------------------------------------------------------------
        check_job_name = runner.run_check_mode(self, conn_id)

        # the check job must exit cleanly
        check_exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, check_exit_status, check_job_name)

        # the tap must have discovered exactly the expected streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        expected_stream_ids = self.expected_check_streams()
        discovered_stream_ids = {catalog['tap_stream_id'] for catalog in found_catalogs}
        self.assertEqual(expected_stream_ids, discovered_stream_ids)

        for tap_stream_id in self.expected_check_streams():
            matching = [
                catalog for catalog in found_catalogs
                if catalog['tap_stream_id'] == tap_stream_id
            ]
            found_stream = matching[0]

            # primary keys reported by discovery
            expected_pk = self.expected_pks()[found_stream['stream_name']]
            discovered_pk = set(
                found_stream.get('metadata', {}).get('table-key-properties'))
            self.assertEqual(expected_pk, discovered_pk)

            # row counts reported by discovery
            expected_rows = self.expected_row_counts()[found_stream['stream_name']]
            discovered_rows = found_stream.get('metadata', {}).get('row-count')
            self.assertEqual(expected_rows, discovered_rows)

        # ---------------------------------------------------------------
        # Initial full table
        # ---------------------------------------------------------------
        # Select every discovered stream and mark it as LOG_BASED.
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [
                {"breadcrumb": [], "metadata": {'replication-method': 'LOG_BASED'}}
            ]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        records_by_stream = runner.get_records_from_target_output()

        # the synced streams must be the expected ones
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # every stream must have replicated at least its full row count
        for tap_stream_id in self.expected_sync_streams():
            self.assertGreaterEqual(record_count_by_stream[tap_stream_id],
                                    self.expected_row_counts()[tap_stream_id])

        # bookmarks written by the initial sync
        state = menagerie.get_state(conn_id)
        first_versions = {}
        for tap_stream_id in self.expected_check_streams():
            # full table pass is flagged as complete
            self.assertTrue(
                state['bookmarks'][tap_stream_id]['initial_full_table_complete'])
            # a version bookmark exists
            stream_version = state['bookmarks'][tap_stream_id]['version']
            first_versions[tap_stream_id] = stream_version
            self.assertIsNotNone(first_versions[tap_stream_id])
            # both oplog position bookmarks exist
            for oplog_key in ('oplog_ts_time', 'oplog_ts_inc'):
                self.assertIsNotNone(state['bookmarks'][tap_stream_id][oplog_key])

        # ---------------------------------------------------------------
        # Change data: record the _id of every document touched
        # ---------------------------------------------------------------
        changed_ids = set()
        with get_test_connection() as client:
            # Delete two documents for each collection
            deletions = (
                ('simple_coll_1', 0),
                ('simple_coll_1', 1),
                ('simple_coll_2', 0),
                ('simple_coll_2', 1),
            )
            for coll_name, int_value in deletions:
                changed_ids.add(client['simple_db'][coll_name].find(
                    {'int_field': int_value})[0]['_id'])
                client["simple_db"][coll_name].delete_one({'int_field': int_value})

            # Update two documents for each collection
            updates = (
                ('simple_coll_1', 48),
                ('simple_coll_1', 49),
                ('simple_coll_2', 98),
                ('simple_coll_2', 99),
            )
            for coll_name, int_value in updates:
                changed_ids.add(client['simple_db'][coll_name].find(
                    {'int_field': int_value})[0]['_id'])
                client["simple_db"][coll_name].update_one(
                    {'int_field': int_value}, {'$set': {'int_field': -1}})

            # Insert two documents for each collection
            insertions = (
                ('simple_coll_1', 50),
                ('simple_coll_1', 51),
                ('simple_coll_2', 100),
                ('simple_coll_2', 101),
            )
            for coll_name, int_value in insertions:
                client["simple_db"][coll_name].insert_one({
                    "int_field": int_value,
                    "string_field": random_string_generator(),
                })
                changed_ids.add(client['simple_db'][coll_name].find(
                    {'int_field': int_value})[0]['_id'])

        # ---------------------------------------------------------------
        # Subsequent oplog sync
        # ---------------------------------------------------------------
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # keep only the upsert messages of each stream
        messages_by_stream = runner.get_records_from_target_output()
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            stream_messages = messages_by_stream[stream_name]['messages']
            records_by_stream[stream_name] = [
                message for message in stream_messages
                if message.get('action') == 'upsert'
            ]

        # the synced streams must be the expected ones
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # At least 6 records per stream because of the changes above
        # (could be more due to overlap in the gte oplog clause)
        for record_count in record_count_by_stream.values():
            self.assertGreaterEqual(record_count, 6)

        # Exactly 2 records per collection carry _sdc_deleted_at
        for coll_name in ('simple_coll_1', 'simple_coll_2'):
            deleted_rows = [
                record['data'] for record in records_by_stream[coll_name]
                if record['data'].get('_sdc_deleted_at')
            ]
            self.assertEqual(2, len(deleted_rows))

        # The _ids of the replicated records must equal the _ids of the
        # documents that were changed
        coll_1_ids = {
            ObjectId(record['data']['_id'])
            for record in records_by_stream['simple_coll_1']
        }
        coll_2_ids = {
            ObjectId(record['data']['_id'])
            for record in records_by_stream['simple_coll_2']
        }
        actual = coll_1_ids.union(coll_2_ids)
        self.assertEqual(changed_ids, actual)