Example #1
    def test_run(self):
        # Select our catalogs
        # found_catalogs = menagerie.get_catalogs(conn_id)
        # our_catalogs = [c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        # for c in our_catalogs:
        #     c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        #     c_metadata = metadata.to_map(c_annotated['metadata'])
        #     connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

        conn_id = self.create_connection()

        # Clear state before our run
        menagerie.set_state(conn_id, {})
        # Select our streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in self.expected_sync_streams()
        ]
        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=False)

        # Run a sync job using orchestrator
        state = menagerie.get_state(conn_id)
        record_count_by_stream = self.run_and_verify_sync(conn_id, state)

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream, {}).get('messages')
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk),
                                         msg="record is missing a value for primary key '{}': {}".format(pk, m))

        bookmarks = menagerie.get_state(conn_id)['bookmarks']

        replication_methods = self.expected_replication_method()

        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                replication_method = replication_methods.get(stream)
                if replication_method is self.INCREMENTAL:
                    self.assertTrue(stream in bookmarks)

                elif replication_method is self.FULL_TABLE:
                    self.assertTrue(stream not in bookmarks)

                else:
                    raise NotImplementedError(
                        "stream {} has an invalid replication method {}".
                        format(stream, replication_method))
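The expectation helpers referenced above (expected_sync_streams, expected_pks, expected_replication_method) come from the tap's base test class and are not shown here. A minimal sketch of the shapes the assertions rely on, using placeholder stream names rather than the tap's real ones:

# Hypothetical expectation helpers; stream names, keys, and methods are
# placeholders, not the tap's actual streams.
INCREMENTAL = "INCREMENTAL"
FULL_TABLE = "FULL_TABLE"

def expected_sync_streams():
    return {"users", "orders"}

def expected_pks():
    return {"users": {"id"}, "orders": {"id"}}

def expected_replication_method():
    return {"users": INCREMENTAL, "orders": FULL_TABLE}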
Example #2
    def test_run(self):
        """
        Verify that for each stream you can get multiple pages of data
        when no fields are selected and only the automatic fields are replicated.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """
        self.start_date = '2020-11-10T00:00:00Z'
        conn_id = self.create_connection(original_properties=False)

        # Select all parent streams and no fields within streams
        # Select all (testable) report streams and only the fields which are automatic and/or required by Bing to generate a report
        found_catalogs = menagerie.get_catalogs(conn_id)
        test_catalogs = [catalog for catalog in found_catalogs
                       if catalog.get('tap_stream_id') in self.expected_sync_streams()]

        # BUG_SRCE-4313 (https://stitchdata.atlassian.net/browse/SRCE-4313) streams missing automatic fields
        specific_fields = {**self.report_automatic_fields(), **self.parent_automatic_fields()} # COMMENT to reproduce
        # specific_fields = {**self.report_measure_fields(), **self.parent_automatic_fields()} #  UNCOMMENT to reproduce
        # specific_fields = self.report_measure_fields()  # TODO Use this line once bugs addressed.

        self.perform_and_verify_adjusted_selection(
            conn_id, test_catalogs, select_all_fields=False, specific_fields=specific_fields
        )

        # COMMENT EVERYTHING DOWN FROM HERE TO ADDRESS BUG_SRCE-4313

        # Run a sync job using orchestrator
        state = menagerie.get_state(conn_id)
        record_count_by_stream = self.run_and_verify_sync(conn_id, state)

        actual_fields_by_stream = runner.examine_target_output_for_fields()

        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):

                if stream == 'goals_and_funnels_report':  # SKIP TESTING FOR THIS STREAM
                    continue  # There is no data available, since we would need to implement a tracking script on singer's site

                # verify that you get some records for each stream
                self.assertGreater(
                    record_count_by_stream.get(stream, -1), 0,
                    msg="The number of records is not over the stream max limit")

                # verify that only the automatic fields are sent to the target for parent streams, and that
                # automatic fields, _sdc_report_datetime, AND specific measure fields are sent to target for report streams
                actual = actual_fields_by_stream.get(stream) or set()
                expected = self.expected_automatic_fields().get(stream, set())
                if stream.endswith('_report'):  # update expectations for report streams
                    expected_measure = 'Assists' if stream.startswith('goals') else 'Clicks'
                    expected.update({
                        '_sdc_report_datetime',  # tap applies sdc value as pk for all reports
                        expected_measure  # reports require a perf measure (which is intentionally not automatic)
                    })

                self.assertSetEqual(expected, actual)
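The specific_fields mapping above merges two expectation helpers with dict unpacking. A hedged sketch of the shapes involved (stream and field names are illustrative, not Bing Ads' real ones):

# Hypothetical field expectations; the report/parent stream names and field
# names are placeholders.
def report_automatic_fields():
    return {"ad_performance_report": {"TimePeriod", "AccountId"}}

def parent_automatic_fields():
    return {"accounts": {"Id"}, "campaigns": {"Id"}}

# Dict unpacking merges both mappings into one; on a key collision the
# right-hand mapping wins.
specific_fields = {**report_automatic_fields(), **parent_automatic_fields()}
print(specific_fields)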
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        #select all catalogs
        for catalog in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        future_time = "2050-01-01T00:00:00.000000Z"

        # build a state with every bookmark set in the future
        future_bookmarks = {"currently_syncing": None,
                            "bookmarks": {"contacts": {"offset": {},
                                                       "versionTimestamp": future_time},
                                          "subscription_changes": {"startTimestamp": future_time,
                                                                   "offset": {}},
                                          "campaigns": {"offset": {}},
                                          "forms": {"updatedAt": future_time},
                                          "deals": {"offset": {},
                                                    "hs_lastmodifieddate": future_time},
                                          "workflows": {"updatedAt": future_time},
                                          "owners": {"updatedAt": future_time},
                                          "contact_lists": {"updatedAt": future_time,
                                                            "offset": {}},
                                          "email_events": {"startTimestamp": future_time,
                                                           "offset": {}},
                                          "companies": {"offset": {},
                                                        "hs_lastmodifieddate": future_time},
                                          "engagements": {"lastUpdated": future_time,
                                                          "offset": {}}}}

        menagerie.set_state(conn_id, future_bookmarks)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        # because the bookmarks were set in the future, we should NOT actually replicate any data,
        # except for campaigns and deal_pipelines because those endpoints do NOT support bookmarks
        streams_with_bookmarks = self.expected_sync_streams()
        streams_with_bookmarks.remove('campaigns')
        streams_with_bookmarks.remove('deal_pipelines')
        bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
        self.assertEqual(len(bad_streams), 0, msg="still pulled down records from {} despite future bookmarks".format(bad_streams))


        state = menagerie.get_state(conn_id)

        # NB: Companies and engagements won't set a bookmark in the future.
        state["bookmarks"].pop("companies")
        state["bookmarks"].pop("engagements")
        future_bookmarks["bookmarks"].pop("companies")
        future_bookmarks["bookmarks"].pop("engagements")

        self.assertEqual(state, future_bookmarks, msg="state should not have been modified because we didn't replicate any data")
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())
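The future_bookmarks literal above could also be derived from a per-stream list of bookmark keys. A sketch under that assumption (note it gives every stream an empty offset, which the literal omits for a few streams such as forms and workflows):

# Sketch, not part of the original test: derive the future-bookmark state from
# per-stream bookmark keys. campaigns keeps only an offset, so its list is empty.
future_time = "2050-01-01T00:00:00.000000Z"

stream_bookmark_keys = {
    "contacts": ["versionTimestamp"],
    "subscription_changes": ["startTimestamp"],
    "campaigns": [],
    "forms": ["updatedAt"],
    "deals": ["hs_lastmodifieddate"],
    "workflows": ["updatedAt"],
    "owners": ["updatedAt"],
    "contact_lists": ["updatedAt"],
    "email_events": ["startTimestamp"],
    "companies": ["hs_lastmodifieddate"],
    "engagements": ["lastUpdated"],
}

future_bookmarks = {
    "currently_syncing": None,
    "bookmarks": {
        stream: dict({"offset": {}}, **{key: future_time for key in keys})
        for stream, keys in stream_bookmark_keys.items()
    },
}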
Example #4
    def first_sync_test(self, table_configs, conn_id):
        # run first full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        expected_pks = {}

        for config in table_configs:
            key = {config['HashKey']}
            if config.get('SortKey'):
                key |= {config.get('SortKey')}
            expected_pks[config['TableName']] = key

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, {x['TableName']
                            for x in table_configs}, expected_pks)

        state = menagerie.get_state(conn_id)
        state_version = menagerie.get_state_version(conn_id)

        first_versions = {}

        # assert that we get the correct number of records for each stream
        for config in table_configs:
            table_name = config['TableName']

            self.assertEqual(config['num_rows'],
                             record_count_by_stream[table_name])

            # assert that an activate_version_message is first and last message sent for each stream
            self.assertEqual(
                'activate_version',
                records_by_stream[table_name]['messages'][0]['action'])
            self.assertEqual(
                'activate_version',
                records_by_stream[table_name]['messages'][-1]['action'])

            # assert that the state has an initial_full_table_complete == True
            self.assertTrue(
                state['bookmarks'][table_name]['initial_full_table_complete'])
            # assert that there is a version bookmark in state
            first_versions[table_name] = state['bookmarks'][table_name][
                'version']
            self.assertIsNotNone(first_versions[table_name])

            # Write state with missing finished_shards so it
            # re-reads data from all shards
            # This should result in the next sync having same number of records
            # as the full table sync
            state['bookmarks'][table_name].pop('finished_shards')
            menagerie.set_state(conn_id, state, version=state_version)
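The table_configs entries this helper iterates over are assumed to carry the DynamoDB table name, its hash key, an optional sort key, and the number of rows seeded. A placeholder example of that shape:

# Hypothetical table_configs entries matching the keys the method above reads
# (TableName, HashKey, optional SortKey, num_rows); names and counts are
# placeholders, not the real test fixtures.
table_configs = [
    {"TableName": "simple_table_1", "HashKey": "int_id",
     "SortKey": "string_field", "num_rows": 100},
    # without a SortKey the expected primary key is the hash key alone
    {"TableName": "simple_table_2", "HashKey": "int_id", "num_rows": 50},
]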
Example #5
    def test_run(self):
        # Select our catalogs
        # found_catalogs = menagerie.get_catalogs(conn_id)
        # our_catalogs = [c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        # for c in our_catalogs:
        #     c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        #     c_metadata = metadata.to_map(c_annotated['metadata'])
        #     connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

        conn_id = self.create_connection()

        # Clear state before our run
        menagerie.set_state(conn_id, {})
        # Select our streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in self.expected_sync_streams()
        ]
        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=False)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream, {}).get('messages')
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk),
                                         msg="record is missing a value for primary key '{}': {}".format(pk, m))

        bookmarks = menagerie.get_state(conn_id)['bookmarks']

        for stream in self.expected_sync_streams():
            self.assertTrue(stream in bookmarks)
Example #6
    def test_run(self):

        # sync 1
        conn_id = connections.ensure_connection(self)

        found_catalogs_1 = self.run_and_verify_check_mode(conn_id)

        self.perform_and_verify_table_and_field_selection(conn_id, found_catalogs_1)

        record_count_by_stream_1 = self.run_and_verify_sync(conn_id)

        # checking if we got any data from sync 1
        self.assertGreater(sum(record_count_by_stream_1.values()), 0)

        for tap_stream_id in self.expected_first_sync_streams():
            self.assertEqual(self.expected_first_sync_row_counts()[tap_stream_id],
                             record_count_by_stream_1[tap_stream_id])

        # getting state
        state = menagerie.get_state(conn_id)

        # creating file "table_1_fileB"
        with self.get_test_connection() as client:
            root_dir = os.getenv('TAP_SFTP_ROOT_DIR')
            client.chdir(root_dir + '/tap_tester/folderA')

            file_group = self.get_files()[0]
            with client.open('table_1_fileB.csv', 'w') as f:
                writer = csv.writer(f)
                lines = [file_group['headers']] + file_group['generator'](file_group['num_rows'])
                writer.writerows(lines)

        # adding some data to file "table_1_fileA" and "table_3_fileA"
        self.append_to_files()

        # setting state
        menagerie.set_state(conn_id, state)

        # sync 2
        record_count_by_stream_2 = self.run_and_verify_sync(conn_id, second_sync=True)

        # checking if we got any data from sync 2
        self.assertGreater(sum(record_count_by_stream_2.values()), 0)

        # checking that the data in the 2nd sync is as expected:
        # since we restored the state from the first sync, we should receive only the
        # modified data, i.e. the rows added by appending to and creating files
        for tap_stream_id in self.expected_second_sync_streams():
            self.assertEqual(self.expected_second_sync_row_counts()[tap_stream_id],
                             record_count_by_stream_2[tap_stream_id])
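The file groups returned by get_files() are assumed to bundle CSV headers, a row generator, and a row count, matching how the new file is written above. A self-contained sketch of that shape (column names and rows are placeholders):

# Hypothetical file-group shape; headers and generator output are placeholders.
import csv
import io

file_group = {
    "headers": ["id", "string_col", "datetime_col"],
    "generator": lambda n: [[i, "row-{}".format(i), "2020-01-01T00:00:00Z"]
                            for i in range(n)],
    "num_rows": 3,
}

buf = io.StringIO()
writer = csv.writer(buf)
writer.writerows([file_group["headers"]] + file_group["generator"](file_group["num_rows"]))
print(buf.getvalue())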
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        #select all catalogs
        #selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
        #menagerie.post_annotated_catalogs(conn_id, selected_catalogs)

        for c in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, c,
                                                               menagerie.get_annotated_schema(conn_id, c['stream_id']))

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # bookmarks for the 4 streams should be 2015-03-16
        states = menagerie.get_state(conn_id)["bookmarks"]
        end_date = self.get_properties()["end_date"].split()[0]
        for k, v in states.items():
            if "insights" in k:
                bm_date = v.get("date_start")
                self.assertEqual(end_date, bm_date)
        print("bookmarks match end_date of {}".format(end_date))
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select all Catalogs
        for catalog in found_catalogs:
            if catalog['tap_stream_id'] in self.expected_sync_streams():
                connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        #clear state
        menagerie.set_state(conn_id, {})

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        max_bookmarks_from_records = runner.get_most_recent_records_from_target(self, self.expected_bookmarks(), self.get_properties()['start_date'])

        utc_now = datetime.datetime.utcnow()
        start_of_today = utils.strftime(
            datetime.datetime(utc_now.year, utc_now.month, utc_now.day,
                              0, 0, 0, 0, datetime.timezone.utc))
        max_bookmarks_from_records['subscription_changes'] = start_of_today
        max_bookmarks_from_records['email_events'] = start_of_today


        #if we didn't replicate data, the bookmark should be the start_date
        for k in self.expected_bookmarks().keys():
            if max_bookmarks_from_records.get(k) is None:
                max_bookmarks_from_records[k] = utils.strftime(datetime.datetime(2017, 5, 1, 0, 0, 0, 0, datetime.timezone.utc))

        state = menagerie.get_state(conn_id)
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())

        #verify bookmarks and offsets
        for k,v in sorted(list(self.expected_bookmarks().items())):
            for w in v:
                bk_value = bookmarks.get(k,{}).get(w)
                self.assertEqual(utils.strptime_with_tz(bk_value),
                                 utils.strptime_with_tz(max_bookmarks_from_records[k]),
                                 "Bookmark {} ({}) for stream {} should have been updated to {}".format(bk_value, w, k, max_bookmarks_from_records[k]))
                print("bookmark {}({}) updated to {} from max record value {}".format(k, w, bk_value, max_bookmarks_from_records[k]))

        for k,v in self.expected_offsets().items():
            self.assertEqual(bookmarks.get(k,{}).get('offset', {}), v, msg="unexpected offset found for stream {} {}. state: {}".format(k, v, state))
            print("offsets {} cleared".format(k))

        diff = bookmark_streams.difference(self.acceptable_bookmarks())
        self.assertEqual(len(diff), 0, msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(diff, self.acceptable_bookmarks(), bookmarks))

        self.assertEqual(state.get('currently_syncing'), None,"Unexpected `currently_syncing` bookmark value: {} Expected: None".format(state.get('currently_syncing')))
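The loops above assume expected_bookmarks() maps each stream to a list of bookmark keys and expected_offsets() maps each stream to the offset that should remain after a completed sync. A hedged sketch of those helpers (stream and key choices are placeholders):

# Hypothetical expectation helpers; streams and keys are illustrative only.
def expected_bookmarks():
    return {"forms": ["updatedAt"],
            "email_events": ["startTimestamp"]}

def expected_offsets():
    # a completed sync should leave every offset cleared
    return {"forms": {}, "email_events": {}}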
Example #9
    def test_run(self):

        conn_id = connections.ensure_connection(self, payload_hook=None)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(subset, msg="Expected check streams are not subset of discovered catalog")
        # Select some catalogs
        our_catalogs = [c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(conn_id, catalog, schema, [], [])

        # Verify that all streams sync at least one row for the initial sync.
        # This test also verifies access token expiration handling: if the test fails with an
        # authentication error, the refresh token was not replaced after expiring.
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(),
                                                                   self.expected_pks())
        zero_count_streams = {k for k, v in record_count_by_stream.items() if v == 0}
        self.assertFalse(zero_count_streams,
                         msg="The following streams did not sync any rows {}".format(zero_count_streams))

        # Verify that bookmark values are correct after incremental sync
        start_date = os.getenv(configuration['properties']['start_date'])
        bookmark_props = configuration['bookmark']
        current_state = menagerie.get_state(conn_id)
        test_bookmark = current_state['bookmarks'][bookmark_props['bookmark_key']]
        print(test_bookmark)
        self.assertTrue(test_bookmark['updated'] > start_date,
                        msg="The bookmark value does not match the expected result")
Example #10
    def test_run(self):
        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.check_all_streams_in_catalogs(found_catalogs)
        self.select_found_catalogs(found_catalogs)

        # clear state and run the actual sync
        menagerie.set_state(self.conn_id, {})
        runner.run_sync_job_and_check_status(self)
        self.check_output_record_counts()

        max_bookmarks_from_records = runner.get_max_bookmarks_from_target(self)
        state = menagerie.get_state(self.conn_id)
        bookmarks = state.get("bookmarks", {})
        self.check_bookmarks(bookmarks, max_bookmarks_from_records)
        self.check_offsets(bookmarks)
        self.look_for_unexpected_bookmarks(bookmarks)
        self.assertIsNone(state.get("currently_syncing"))
Example #11
    def test_future_date_in_state(self):
        conn_id = connections.ensure_connection(self)

        expected_streams = self.streams_to_select()

        future_date = datetime.datetime.strftime(
            datetime.datetime.today() + datetime.timedelta(days=1),
            "%Y-%m-%dT00:00:00Z")

        state = {'bookmarks': dict()}
        replication_keys = self.expected_replication_keys()
        for stream in expected_streams:
            if self.is_incremental(stream):
                state['bookmarks'][stream] = dict()
                state['bookmarks'][stream]['field'] = next(
                    iter(replication_keys[stream]))
                state['bookmarks'][stream]['last_record'] = future_date

        # set state for running sync mode
        menagerie.set_state(conn_id, state)

        runner.run_check_mode(self, conn_id)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.select_found_catalogs(conn_id,
                                   found_catalogs,
                                   only_streams=expected_streams)

        # run sync mode
        self.run_and_verify_sync(conn_id)

        # get the state after running sync mode
        latest_state = menagerie.get_state(conn_id)

        # verify that the state passed in before the sync
        # and the state we got after the sync are the same
        self.assertEqual(latest_state, state)
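The helpers driving the state construction above are assumed to look roughly like this (stream names and replication keys are placeholders, not the tap's real ones):

# Hypothetical selection/expectation helpers.
def streams_to_select():
    return {"invoices", "plans"}

def expected_replication_keys():
    return {"invoices": {"updated_at"}, "plans": {"updated_at"}}

def is_incremental(stream):
    return stream in expected_replication_keys()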
Example #12
    def test_run(self):

        # append some data to particular files to test the modified date
        self.append_to_files(
            ["table_1_file.csv", "table_3_file.csv", "table_4_file.csv"])

        # sync
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        self.perform_and_verify_table_and_field_selection(
            conn_id, found_catalogs)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        state = menagerie.get_state(conn_id)

        # checking if we got any data from sync
        self.assertGreater(sum(record_count_by_stream.values()), 0)

        # checking if data after sync is as expected
        for tap_stream_id in self.expected_first_sync_streams():
            self.assertEqual(self.expected_sync_row_counts()[tap_stream_id],
                             record_count_by_stream[tap_stream_id])

        # getting the maximum of the last modified dates from all files
        max_date = max(self.get_last_modified()).replace(microsecond=0)
        expected_date = max_date.timestamp()

        # getting bookmark
        actual_date = datetime.datetime.fromisoformat(
            state['bookmarks']['table']['modified_since']).timestamp()

        # checking if maximum last modified date is set as bookmark
        self.assertEqual(int(expected_date), int(actual_date))
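The final comparison reduces both sides to whole epoch seconds before asserting equality. A standalone illustration of that truncation, assuming an ISO-8601 bookmark string:

# Standalone illustration of the second-level comparison above.
import datetime

bookmark = "2021-03-01T10:15:30+00:00"
actual = int(datetime.datetime.fromisoformat(bookmark).timestamp())

last_modified = datetime.datetime(2021, 3, 1, 10, 15, 30, 999999,
                                  tzinfo=datetime.timezone.utc)
expected = int(last_modified.replace(microsecond=0).timestamp())

assert actual == expected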
Example #13
    def test_run(self):
        expected_streams = self.expected_streams()

        expected_replication_keys = self.expected_replication_keys()
        expected_replication_methods = self.expected_replication_method()

        ##########################################################################
        ### First Sync
        ##########################################################################
        self.start_date_1 = self.get_properties().get("start_date")
        self.start_date_2 = self.timedelta_formatted(self.start_date_1, days=3)

        self.start_date = self.start_date_1
        conn_id = connections.ensure_connection(self,
                                                original_properties=False)

        # Run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select only the expected streams tables
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]
        self.perform_and_verify_table_and_field_selection(
            conn_id, catalog_entries, select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_and_verify_sync(conn_id)
        first_sync_records = runner.get_records_from_target_output()
        first_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        ### Update State Between Syncs
        ##########################################################################

        new_states = {'bookmarks': dict()}
        simulated_states = self.calculated_states_by_stream(
            first_sync_bookmarks)
        for stream, new_state in simulated_states.items():
            new_states['bookmarks'][stream] = new_state
        menagerie.set_state(conn_id, new_states)

        for stream in simulated_states.keys():
            for state_key, state_value in simulated_states[stream].items():
                if stream not in new_states['bookmarks']:
                    new_states['bookmarks'][stream] = {}
                if state_key not in new_states['bookmarks'][stream]:
                    new_states['bookmarks'][stream][state_key] = state_value

        ##########################################################################
        ### Second Sync
        ##########################################################################
        self.start_date = self.start_date_2

        # run check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        test_catalogs_2_all_fields = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in expected_streams
        ]
        self.perform_and_verify_table_and_field_selection(
            conn_id, test_catalogs_2_all_fields, select_all_fields=True)

        second_sync_record_count = self.run_and_verify_sync(conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        ### Test By Stream
        ##########################################################################

        for stream in expected_streams:
            with self.subTest(stream=stream):
                expected_replication_method = expected_replication_methods[
                    stream]
                first_bookmark_key_value = first_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                second_bookmark_key_value = second_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)

                # expected values
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)

                # collect information for assertions from syncs 1 & 2 based on expected values
                first_sync_messages = [
                    record.get('data') for record in first_sync_records.get(
                        stream).get('messages')
                    if record.get('action') == 'upsert'
                ]
                second_sync_messages = [
                    record.get('data') for record in second_sync_records.get(
                        stream).get('messages')
                    if record.get('action') == 'upsert'
                ]

                if expected_replication_method == self.INCREMENTAL:

                    replication_key = next(
                        iter(expected_replication_keys[stream]))

                    if stream != 'forms':
                        for form_key in self.get_forms():
                            first_bookmark_value = first_bookmark_key_value.get(
                                form_key, {}).get(replication_key)
                            second_bookmark_value = second_bookmark_key_value.get(
                                form_key, {}).get(replication_key)
                            first_bookmark_value_utc = self.convert_state_to_utc(
                                first_bookmark_value)
                            second_bookmark_value_utc = self.convert_state_to_utc(
                                second_bookmark_value)
                            simulated_bookmark_value = new_states['bookmarks'][
                                stream][form_key]
                            simulated_bookmark_minus_lookback = simulated_bookmark_value

                            # Verify the first sync sets a bookmark of the expected form
                            self.assertIsNotNone(first_bookmark_key_value)

                            # Verify the second sync sets a bookmark of the expected form
                            self.assertIsNotNone(second_bookmark_key_value)

                            # Verify the second sync bookmark is Greater or Equal to the first sync bookmark
                            self.assertGreaterEqual(
                                second_bookmark_value, first_bookmark_value
                            )  # new responses could be picked up for the form in the second sync

                            for record in second_sync_messages:

                                # Verify the second sync records respect the previous (simulated) bookmark value
                                replication_key_value = record.get(
                                    replication_key)
                                self.assertGreaterEqual(
                                    replication_key_value,
                                    simulated_bookmark_minus_lookback,
                                    msg=
                                    "Second sync records do not repect the previous bookmark."
                                )

                                # Verify the second sync bookmark value is the max replication key value for a given stream
                                self.assertLessEqual(
                                    replication_key_value,
                                    second_bookmark_value_utc,
                                    msg=
                                    "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                                )

                            for record in first_sync_messages:
                                # Verify the first sync bookmark value is the max replication key value for a given stream
                                replication_key_value = record.get(
                                    replication_key)
                                self.assertLessEqual(
                                    replication_key_value,
                                    first_bookmark_value_utc,
                                    msg=
                                    "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                                )

                            # Verify the number of records in the 2nd sync is less than the first
                            self.assertLess(second_sync_count,
                                            first_sync_count)

                    else:
                        # collect information specific to incremental streams from syncs 1 & 2
                        first_bookmark_value = first_bookmark_key_value.get(
                            replication_key)
                        second_bookmark_value = second_bookmark_key_value.get(
                            replication_key)
                        first_bookmark_value_utc = self.convert_state_to_utc(
                            first_bookmark_value)
                        second_bookmark_value_utc = self.convert_state_to_utc(
                            second_bookmark_value)
                        simulated_bookmark_value = new_states['bookmarks'][
                            stream][replication_key]
                        simulated_bookmark_minus_lookback = simulated_bookmark_value

                    # Verify the first sync sets a bookmark of the expected form
                    self.assertIsNotNone(first_bookmark_key_value)

                    # Verify the second sync sets a bookmark of the expected form
                    self.assertIsNotNone(second_bookmark_key_value)

                    # Verify the second sync bookmark is Greater or Equal to the first sync bookmark
                    self.assertGreaterEqual(
                        second_bookmark_value, first_bookmark_value
                    )  # new responses could be picked up for the form in the second sync

                    for record in second_sync_messages:

                        # Verify the second sync records respect the previous (simulated) bookmark value
                        replication_key_value = record.get(replication_key)
                        self.assertGreaterEqual(
                            replication_key_value,
                            simulated_bookmark_minus_lookback,
                            msg=
                            "Second sync records do not repect the previous bookmark."
                        )

                        # Verify the second sync bookmark value is the max replication key value for a given stream
                        self.assertLessEqual(
                            replication_key_value,
                            second_bookmark_value_utc,
                            msg=
                            "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    for record in first_sync_messages:

                        # Verify the first sync bookmark value is the max replication key value for a given stream
                        replication_key_value = record.get(replication_key)
                        self.assertLessEqual(
                            replication_key_value,
                            first_bookmark_value_utc,
                            msg=
                            "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    # Verify the number of records in the 2nd sync is less than the first
                    self.assertLess(second_sync_count, first_sync_count)

                elif expected_replication_method == self.FULL_TABLE:

                    # Verify the syncs do not set a bookmark for full table streams
                    self.assertIsNone(first_bookmark_key_value)
                    self.assertIsNone(second_bookmark_key_value)

                    # Verify the number of records in the second sync is the same as the first
                    self.assertEqual(second_sync_count, first_sync_count)

                else:

                    raise NotImplementedError(
                        "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}"
                        .format(stream, expected_replication_method))

                # Verify at least 1 record was replicated in the second sync
                self.assertGreater(
                    second_sync_count,
                    0,
                    msg="We are not fully testing bookmarking for {}".format(
                        stream))
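calculated_states_by_stream() is assumed to push each first-sync bookmark back in time so the second sync has records to pick up. A sketch under that assumption, handling only flat {stream: {key: timestamp}} bookmarks (the per-form nesting used by some streams above would need one more level); the timestamp format is also assumed:

# Hypothetical state-shifting helper, not the test's actual implementation.
import datetime

def calculated_states_by_stream(first_sync_bookmarks, days=2):
    simulated = {}
    for stream, bookmark in first_sync_bookmarks.get("bookmarks", {}).items():
        simulated[stream] = {}
        for key, value in bookmark.items():
            parsed = datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            shifted = parsed - datetime.timedelta(days=days)
            simulated[stream][key] = shifted.strftime("%Y-%m-%dT%H:%M:%SZ")
    return simulated

print(calculated_states_by_stream(
    {"bookmarks": {"responses": {"submitted_at": "2021-06-10T00:00:00Z"}}}))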
Example #14
    def binlog_json_test(self):
        print("RUNNING {}\n\n".format(self.name()))

        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        expected_check_streams = {self.tap_stream_id()}
        expected_sync_streams = {self.table_name()}
        expected_pks = {self.table_name(): {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual(self.table_name(), test_catalog['stream_name'])

        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog,
            menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {self.table_name(): 1})
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions,
                         ['activate_version', 'upsert', 'activate_version'])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab version, log_file and log_pos from state to check later
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        self.assertEqual([expected_rec_1], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version in state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {})

        # insert a new huge row
        data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)],
                    literal=True)
        rec = {'id': 2, 'our_json': json.dumps(data)}

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            self.insert_record(cur, rec)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased or the log_file
        # has rotated and the numeric suffix has increased
        if expected_log_file == bookmark['log_file']:
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                 expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]

            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        expected_rec_2 = copy.deepcopy(rec)

        # check for expected records
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions, ['upsert'])

        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]
        del upsert_records[0]['_sdc_deleted_at']

        expected_json = json.loads(expected_rec_2.get('our_json', {}))
        actual_json = json.loads(upsert_records[0].get('our_json', {}))

        self.assertTrue(len(actual_json.keys()) > 0)
        self.assertEqual(expected_json, actual_json)
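insert_record() is assumed to wrap a parameterized INSERT against the test table. A sketch matching how it is called above (a cursor plus a dict carrying id and our_json); the table name is a placeholder:

# Hypothetical insert helper using DB-API parameter binding (pymysql style).
def insert_record(cur, rec, table="binlog_json_test"):
    cur.execute(
        "INSERT INTO {} (id, our_json) VALUES (%s, %s)".format(table),
        (rec["id"], rec["our_json"]),
    )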
Example #15
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]

        self.assertEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        chicken_catalog = found_catalogs[0]

        self.assertEqual('chicken_view', chicken_catalog['stream_name'])
        print("discovered streams are correct")

        print('checking discovered metadata for ROOT-CHICKEN_VIEW')
        md = menagerie.get_annotated_schema(
            conn_id, chicken_catalog['stream_id'])['metadata']

        self.assertEqual(
            {
                (): {
                    'database-name': 'postgres',
                    'is-view': True,
                    'row-count': 0,
                    'schema-name': 'public',
                    'table-key-properties': []
                },
                ('properties', 'fk_id'): {
                    'inclusion': 'available',
                    'sql-datatype': 'bigint',
                    'selected-by-default': True
                },
                ('properties', 'name'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'age'): {
                    'inclusion': 'available',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                },
                ('properties', 'size'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'id'): {
                    'inclusion': 'available',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                }
            }, metadata.to_map(md))

        # 'id' selected as view-key-properties
        replication_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-key': None,
                "replication-method": "FULL_TABLE",
                'view-key-properties': ["id"]
            }
        }]

        connections.select_catalog_and_fields_via_metadata(
            conn_id, chicken_catalog,
            menagerie.get_annotated_schema(conn_id,
                                           chicken_catalog['stream_id']),
            replication_md)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(record_count_by_stream, {'chicken_view': 1})
        records_by_stream = runner.get_records_from_target_output()

        table_version = records_by_stream['chicken_view']['table_version']
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][0]['action'],
            'activate_version')
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][1]['action'],
            'upsert')
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][2]['action'],
            'activate_version')

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        actual_chicken_record = records_by_stream['chicken_view']['messages'][
            1]['data']

        expected_chicken_record = {
            'id': 1,
            'fk_id': 1,
            'name': 'fred',
            'age': 99,
            'size': 'big'
        }
        self.assertEqual(
            actual_chicken_record,
            expected_chicken_record,
            msg=
            "Expected `chicken_view` upsert record data to be {}, but target output was {}"
            .format(expected_chicken_record, actual_chicken_record))

        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)

        chicken_bookmark = state['bookmarks']['postgres-public-chicken_view']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        self.assertEqual(
            chicken_bookmark['version'],
            table_version,
            msg="expected bookmark for stream ROOT-CHICKEN to match version")
Example #16
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        for c in found_catalogs:
            catalog_props_to_check = ['stream_name', 'tap_stream_id']
            stream = c['stream_name']

            for prop in catalog_props_to_check:
                self.assertEqual(
                    c[prop],
                    expected_catalogs[stream][prop],
                    msg=
                    "unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`"
                    .format(prop, stream, expected_catalogs[stream][prop],
                            c[prop]))

        print("discovered streams are correct")

        print('checking discovered metadata for tap_tester_mysql_0-incremental')
        incremental_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental'
        ][0]
        md = menagerie.get_annotated_schema(
            conn_id, incremental_catalog['stream_id'])['metadata']

        incremental_stream_metadata = {
            'database-name': 'tap_tester_mysql_0',
            'row-count': 3,
            'is-view': False,
            'selected-by-default': False,
            'table-key-properties': ['c_pk']
        }

        self.assertEqual(
            sorted(md, key=lambda x: x['breadcrumb']),
            [{
                'breadcrumb': [],
                'metadata': incremental_stream_metadata
            }, {
                'breadcrumb': ['properties', 'c_dt'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'datetime'
                }
            }, {
                'breadcrumb': ['properties', 'c_pk'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'int(11)'
                }
            }, {
                'breadcrumb': ['properties', 'c_varchar'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'varchar(255)'
                }
            }, {
                'breadcrumb': ['properties', 'c_varchar_to_deselect'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'varchar(255)'
                }
            }])

        print('checking discovered metadata for tap_tester_mysql_1-view')
        view_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_1-view'
        ][0]
        view_catalog_key_properties_md = [{
            'breadcrumb': [],
            'metadata': {
                'view-key-properties': ['c_pk']
            }
        }]

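        # the view has no primary key for discovery to pick up, so `view-key-properties`
        # is supplied here as non-discoverable metadata before checking the annotated schema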
        connections.set_non_discoverable_metadata(
            conn_id, view_catalog,
            menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']),
            view_catalog_key_properties_md)
        md = menagerie.get_annotated_schema(
            conn_id, view_catalog['stream_id'])['metadata']

        view_stream_metadata = {
            'database-name': 'tap_tester_mysql_1',
            'is-view': True,
            'selected-by-default': False,
            'view-key-properties': ['c_pk']
        }

        self.assertEqual(sorted(md, key=lambda x: x['breadcrumb']),
                         [{
                             'breadcrumb': [],
                             'metadata': view_stream_metadata
                         }, {
                             'breadcrumb': ['properties', 'c_pk'],
                             'metadata': {
                                 'selected-by-default': True,
                                 'sql-datatype': 'int(11)'
                             }
                         }, {
                             'breadcrumb': ['properties', 'c_varchar'],
                             'metadata': {
                                 'selected-by-default': True,
                                 'sql-datatype': 'varchar(255)'
                             }
                         }])

        #No selected-by-default MD for c_year because it is an unsupported type
        various_types_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types'
        ][0]
        md = menagerie.get_annotated_schema(
            conn_id, various_types_catalog['stream_id'])['metadata']
        c_year_md = [
            x for x in md if x['breadcrumb'] == ['properties', 'c_year']
        ]
        self.assertEqual(c_year_md, [{
            'breadcrumb': ['properties', 'c_year'],
            'metadata': {
                'selected-by-default': False,
                'sql-datatype': 'year(4)'
            }
        }])

        # select every catalog except simple_example
        catalogs_to_select = [
            c for c in found_catalogs
            if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example'
        ]

        for a_catalog in catalogs_to_select:
            additional_md = []
            unselected_fields = []
            if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental':
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-key': 'c_dt',
                        'replication-method': 'INCREMENTAL'
                    }
                }]
                unselected_fields = ['c_varchar_to_deselect']
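                # leave c_varchar_to_deselect unselected so a later assertion can verify
                # that deselected fields are not emitted in replicated records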

            elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view':
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'view-key-properties': ['c_pk'],
                        'replication-method': 'FULL_TABLE'
                    }
                }]
            else:
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-method': 'FULL_TABLE'
                    }
                }]

            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, a_catalog,
                menagerie.get_annotated_schema(conn_id,
                                               a_catalog['stream_id']),
                additional_md, unselected_fields)
        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        expected_row_count = 8  # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1}
        self.assertEqual(
            replicated_row_count,
            expected_row_count,
            msg="failed to replicate correct number of rows: {}".format(
                record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        records_by_stream = runner.get_records_from_target_output()

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify that activate version messages were sent in the proper position
            self.assertEqual(
                recs['messages'][0]['action'],
                'activate_version',
                msg=
                "Expected first message sent for stream `{}` to have action `activate_version`"
                .format(stream))

            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # verify that the target output the proper numeric and date representations
        expected_various_types_records = [{
            'c_time': '1970-01-01T12:34:56.000000Z',
            'c_mediumint': 8388607,
            'c_smallint': 32767,
            'c_tinyint': 127,
            'c_date': '2017-09-13T00:00:00.000000Z',
            'c_bigint': 9223372036854775807,
            'c_decimal': -1,
            'c_int': 2147483647,
            'c_bit': True,
            'c_decimal_2': Decimal('123456789.0'),
            'c_pk': 1,
            'c_double': Decimal("1.234"),
            'c_float': Decimal("1.234"),
            'c_decimal_2_unsigned': Decimal("1.23"),
            'c_tinyint_1': True
        }, {
            'c_time': '1970-01-01T12:34:57.000000Z',
            'c_mediumint': -8388608,
            'c_smallint': -32768,
            'c_tinyint': -128,
            'c_date': '2017-09-14T00:00:00.000000Z',
            'c_bigint': -9223372036854775808,
            'c_decimal': 0,
            'c_int': -2147483648,
            'c_bit': False,
            'c_decimal_2': Decimal("123456790.0"),
            'c_pk': 2,
            'c_double': Decimal("2.234"),
            'c_float': Decimal("2.234"),
            'c_decimal_2_unsigned': Decimal("0.23"),
            'c_tinyint_1': False
        }, {
            'c_time': '1970-01-01T12:34:57.000000Z',
            'c_mediumint': -8388608,
            'c_smallint': -32768,
            'c_tinyint': -128,
            'c_date': '2017-09-14T00:00:00.000000Z',
            'c_bigint': -9223372036854775808,
            'c_decimal': 0,
            'c_int': -2147483648,
            'c_bit': None,
            'c_decimal_2': Decimal("123456790.0"),
            'c_pk': 3,
            'c_double': Decimal("2.234"),
            'c_float': Decimal("2.234"),
            'c_decimal_2_unsigned': Decimal("0.23"),
            'c_tinyint_1': None
        }]

        actual_various_types_records = [
            r['data']
            for r in records_by_stream['various_types']['messages'][1:4]
        ]

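        # messages[0] is the activate_version message (asserted above), so the three
        # upsert records are at indexes 1 through 3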
        self.assertEqual(
            actual_various_types_records,
            expected_various_types_records,
            msg=
            "Expected `various_types` upsert record data to be {}, but target output {}"
            .format(expected_various_types_records,
                    actual_various_types_records))

        # verify that deselected property was not output
        expected_incremental_record = {
            'c_pk': 1,
            'c_dt': '2017-01-01T00:00:00.000000Z',
            'c_varchar': 'a'
        }

        actual_incremental_record = records_by_stream['incremental'][
            'messages'][1]['data']

        self.assertEqual(
            actual_incremental_record,
            expected_incremental_record,
            msg=
            "Expected first `incremental` upsert record data to be {}, but target output {}"
            .format(expected_incremental_record, actual_incremental_record))

        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)
        bookmarks = state['bookmarks']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        for k, v in bookmarks.items():
            if k == 'tap_tester_mysql_0-incremental':
                self.assertIsNotNone(
                    v['version'],
                    msg="expected bookmark for stream `{}` to have a version set"
                    .format(k))
                self.assertEqual(
                    v['replication_key_value'],
                    '2017-01-01T00:00:02.000000Z',
                    msg=
                    "incorrect replication_key_value in bookmark for stream `{}`"
                    .format(k))
                self.assertEqual(
                    v['replication_key'],
                    'c_dt',
                    msg=
                    "incorrect replication_key specified in bookmark for stream `{}`"
                    .format(k))
            else:
                self.assertFalse(
                    'version' in v,
                    msg=
                    "expected bookmark for stream `{}` to not have a version key"
                    .format(k))
                self.assertTrue(
                    'initial_full_table_complete' in v,
                    msg=
                    "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                    .format(k))
        print("state and bookmarks are correct")

        incremental_table_initial_table_version = bookmarks[
            'tap_tester_mysql_0-incremental']['version']

        #----------------------------------------------------------------------
        # invoke the sync job again after some modifications
        #----------------------------------------------------------------------

        print("adding a column to an existing table in the source db")
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

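        # the ALTER adds a `favorite_number` column and the INSERT adds a 4th row
        # (c_dt = '2017-01-01 00:00:03') that the next incremental sync should pick up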
        with connection.cursor() as cursor:
            add_column_sql = '''
                ALTER TABLE tap_tester_mysql_0.incremental
                  ADD COLUMN favorite_number INTEGER;
                INSERT INTO tap_tester_mysql_0.incremental VALUES (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999);
            '''
            cursor.execute(add_column_sql)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        expected_row_count = 7  # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1}
        self.assertEqual(
            replicated_row_count,
            expected_row_count,
            msg="failed to replicate correct number of rows: {}".format(
                record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        records_by_stream = runner.get_records_from_target_output()

        expected_schema_of_new_column = {
            'maximum': 2147483647,
            'selected': True,
            'inclusion': 'available',
            'type': ['null', 'integer'],
            'minimum': -2147483648
        }
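        # note: this dict is redefined without the 'selected' key inside the
        # 'incremental' branch below before it is used in the schema assertion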

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify that activate version messages were sent in the proper position
            if stream == 'incremental':
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version',
                    msg=
                    "Expected first message sent for stream `{}` not to have action `activate_version`"
                    .format(stream))
                expected_schema_of_new_column = {
                    'maximum': 2147483647,
                    'inclusion': 'available',
                    'type': ['null', 'integer'],
                    'minimum': -2147483648
                }
                self.assertEqual(
                    records_by_stream[stream]['schema']['properties']
                    ['favorite_number'],
                    expected_schema_of_new_column,
                    msg=
                    "Expected newly-added column to be present in schema for stream `{}`, but it was not."
                    .format(stream))
            else:
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'upsert',
                    msg=
                    "Expected first message sent for stream `{}` to have action `upsert`"
                    .format(stream))
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version',
                    msg=
                    "Expected last message sent for stream `{}` to have action `activate_version`"
                    .format(stream))

        state = menagerie.get_state(conn_id)
        bookmarks = state['bookmarks']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        for k, v in bookmarks.items():
            if k == 'tap_tester_mysql_0-incremental':
                self.assertIsNotNone(
                    v['version'],
                    msg="expected bookmark for stream `{}` to have a version set"
                    .format(k))
                self.assertEqual(
                    v['replication_key_value'],
                    '2017-01-01T00:00:03.000000Z',
                    msg=
                    "incorrect replication_key_value in bookmark for stream `{}`"
                    .format(k))
                self.assertEqual(
                    v['replication_key'],
                    'c_dt',
                    msg=
                    "incorrect replication_key specified in bookmark for stream `{}`"
                    .format(k))
            else:
                self.assertFalse(
                    'version' in v,
                    msg=
                    "expected bookmark for stream `{}` to not have a version key"
                    .format(k))
                self.assertTrue(
                    'initial_full_table_complete' in v,
                    msg=
                    "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                    .format(k))

        print("state and bookmarks are correct")

        # verify incremental table_version didn't change
        incremental_table_new_table_version = bookmarks[
            'tap_tester_mysql_0-incremental']['version']

        self.assertEqual(
            incremental_table_initial_table_version,
            incremental_table_new_table_version,
            msg=
            "Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations."
        )
Example #17
0
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            2,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties

        test_catalog_cows = list(
            filter(
                lambda c: c['stream_name'] ==
                'postgres_logical_replication_test_cows', found_catalogs))[0]
        self.assertEqual('postgres_logical_replication_test_cows',
                         test_catalog_cows['stream_name'])

        test_catalog_chickens = list(
            filter(
                lambda c: c['stream_name'] ==
                'postgres_logical_replication_test_chickens',
                found_catalogs))[0]
        self.assertEqual('postgres_logical_replication_test_chickens',
                         test_catalog_chickens['stream_name'])
        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog_cows,
            menagerie.get_annotated_schema(conn_id,
                                           test_catalog_cows['stream_id']),
            additional_md)
        connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog_chickens,
            menagerie.get_annotated_schema(conn_id,
                                           test_catalog_chickens['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(
            record_count_by_stream, {
                'postgres_logical_replication_test_cows': 1,
                'postgres_logical_replication_test_chickens': 1
            })
        records_by_stream = runner.get_records_from_target_output()

        table_version_cows = records_by_stream[
            'postgres_logical_replication_test_cows']['table_version']
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_cows']
            ['messages'][0]['action'], 'activate_version')
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_cows']
            ['messages'][1]['action'], 'upsert')
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_cows']
            ['messages'][2]['action'], 'activate_version')

        table_version_chickens = records_by_stream[
            'postgres_logical_replication_test_chickens']['table_version']
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_chickens']
            ['messages'][0]['action'], 'activate_version')
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_chickens']
            ['messages'][1]['action'], 'upsert')
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_chickens']
            ['messages'][2]['action'], 'activate_version')

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")

        bookmark_cows = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_cows']
        self.assertIsNotNone(bookmark_cows['lsn'],
                             msg="expected bookmark for stream to have an lsn")
        lsn_cows_1 = bookmark_cows['lsn']
        self.assertEqual(bookmark_cows['version'],
                         table_version_cows,
                         msg="expected bookmark for stream to match version")

        bookmark_chickens = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_chickens']
        self.assertIsNotNone(bookmark_chickens['lsn'],
                             msg="expected bookmark for stream to have an lsn")
        lsn_chickens_1 = bookmark_chickens['lsn']
        self.assertEqual(bookmark_chickens['version'],
                         table_version_chickens,
                         msg="expected bookmark for stream to match version")

        #----------------------------------------------------------------------
        # invoke the sync job again after adding records
        #----------------------------------------------------------------------
        print("inserting 2 more cows and 2 more chickens")

        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                # insert another cow
                self.cows_rec_2 = {'cow_name': "betty cow", 'cow_age': 21}
                insert_record(cur, test_table_name_cows, self.cows_rec_2)
                # update that cow's expected values
                self.cows_rec_2['id'] = 2
                self.cows_rec_2['_sdc_deleted_at'] = None

                # insert another chicken
                self.chicken_rec_2 = {
                    'chicken_name': "burt chicken",
                    'chicken_age': 14
                }
                insert_record(cur, test_table_name_chickens,
                              self.chicken_rec_2)
                # update that chicken's expected values
                self.chicken_rec_2['id'] = 2
                self.chicken_rec_2['_sdc_deleted_at'] = None

                # and repeat...

                self.cows_rec_3 = {'cow_name': "cindy cow", 'cow_age': 10}
                insert_record(cur, test_table_name_cows, self.cows_rec_3)
                self.cows_rec_3['id'] = 3
                self.cows_rec_3['_sdc_deleted_at'] = None

                self.chicken_rec_3 = {
                    'chicken_name': "carl chicken",
                    'chicken_age': 4
                }
                insert_record(cur, test_table_name_chickens,
                              self.chicken_rec_3)
                self.chicken_rec_3['id'] = 3
                self.chicken_rec_3['_sdc_deleted_at'] = None

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
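        # with log-based replication only the newly inserted rows should be
        # replicated on this sync (2 cows and 2 chickens)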
        self.assertEqual(
            record_count_by_stream, {
                'postgres_logical_replication_test_cows': 2,
                'postgres_logical_replication_test_chickens': 2
            })
        records_by_stream = runner.get_records_from_target_output()
        chicken_messages = records_by_stream[
            "postgres_logical_replication_test_chickens"]['messages']
        cow_messages = records_by_stream[
            "postgres_logical_replication_test_cows"]['messages']

        self.assertDictEqual(self.cows_rec_2, cow_messages[0]['data'])
        self.assertDictEqual(self.chicken_rec_2, chicken_messages[0]['data'])
        self.assertDictEqual(self.cows_rec_3, cow_messages[1]['data'])
        self.assertDictEqual(self.chicken_rec_3, chicken_messages[1]['data'])

        print("inserted record is correct")

        state = menagerie.get_state(conn_id)
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        cows_bookmark = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_cows']
        self.assertIsNotNone(
            cows_bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
        )
        lsn_cows_2 = cows_bookmark['lsn']
        self.assertTrue(lsn_cows_2 >= lsn_cows_1)

        chickens_bookmark = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_chickens']
        self.assertIsNotNone(
            chickens_bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
        )
        lsn_chickens_2 = chickens_bookmark['lsn']
        self.assertTrue(lsn_chickens_2 >= lsn_chickens_1)

        #table_version does NOT change
        self.assertEqual(
            chickens_bookmark['version'],
            table_version_chickens,
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to match version"
        )

        #table_version does NOT change
        self.assertEqual(
            cows_bookmark['version'],
            table_version_cows,
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to match version"
        )
Example #18
0
    def bookmarks_test(self, testable_streams):
        """
        Verify for each stream that you can do a sync which records bookmarks.
        Verify that the bookmark is the max value sent to the target for the `date` PK field
        Verify that the 2nd sync respects the bookmark
        Verify that all data of the 2nd sync is >= the bookmark from the first sync
        Verify that the number of records in the 2nd sync is less than the first
        Verify the inclusivity of bookmarks

        PREREQUISITE
        For EACH stream that is incrementally replicated there are multiple rows of data with
            different values for the replication key
        """
        print("\n\nRUNNING {}\n\n".format(self.name()))

        # Ensure tested streams have existing records
        expected_records_first_sync = self.create_test_data(testable_streams, self.START_DATE, force_create_records=True)

        # Instantiate connection with default start
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Select all testable streams and no fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        streams_to_select = testable_streams
        our_catalogs = [catalog for catalog in found_catalogs if
                        catalog.get('tap_stream_id') in streams_to_select]
        self.select_all_streams_and_fields(conn_id, our_catalogs)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(streams_to_select, set(first_sync_record_count.keys()),
                         msg="Expect first_sync_record_count keys {} to equal testable streams {},"
                         " first_sync_record_count was {}".format(
                             first_sync_record_count.keys(),
                             streams_to_select,
                             first_sync_record_count))

        first_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a first sync
        first_sync_records = runner.get_records_from_target_output()

        # Add data before next sync via insert and update, and set expectations
        created_records = {x: [] for x in self.expected_streams()}
        updated_records = {x: [] for x in self.expected_streams()}
        expected_records_second_sync = {x: [] for x in self.expected_streams()}

        # We should expect any records with rep-keys equal to the bookmark from the first sync to be returned by the second
        if 'orders' in testable_streams:
            for order in first_sync_records['orders']['messages']:
                if order['data']['updated_at'] == first_sync_state.get('bookmarks', {}).get('orders', {}).get('updated_at'):
                    expected_records_second_sync['orders'].append(order['data'])

        streams_to_create_records = list(testable_streams)
        if 'payments' in testable_streams:
            streams_to_create_records.remove('payments')
            streams_to_create_records.append('payments')
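            # move 'payments' to the end so its record is created last; creating a
            # refund for the 'refunds' stream also produces a payment record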

        for stream in streams_to_create_records:

            new_records = []

            if stream == 'refunds':  # a CREATE for refunds is equivalent to an UPDATE for payments
                # a CREATE for refunds will result in a new payments object
                (new_refund, payment) = self.client.create_refund(start_date=self.START_DATE)
                new_records = new_refund

                created_records['payments'].append(payment)
                expected_records_second_sync['payments'].append(payment)
            else:
                # TEST_ISSUE_1 | get the time that the customer record was created
                if stream == 'customers':
                    customers_create_time = perf_counter()

                # Create
                new_records = self.client.create(stream, start_date=self.START_DATE)

            assert new_records, "Failed to create a {} record".format(stream)
            assert len(new_records) == 1, "Created too many {} records: {}".format(stream, len(new_records))
            expected_records_second_sync[stream] += new_records
            created_records[stream] += new_records

        for stream in testable_streams.difference(self.cannot_update_streams()):
            first_rec = None
            # Update all streams (but save payments for last)
            if stream == 'payments':
                continue

            if stream == 'orders':  # Use the first available order that is still 'OPEN'
                for message in first_sync_records.get(stream).get('messages'):
                    if message.get('data')['state'] not in ['COMPLETED', 'CANCELED']:
                        first_rec = message.get('data')
                        break

                if not first_rec:
                    raise RuntimeError("Unable to find any any orders with state other than COMPLETED")
            elif stream == 'roles':  # Use the first available role that has limited permissions (where is_owner = False)
                for message in first_sync_records.get(stream).get('messages'):
                    data = message.get('data')
                    if not data['is_owner'] and 'role' in data['name']:
                        first_rec = message.get('data')
                        break

                if not first_rec:
                    raise RuntimeError("Unable to find any any orders with state other than COMPLETED")
            else: # By default we want the last created record
                last_message = first_sync_records.get(stream).get('messages')[-1]
                if last_message.get('data') and not last_message.get('data').get('is_deleted'):
                    first_rec = last_message.get('data')
                else: # If last record happens to be deleted grab first available that wasn't
                    LOGGER.warning("The last created record for %s was deleted.", stream)
                    for message in first_sync_records.get(stream).get('messages'):
                        data = message.get('data')
                        if not data.get('is_deleted'):
                            first_rec = message.get('data')
                            break

                if not first_rec:
                    raise RuntimeError("Cannot find any {} records that were not deleted .".format(stream))

            if stream == 'inventories': # This is an append only stream, we will make multiple 'updates'
                first_rec_catalog_obj_id = first_rec.get('catalog_object_id')
                first_rec_location_id = first_rec.get('location_id')
                # IN_STOCK -> SOLD [quantity -1]
                updated_record = self.client.create_specific_inventory_adjustment(
                    first_rec_catalog_obj_id, first_rec_location_id,
                    from_state='IN_STOCK', to_state='SOLD', quantity='1.0')
                assert len(updated_record) == 1, "Failed to update the {} records as intended".format(stream)
                # UNLINKED_RETURN -> IN_STOCK [quantity +1]
                updated_record = self.client.create_specific_inventory_adjustment(
                    first_rec_catalog_obj_id, first_rec_location_id,
                    from_state='UNLINKED_RETURN', to_state='IN_STOCK', quantity='2.0')
                assert len(updated_record) == 1, "Failed to update the {} records as intended".format(stream)
                # NONE -> IN_STOCK [quantity +2]
                updated_record = self.client.create_specific_inventory_adjustment(
                    first_rec_catalog_obj_id, first_rec_location_id,
                    from_state='NONE', to_state='IN_STOCK', quantity='1.0')
                assert len(updated_record) == 1, "Failed to update the {} records as intended".format(stream)
                # IN_STOCK -> WASTE [quantity +1]
                updated_record = self.client.create_specific_inventory_adjustment(
                    first_rec_catalog_obj_id, first_rec_location_id,
                    from_state='IN_STOCK', to_state='WASTE', quantity='1.0')  # creates 2 records
                assert len(updated_record) == 2, "Failed to update the {} records as intended".format(stream)
            else:
                first_rec_id = first_rec.get('id')
                first_rec_version = first_rec.get('version')

                if stream == 'customers':  # TEST_ISSUE_1 get the time that the customer record was updated
                    customers_update_time = perf_counter()

                updated_record = self.client.update(stream, obj_id=first_rec_id, version=first_rec_version,
                                                    obj=first_rec, start_date=self.START_DATE)
                assert updated_record, "Failed to update a {} record".format(stream)

                assert len(updated_record) == 1, "Updated too many {} records".format(stream)

            expected_records_second_sync[stream] += updated_record

            updated_records[stream] += updated_record

        if 'payments' in testable_streams:
            # Update a Payment AFTER all other streams have been updated
            # Payments that have already been completed or cancelled can't be updated again, so find the first APPROVED payment
            first_rec = dict()
            for message in first_sync_records.get('payments').get('messages'):
                if message.get('data')['status'] == 'APPROVED':
                    first_rec = message.get('data')
                    break

            if not first_rec:
                raise RuntimeError("Unable to find any any payment with status APPROVED")
            first_rec_id = first_rec.get('id')
            first_rec_version = first_rec.get('version')

            updated_record = self.client.update('payments', first_rec_id, first_rec_version)
            assert updated_record, "Failed to update a {} record".format('payments')
            assert len(updated_record) == 1, "Updated too many {} records".format('payments')

            expected_records_second_sync['payments'] += updated_record
            updated_records['payments'] += updated_record

        # adjust expectations for full table streams to include the expected records from sync 1
        for stream in self.expected_full_table_streams():
            if stream == 'inventories':
                primary_keys = self.makeshift_primary_keys().get(stream)
            else:
                primary_keys = list(self.expected_primary_keys().get(stream))
            updated_pk_values = {tuple([record.get(pk) for pk in primary_keys]) for record in updated_records[stream]}
            for record in expected_records_first_sync.get(stream, []):
                record_pk_values = tuple([record.get(pk) for pk in primary_keys])
                if record_pk_values in updated_pk_values:
                    continue  # do not add the original of the updated record
                expected_records_second_sync[stream].append(record)

        # Adjust expectations for datetime format
        for record_desc, records in [("created", created_records), ("updated", updated_records),
                                     ("2nd sync expected records", expected_records_second_sync)]:
            print("Adjusting epxectations for {} records".format(record_desc))
            for stream, expected_records in records.items():
                print("\tadjusting for stream: {}".format(stream))
                self.modify_expected_records(expected_records)

        # ensure validity of expected_records_second_sync
        for stream in testable_streams:
            if stream in self.expected_incremental_streams():
                if stream in self.cannot_update_streams():
                    self.assertEqual(1, len(expected_records_second_sync.get(stream)),
                                     msg="Expectations are invalid for incremental stream {}".format(stream))
                elif stream == 'orders': # ORDERS are returned inclusive on the datetime queried
                    self.assertEqual(3, len(expected_records_second_sync.get(stream)),
                                     msg="Expectations are invalid for incremental stream {}".format(stream))
                else:  # Most streams will have 2 records from the Update and Insert
                    self.assertEqual(2, len(expected_records_second_sync.get(stream)),
                                     msg="Expectations are invalid for incremental stream {}".format(stream))
            if stream in self.expected_full_table_streams():
                if stream == 'inventories':
                    # Typically changes to an inventories object will replace an IN_STOCK record with two records
                    #    1 IN_STOCK  ->  1 IN_STOCK, 1 WASTE
                    # if a given combination of {'catalog_object_id', 'location_id', 'state'} already has a
                    # WASTE record then both records will be replaced
                    #    1 IN_STOCK, 1 WASTE  ->  1 IN_STOCK, 1 WASTE
                    self.assertLessEqual(
                        len(expected_records_second_sync.get(stream)),
                        len(expected_records_first_sync.get(stream)) + len(created_records[stream]) + 1,
                        msg="Expectations are invalid for full table stream {}".format(stream))
                    self.assertGreaterEqual(
                        len(expected_records_second_sync.get(stream)),
                        len(expected_records_first_sync.get(stream)) + len(created_records[stream]),
                        msg="Expectations are invalid for full table stream {}".format(stream))
                    continue
                self.assertEqual(len(expected_records_second_sync.get(stream)), len(expected_records_first_sync.get(stream)) + len(created_records[stream]),
                                 msg="Expectations are invalid for full table stream {}".format(stream))

        # Run a second sync job using orchestrator
        second_sync_time_start = perf_counter()  # TEST_ISSUE_1 get the time that the 2nd sync starts
        second_sync_record_count = self.run_sync(conn_id)
        second_sync_time_end = perf_counter()  # TEST_ISSUE_1 get the time that the 2nd sync ends

        # Get the set of records from a second sync
        second_sync_records = runner.get_records_from_target_output()

        second_sync_state = menagerie.get_state(conn_id)


        # BUG_1 | https://stitchdata.atlassian.net/browse/SRCE-4975
        PARENT_FIELD_MISSING_SUBFIELDS = {'payments': {'card_details'}}

        # BUG_2 | https://stitchdata.atlassian.net/browse/SRCE-5143
        MISSING_FROM_SCHEMA = {'payments': {'capabilities', 'version_token', 'approved_money'}}


        # Loop first_sync_records and compare against second_sync_records
        for stream in testable_streams:
            with self.subTest(stream=stream):

                second_sync_data = [record.get("data") for record
                                    in second_sync_records.get(stream, {}).get("messages", [])]
                stream_replication_keys = self.expected_replication_keys()
                stream_primary_keys = self.expected_primary_keys()

                # TESTING INCREMENTAL STREAMS
                if stream in self.expected_incremental_streams():

                    replication_keys = stream_replication_keys.get(stream)

                    # Verify both syncs write bookmarks for the same set of streams
                    self.assertEqual(set(first_sync_state.get('bookmarks', {}).keys()),
                                     set(second_sync_state.get('bookmarks', {}).keys()))

                    # verify that there is more than 1 record of data (otherwise the test setup is insufficient)
                    self.assertGreater(first_sync_record_count.get(stream, 0), 1,
                                       msg="Data isn't set up to be able to test full sync")

                    # verify that the 2nd sync returns fewer records than the 1st
                    self.assertGreater(
                        first_sync_record_count.get(stream, 0),
                        second_sync_record_count.get(stream, 0),
                        msg="first sync didn't have more records, bookmark usage not verified")

                    for replication_key in replication_keys:

                        # Verify second sync's bookmarks move past the first sync's
                        self.assertGreater(
                            second_sync_state.get('bookmarks', {stream: {}}).get(
                                stream, {replication_key: -1}).get(replication_key),
                            first_sync_state.get('bookmarks', {stream: {}}).get(
                                stream, {replication_key: -1}).get(replication_key)
                        )

                        # Verify that all data of the 2nd sync is >= the bookmark from the first sync
                        first_sync_bookmark = first_sync_state.get('bookmarks').get(stream).get(replication_key)
                        for record in second_sync_data:
                            date_value = record[replication_key]
                            self.assertGreaterEqual(date_value,
                                                    first_sync_bookmark,
                                                    msg="A 2nd sync record has a replication-key that is less than or equal to the 1st sync bookmark.")

                elif stream in self.expected_full_table_streams():

                    # TESTING FULL TABLE STREAMS

                    # Verify no bookmarks are present
                    first_state = first_sync_state.get('bookmarks', {}).get(stream)
                    self.assertEqual({}, first_state,
                                     msg="Unexpected state for {}\n".format(stream) + \
                                     "\tState: {}\n".format(first_sync_state) + \
                                     "\tBookmark: {}".format(first_state))
                    second_state = second_sync_state.get('bookmarks', {}).get(stream)
                    self.assertEqual({}, second_state,
                                     msg="Unexpected state for {}\n".format(stream) + \
                                     "\tState: {}\n".format(second_sync_state) + \
                                     "\tBookmark: {}".format(second_state))

                if stream == 'customers' and len(second_sync_data) == 0: # BUG https://stitchdata.atlassian.net/browse/SRCE-4639
                    # NOTE: Square sometimes lags on the customers stream, so we'll give them one more shot
                    #       before we say this stream fails in catching the create and update. This was tested
                    #       manually while syncing all streams and while syncing only the customers stream
                    #       and we were unable to produce a scenario in which a subsequent sync failed to pick
                    #       up the create and update after failing to catch them in the 2nd sync.

                    # TEST_ISSUE_1 | Log the time diffs for record created, updated, second sync ran
                    LOGGER.warning(
                        'Second sync missed %s records that were just created and updated.\n' +
                        'Time between record create and: \n\tsync start = %s\tsync end: %s\n' +
                        'Time between record update and: \n\tsync start = %s\tsync end: %s',
                        stream,
                        second_sync_time_start - customers_create_time, second_sync_time_end - customers_create_time,
                        second_sync_time_start - customers_update_time, second_sync_time_end - customers_update_time,
                    )

                    # TODO TIMING | get the time the third sync ran
                    # Run another sync since square can't keep up
                    third_sync_time_start = perf_counter()  # TEST_ISSUE_1 get the time that the 3rd sync starts
                    _ = self.run_sync(conn_id)
                    third_sync_time_end = perf_counter()  # TEST_ISSUE_1 get the time that the 3rd sync ends

                    # Get the set of records from a third sync and use them as the 2nd sync data
                    third_sync_records = runner.get_records_from_target_output()
                    second_sync_data = [record.get("data") for record
                                        in third_sync_records.get(stream, {}).get("messages", [])]
                else:  # TEST_ISSUE_1
                    third_sync_time_start = perf_counter()
                    third_sync_time_end = perf_counter()

                # TESTING APPLICABLE TO ALL STREAMS

                # Verify that the expected records are replicated in the 2nd sync
                # For incremental streams we should see at least 2 records (a new record and an updated record)
                # but we may see more as the bookmark is inclusive.
                # For full table streams we should see 1 more record than the first sync
                expected_records = expected_records_second_sync.get(stream)
                if stream == 'inventories':
                    primary_keys = self.makeshift_primary_keys().get(stream)
                else:
                    primary_keys = stream_primary_keys.get(stream)

                updated_pk_values = {tuple([record.get(pk) for pk in primary_keys]) for record in updated_records[stream]}

                if stream == 'customers' and len(second_sync_data) != len(expected_records): # TEST_ISSUE_1
                    # TEST_ISSUE_1 | Log the time diffs for record created, updated, third sync ran
                    LOGGER.warning(
                        'Third sync missed %s records that were just created and updated.\n' +
                        'Time between record create and: \n\tsync start = %s\tsync end: %s\n' +
                        'Time between record update and: \n\tsync start = %s\tsync end: %s',
                        stream,
                        third_sync_time_start - customers_create_time, third_sync_time_end - customers_create_time,
                        third_sync_time_start - customers_update_time, third_sync_time_end - customers_update_time,
                    )

                self.assertLessEqual(
                    len(expected_records), len(second_sync_data),
                    msg="Expected number of records are not less than or equal to actual for 2nd sync.\n" +
                    "Expected: {}\nActual: {}".format(len(expected_records), len(second_sync_data))
                )
                if (len(second_sync_data) - len(expected_records)) > 0:
                    LOGGER.warning('Second sync replicated %s records more than our create and update for %s',
                                   len(second_sync_data) - len(expected_records), stream)

                if not primary_keys:
                    raise NotImplementedError("PKs are needed for comparing records")

                # Verify that the inserted records are replicated by the 2nd sync and match our expectations
                for created_record in created_records.get(stream):
                    record_pk_values = tuple([created_record.get(pk) for pk in primary_keys])
                    sync_records = [sync_record for sync_record in second_sync_data
                                    if tuple([sync_record.get(pk) for pk in primary_keys]) == record_pk_values]
                    self.assertTrue(len(sync_records),
                                    msg="An inserted record is missing from our sync: \nRECORD: {}".format(created_record))
                    self.assertEqual(1, len(sync_records),
                                     msg="A duplicate record was found in the sync for {}\nRECORD: {}.".format(stream, sync_records))
                    sync_record = sync_records[0]
                    # Test Workaround Start ##############################
                    if stream == 'payments':

                        off_keys = MISSING_FROM_SCHEMA[stream] # BUG_2
                        self.assertParentKeysEqualWithOffKeys(
                            created_record, sync_record, off_keys
                        )
                        off_keys = PARENT_FIELD_MISSING_SUBFIELDS[stream] | MISSING_FROM_SCHEMA[stream] # BUG_1 | # BUG_2
                        self.assertDictEqualWithOffKeys(
                            created_record, sync_record, off_keys
                        )  # Test Workaround End ##############################

                    else:
                        self.assertRecordsEqual(stream, created_record, sync_record)

                # Verify that the updated records are replicated by the 2nd sync and match our expectations
                for updated_record in updated_records.get(stream):
                    if stream not in self.cannot_update_streams():
                        record_pk_values = tuple([updated_record.get(pk) for pk in primary_keys])
                        sync_records = [sync_record for sync_record in second_sync_data
                                        if tuple([sync_record.get(pk) for pk in primary_keys]) == record_pk_values]
                        if stream != 'modifier_lists':
                            self.assertTrue(len(sync_records),
                                            msg="An updated record is missing from our sync: \nRECORD: {}".format(updated_record))
                            self.assertEqual(1, len(sync_records),
                                             msg="A duplicate record was found in the sync for {}\nRECORDS: {}.".format(stream, sync_records))

                        sync_record = sync_records[0]

                        # Test Workaround Start ##############################
                        if stream == 'payments':

                            off_keys = MISSING_FROM_SCHEMA[stream] # BUG_2
                            self.assertParentKeysEqualWithOffKeys(
                                updated_record, sync_record, off_keys
                            )
                            off_keys = PARENT_FIELD_MISSING_SUBFIELDS[stream] | MISSING_FROM_SCHEMA[stream] # BUG_1 | # BUG_2
                            self.assertDictEqualWithOffKeys(
                                updated_record, sync_record, off_keys
                            )  # Test Workaround End ##############################

                        else:
                            self.assertRecordsEqual(stream, updated_record, sync_record)

    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id']
                          for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in found_catalogs
                if c['tap_stream_id'] == tap_stream_id
            ][0]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[found_stream['stream_name']],
                set(
                    found_stream.get('metadata',
                                     {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(
                self.expected_row_counts()[found_stream['stream_name']],
                found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Initial Full Table ---------
        #  -----------------------------------
        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'LOG_BASED'
                }
            }]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)
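            # NB: an empty breadcrumb applies this metadata at the stream (table)
            # level, so the whole collection is selected for LOG_BASED replication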

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that the full table was synced
        for tap_stream_id in self.expected_sync_streams():
            self.assertGreaterEqual(record_count_by_stream[tap_stream_id],
                                    self.expected_row_counts()[tap_stream_id])

        # Verify that we have an 'initial_full_table_complete' bookmark
        state = menagerie.get_state(conn_id)
        first_versions = {}

        for tap_stream_id in self.expected_check_streams():
            # assert that the state has an initial_full_table_complete == True
            self.assertTrue(state['bookmarks'][tap_stream_id]
                            ['initial_full_table_complete'])
            # assert that there is a version bookmark in state
            first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id][
                'version']
            self.assertIsNotNone(first_versions[tap_stream_id])
            # Verify that we have an oplog_ts_time and oplog_ts_inc bookmark
            self.assertIsNotNone(
                state['bookmarks'][tap_stream_id]['oplog_ts_time'])
            self.assertIsNotNone(
                state['bookmarks'][tap_stream_id]['oplog_ts_inc'])

        changed_ids = set()
        with get_test_connection() as client:
            # Delete two documents for each collection

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 0})[0]['_id'])
            client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0})

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 1})[0]['_id'])
            client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 0})[0]['_id'])
            client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 1})[0]['_id'])
            client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1})

            # Update two documents for each collection
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 48})[0]['_id'])
            client["simple_db"]["simple_coll_1"].update_one(
                {'int_field': 48}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 49})[0]['_id'])
            client["simple_db"]["simple_coll_1"].update_one(
                {'int_field': 49}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 98})[0]['_id'])
            client["simple_db"]["simple_coll_2"].update_one(
                {'int_field': 98}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 99})[0]['_id'])
            client["simple_db"]["simple_coll_2"].update_one(
                {'int_field': 99}, {'$set': {
                    'int_field': -1
                }})

            # Insert two documents for each collection
            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field":
                50,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 50})[0]['_id'])

            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field":
                51,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 51})[0]['_id'])

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field":
                100,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 100})[0]['_id'])

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field":
                101,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 101})[0]['_id'])

        #  -----------------------------------
        # ----------- Subsequent Oplog Sync ---------
        #  -----------------------------------

        # Run sync

        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [
                x for x in messages_by_stream[stream_name]['messages']
                if x.get('action') == 'upsert'
            ]

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that we got at least 6 records due to changes
        # (could be more due to overlap in gte oplog clause)
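        # (presumably the tap resumes from the bookmarked oplog_ts_time/oplog_ts_inc
        # with a $gte filter, so the entry at the bookmark boundary can be re-delivered)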
        for k, v in record_count_by_stream.items():
            self.assertGreaterEqual(v, 6)

        # Verify that we got 2 records with _SDC_DELETED_AT
        self.assertEqual(
            2,
            len([
                x['data'] for x in records_by_stream['simple_coll_1']
                if x['data'].get('_sdc_deleted_at')
            ]))
        self.assertEqual(
            2,
            len([
                x['data'] for x in records_by_stream['simple_coll_2']
                if x['data'].get('_sdc_deleted_at')
            ]))
        # Verify that the _id of the records sent are the same set as the
        # _ids of the documents changed
        actual = set([
            ObjectId(x['data']['_id'])
            for x in records_by_stream['simple_coll_1']
        ]).union(
            set([
                ObjectId(x['data']['_id'])
                for x in records_by_stream['simple_coll_2']
            ]))
        self.assertEqual(changed_ids, actual)
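
The inserts above call a random_string_generator helper that is not shown in this
excerpt. A minimal sketch of such a helper (an assumption, not necessarily the
original implementation):

import random
import string

def random_string_generator(size=6, chars=string.ascii_uppercase + string.digits):
    # build a short random value for the inserted documents' string_field
    return ''.join(random.choice(chars) for _ in range(size))
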
Example #20
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify discovery produced (at least) 1 expected catalog
        found_catalogs = [
            found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
            if found_catalog['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreaterEqual(len(found_catalogs), 1)

        # verify the tap discovered the expected streams
        found_catalog_names = {
            catalog['tap_stream_id']
            for catalog in found_catalogs
        }
        self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]
        self.assertEqual(test_table_name, test_catalog['stream_name'])
        print("discovered streams are correct")

        # perform table selection
        print('selecting {} and all fields within the table'.format(
            test_table_name))
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, test_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        _ = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog, schema_and_metadata, additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run sync job 1 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_1 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records was replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(5, len(messages))
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])
        self.assertEqual('activate_version', messages[4]['action'])

        # verify the persisted schema matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records match expectations
        self.assertDictEqual(self.expected_records[0], messages[1]['data'])
        self.assertDictEqual(self.expected_records[1], messages[2]['data'])
        self.assertDictEqual(self.expected_records[2], messages[3]['data'])

        print("records are correct")

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_1, bookmark['version'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN and get the same 3 records
        #----------------------------------------------------------------------

        # run sync job 2 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_2 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records was replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(4, len(messages))
        self.assertEqual('upsert', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('activate_version', messages[3]['action'])

        # verify the new table version increased on the second sync
        self.assertGreater(table_version_2, table_version_1)

        # verify the persisted schema still matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records still match expectations
        self.assertDictEqual(self.expected_records[0], messages[0]['data'])
        self.assertDictEqual(self.expected_records[1], messages[1]['data'])
        self.assertDictEqual(self.expected_records[2], messages[2]['data'])

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_2, bookmark['version'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN following various manipulations to the data
        #----------------------------------------------------------------------

        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

                # NB | We will perform the following actions prior to the next sync:
                #      [Action (EXPECTED RESULT)]

                #      Insert a record
                #      Insert a record to be updated prior to sync
                #      Insert a record to be deleted prior to sync (NOT REPLICATED)

                #      Update an existing record
                #      Update a newly inserted record

                #      Delete an existing record
                #      Delete a newly inserted record

                # inserting...
                # a new record
                nyc_tz = pytz.timezone('America/New_York')
                our_time_offset = "-04:00"
                our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(6, 6, 6)
                our_time_tz = our_time.isoformat() + our_time_offset
                our_date = datetime.date(1970, 7, 1)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 2",
                    'our_varchar_10':
                    "varchar_10",
                    'our_text':
                    "some text 2",
                    'our_integer':
                    44101,
                    'our_smallint':
                    2,
                    'our_bigint':
                    1000001,
                    'our_decimal':
                    decimal.Decimal('9876543210.02'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '1',
                    'our_json':
                    json.dumps({'nymn': 77}),
                    'our_jsonb':
                    json.dumps({'burgers': 'good++'}),
                    'our_uuid':
                    my_uuid,
                    'our_citext':
                    'cyclops 2',
                    'our_store':
                    'dances=>"floor",name=>"betty"',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    '$0.98789'
                })
                self.expected_records.append({
                    'id':
                    4,
                    'our_varchar':
                    "our_varchar 2",
                    'our_varchar_10':
                    "varchar_10",
                    'our_text':
                    "some text 2",
                    'our_integer':
                    44101,
                    'our_smallint':
                    2,
                    'our_bigint':
                    1000001,
                    'our_decimal':
                    decimal.Decimal('9876543210.02'),
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'OUR DATE':
                    '1970-07-01T00:00:00+00:00',
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    True,
                    'our_json':
                    '{"nymn": 77}',
                    'our_jsonb':
                    '{"burgers": "good++"}',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_citext':
                    self.inserted_records[-1]['our_citext'],
                    'our_store': {
                        "name": "betty",
                        "dances": "floor"
                    },
                    'our_cidr':
                    self.inserted_records[-1]['our_cidr'],
                    'our_inet':
                    self.inserted_records[-1]['our_inet'],
                    'our_mac':
                    self.inserted_records[-1]['our_mac'],
                    'our_money':
                    '$0.99',
                    'our_alignment_enum':
                    None,
                })
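                # NB: the money value '$0.98789' above is expected back as '$0.99'
                # because the Postgres money type rounds to the locale's fractional
                # precision (two digits by default)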
                # a new record which we will then update prior to sync
                our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    5,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                    'our_alignment_enum':
                    None,
                })
                # a new record to be deleted prior to sync
                our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    6,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                    'our_alignment_enum':
                    None,
                })

                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[3])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[4])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[5])

                # updating ...
                # an existing record
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 1
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[0]["our_money"] = "$0.00"

                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # a newly inserted record
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 5
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[4]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[4]["our_money"] = "$0.00"

                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # deleting
                # an existing record
                record_pk = 2
                db_utils.delete_record(cur, canon_table_name, record_pk)

                # a newly inserted record
                record_pk = 6
                db_utils.delete_record(cur, canon_table_name, record_pk)

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN after various manipulations
        #----------------------------------------------------------------------

        # run sync job 3 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_3 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records was replicated
        self.assertEqual(4, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(5, len(messages))
        self.assertEqual('upsert', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])
        self.assertEqual('activate_version', messages[4]['action'])

        # verify the new table version increased on the third sync
        self.assertGreater(table_version_3, table_version_2)

        # verify the persisted schema still matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # NB | This is a little tough to track mentally so here's a breakdown of
        #      the order of operations by expected records indexes:

        #      Prior to Sync 1
        #        insert 0, 1, 2

        #      Prior to Sync 2
        #        No db changes

        #      Prior to Sync 3
        #        insert 3, 4, 5
        #        update 0, 4
        #        delete 1, 5

        #      Resulting Synced Records: 2, 3, 0, 4

        # verify replicated records still match expectations
        self.assertDictEqual(self.expected_records[2],
                             messages[0]['data'])  # existing insert
        self.assertDictEqual(self.expected_records[3],
                             messages[1]['data'])  # new insert
        self.assertDictEqual(self.expected_records[0],
                             messages[2]['data'])  # existing update
        self.assertDictEqual(self.expected_records[4],
                             messages[3]['data'])  # new insert / update

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_3, bookmark['version'])
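
    # expected_ts and expected_ts_tz are referenced above but not shown in this
    # excerpt. A minimal sketch based on the expected record values (an assumption,
    # not the suite's actual implementation); pytz is imported elsewhere in this module:
    def expected_ts(self, our_ts):
        # naive timestamps are expected back as ISO-8601 strings pinned to UTC
        return our_ts.isoformat() + '+00:00'

    def expected_ts_tz(self, our_ts_tz):
        # tz-aware timestamps are expected back converted to UTC
        return our_ts_tz.astimezone(pytz.UTC).isoformat()
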
    def bookmarks_test(self, expected_streams):
        """A Parametrized Bookmarks Test"""
        expected_replication_keys = self.expected_replication_keys()
        expected_replication_methods = self.expected_replication_method()
        expected_insights_buffer = -1 * int(
            self.get_properties()['insights_buffer_days'])  # lookback window

        ##########################################################################
        ### First Sync
        ##########################################################################

        conn_id = connections.ensure_connection(self,
                                                original_properties=False)

        # Run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select only the expected streams tables
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]
        self.perform_and_verify_table_and_field_selection(
            conn_id, catalog_entries, select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_and_verify_sync(conn_id)
        first_sync_records = runner.get_records_from_target_output()
        first_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        ### Update State Between Syncs
        ##########################################################################

        new_states = {'bookmarks': dict()}
        simulated_states = self.calculated_states_by_stream(
            first_sync_bookmarks)
        for stream, new_state in simulated_states.items():
            new_states['bookmarks'][stream] = new_state
        menagerie.set_state(conn_id, new_states)

        ##########################################################################
        ### Second Sync
        ##########################################################################

        second_sync_record_count = self.run_and_verify_sync(conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        ### Test By Stream
        ##########################################################################

        for stream in expected_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_replication_method = expected_replication_methods[
                    stream]

                # collect information for assertions from syncs 1 & 2 based on expected values
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)
                first_sync_messages = [
                    record.get('data') for record in first_sync_records.get(
                        stream).get('messages')
                    if record.get('action') == 'upsert'
                ]
                second_sync_messages = [
                    record.get('data') for record in second_sync_records.get(
                        stream).get('messages')
                    if record.get('action') == 'upsert'
                ]
                first_bookmark_key_value = first_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                second_bookmark_key_value = second_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)

                if expected_replication_method == self.INCREMENTAL:

                    # collect information specific to incremental streams from syncs 1 & 2
                    replication_key = next(
                        iter(expected_replication_keys[stream]))
                    first_bookmark_value = first_bookmark_key_value.get(
                        replication_key)
                    second_bookmark_value = second_bookmark_key_value.get(
                        replication_key)
                    first_bookmark_value_utc = self.convert_state_to_utc(
                        first_bookmark_value)
                    second_bookmark_value_utc = self.convert_state_to_utc(
                        second_bookmark_value)
                    simulated_bookmark_value = new_states['bookmarks'][stream][
                        replication_key]
                    simulated_bookmark_minus_lookback = self.timedelta_formatted(
                        simulated_bookmark_value,
                        days=expected_insights_buffer) if self.is_insight(
                            stream) else simulated_bookmark_value
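                    # NB: insights streams use a lookback window (insights_buffer_days),
                    # so records as old as the simulated bookmark minus the buffer are
                    # still expected; other streams compare against the bookmark directly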

                    # Verify the first sync sets a bookmark of the expected form
                    self.assertIsNotNone(first_bookmark_key_value)
                    self.assertIsNotNone(
                        first_bookmark_key_value.get(replication_key))

                    # Verify the second sync sets a bookmark of the expected form
                    self.assertIsNotNone(second_bookmark_key_value)
                    self.assertIsNotNone(
                        second_bookmark_key_value.get(replication_key))

                    # Verify the second sync bookmark is Equal to the first sync bookmark
                    self.assertEqual(
                        second_bookmark_value, first_bookmark_value
                    )  # assumes no changes to data during test

                    for record in second_sync_messages:

                        # Verify the second sync records respect the previous (simulated) bookmark value
                        replication_key_value = record.get(replication_key)
                        if stream == 'ads_insights_age_and_gender':  # BUG | https://stitchdata.atlassian.net/browse/SRCE-4873
                            replication_key_value = datetime.datetime.strftime(
                                dateutil.parser.parse(replication_key_value),
                                self.BOOKMARK_COMPARISON_FORMAT)
                        self.assertGreaterEqual(
                            replication_key_value,
                            simulated_bookmark_minus_lookback,
                            msg=
                            "Second sync records do not repect the previous bookmark."
                        )

                        # Verify the second sync bookmark value is the max replication key value for a given stream
                        self.assertLessEqual(
                            replication_key_value,
                            second_bookmark_value_utc,
                            msg=
                            "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    for record in first_sync_messages:

                        # Verify the first sync bookmark value is the max replication key value for a given stream
                        replication_key_value = record.get(replication_key)
                        self.assertLessEqual(
                            replication_key_value,
                            first_bookmark_value_utc,
                            msg=
                            "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    # Verify the number of records in the 2nd sync is less than the first
                    self.assertLess(second_sync_count, first_sync_count)

                elif expected_replication_method == self.FULL_TABLE:

                    # Verify the syncs do not set a bookmark for full table streams
                    self.assertIsNone(first_bookmark_key_value)
                    self.assertIsNone(second_bookmark_key_value)

                    # Verify the number of records in the second sync is the same as the first
                    self.assertEqual(second_sync_count, first_sync_count)

                else:

                    raise NotImplementedError(
                        "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}"
                        .format(stream, expected_replication_method))

                # Verify at least 1 record was replicated in the second sync
                self.assertGreater(
                    second_sync_count,
                    0,
                    msg="We are not fully testing bookmarking for {}".format(
                        stream))
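
The lookback handling above uses a timedelta_formatted helper from the base test
class that is not shown here. A minimal sketch, assuming it shifts a date-time
string by a number of days and returns it in a comparable format:

import datetime

import dateutil.parser

def timedelta_formatted(value, days=0, date_format="%Y-%m-%dT%H:%M:%S%z"):
    # parse the bookmark string, shift it by `days`, and re-serialize it
    shifted = dateutil.parser.parse(value) + datetime.timedelta(days=days)
    return datetime.datetime.strftime(shifted, date_format)
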
Example #22
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual('postgres_logical_replication_test',
                         test_catalog['stream_name'])

        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        # don't select our_text_2
        _ = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog,
            menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
            additional_md, ['our_text_2'])
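        # the final list argument presumably names the fields to leave unselected
        # (here our_text_2), matching the comment above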

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(record_count_by_stream,
                         {'postgres_logical_replication_test': 4})
        records_by_stream = runner.get_records_from_target_output()

        table_version = records_by_stream['postgres_logical_replication_test'][
            'table_version']

        self.assertEqual(
            records_by_stream['postgres_logical_replication_test']['messages']
            [0]['action'], 'activate_version')

        self.assertEqual(
            records_by_stream['postgres_logical_replication_test']['messages']
            [1]['action'], 'upsert')

        self.assertEqual(
            records_by_stream['postgres_logical_replication_test']['messages']
            [2]['action'], 'upsert')

        self.assertEqual(
            records_by_stream['postgres_logical_replication_test']['messages']
            [3]['action'], 'upsert')

        self.assertEqual(
            records_by_stream['postgres_logical_replication_test']['messages']
            [4]['action'], 'upsert')

        self.assertEqual(
            records_by_stream['postgres_logical_replication_test']['messages']
            [5]['action'], 'activate_version')

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)

        bookmark = state['bookmarks'][
            'logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(bookmark['lsn'],
                             msg="expected bookmark for stream to have an lsn")
        lsn_1 = bookmark['lsn']

        self.assertEqual(bookmark['version'],
                         table_version,
                         msg="expected bookmark for stream to match version")

        #----------------------------------------------------------------------
        # invoke the sync job again after adding a record
        #----------------------------------------------------------------------
        print("inserting a record 5")

        with db_utils.get_test_connection(test_db) as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                #insert fixture data 3
                our_ts = datetime.datetime(1993, 3, 3, 3, 3, 3, 333333)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(3, 4, 5)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1933, 3, 3)
                my_uuid = str(uuid.uuid1())

                #STRINGS:
                #OUR TS: '1993-03-03 03:03:03.333333'
                #OUR TS TZ: '1993-03-03 08:03:03.333333+00'
                #'OUR TIME': '03:04:05'
                #'OUR TIME TZ': '03:04:05+00'
                self.rec_5 = {
                    'our_varchar': "our_varchar 5",  # str
                    'our_varchar_10': "varchar13",  # str
                    'our_text': "some text 3",  #str
                    'our_text_2': "NOT SELECTED",
                    'our_integer': 96000,  #int
                    'our_smallint': 3,  # int
                    'our_bigint': 3000000,  #int
                    'our_decimal': decimal.Decimal(
                        '1234567890.03'
                    ),  #1234567890.03 / our_decimal is a <class 'float'>
                    quote_ident('OUR TS', cur):
                    our_ts,  # str '1993-03-03 03:03:03.333333'
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,  #str '1993-03-03 08:03:03.333333+00'
                    quote_ident('OUR TIME', cur): our_time,  # str '03:04:05'
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,  # str '03:04:05+00'
                    quote_ident('OUR DATE', cur):
                    our_date,  #1933-03-03 / OUR DATE is a <class 'str'>
                    'our_double': 3.3,  #3.3 / our_double is a <class 'float'>
                    'our_real': 6.6,  #6.6 / our_real is a <class 'float'>
                    'our_boolean': True,  #boolean
                    'our_bit': '1',  #string
                    'our_json': json.dumps({'secret': 33}),  #string
                    'our_jsonb': json.dumps(['burgers make me hungry']),
                    'our_uuid': my_uuid,  #string
                    'our_store': 'jumps=>"high",name=>"betty"',  #string
                    'our_citext': 'maGICKal 3',
                    'our_cidr': '192.168.102.128/32',
                    'our_inet': '192.168.102.128/32',
                    'our_mac': '08:00:2b:01:02:05',
                    'our_money': '$412.1234'
                }

                insert_record(cur, test_table_name, self.rec_5)

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(record_count_by_stream,
                         {'postgres_logical_replication_test': 1})
        records_by_stream = runner.get_records_from_target_output()

        self.assertTrue(len(records_by_stream) > 0)

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertDictEqual(recs['schema'], expected_schemas[stream])

        self.assertEqual(
            1,
            len(records_by_stream['postgres_logical_replication_test']
                ['messages']))
        actual_record_2 = records_by_stream[
            'postgres_logical_replication_test']['messages'][0]['data']
        actual_sdc_lsn_2 = int(actual_record_2['_sdc_lsn'])
        del actual_record_2['_sdc_lsn']

        expected_inserted_record = {
            'our_text': 'some text 3',
            'our_real': decimal.Decimal('6.6'),
            '_sdc_deleted_at': None,
            'our_store': {
                'name': 'betty',
                'jumps': 'high'
            },
            'our_bigint': 3000000,
            'our_varchar': 'our_varchar 5',
            'our_double': decimal.Decimal('3.3'),
            'our_bit': True,
            'our_uuid': self.rec_5['our_uuid'],
            'OUR TS': '1993-03-03T03:03:03.333333+00:00',
            'OUR TS TZ': '1993-03-03T08:03:03.333333+00:00',
            'OUR TIME': '03:04:05',
            'OUR TIME TZ': '03:04:05-04:00',
            'OUR DATE': '1933-03-03T00:00:00+00:00',
            'our_decimal': decimal.Decimal('1234567890.03'),
            'id': 5,
            'our_varchar_10': 'varchar13',
            'our_json': '{"secret": 33}',
            'our_jsonb': self.rec_5['our_jsonb'],
            'our_smallint': 3,
            'our_integer': 96000,
            'our_boolean': True,
            'our_citext': 'maGICKal 3',
            'our_cidr': self.rec_5['our_cidr'],
            'our_inet': '192.168.102.128',
            'our_mac': self.rec_5['our_mac'],
            'our_alignment_enum': None,
            'our_money': '$412.12'
        }
        self.assertDictEqual(expected_inserted_record, actual_record_2)

        self.assertEqual(
            records_by_stream['postgres_logical_replication_test']['messages']
            [0]['action'], 'upsert')
        print("inserted record is correct")

        state = menagerie.get_state(conn_id)
        chicken_bookmark = state['bookmarks'][
            'logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(
            chicken_bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
        )
        lsn_2 = chicken_bookmark['lsn']

        self.assertTrue(lsn_2 >= lsn_1)

        #table_version does NOT change
        self.assertEqual(
            chicken_bookmark['version'],
            table_version,
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to match version"
        )

        #----------------------------------------------------------------------
        # invoke the sync job again after deleting a record
        #----------------------------------------------------------------------
        print("delete row from source db")
        with db_utils.get_test_connection(test_db) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("DELETE FROM {} WHERE id = 3".format(
                    canonicalized_table_name(test_schema_name, test_table_name,
                                             cur)))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # verify the inserted record's lsn is less than or equal to the bookmarked lsn
        self.assertGreaterEqual(lsn_2, actual_sdc_lsn_2)
        expected_record_count = 1 if actual_sdc_lsn_2 < lsn_2 else 2
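        # if the bookmarked LSN equals the inserted record's LSN, the next sync can
        # re-emit that record (the comparison is >=), hence 1 or 2 records here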
        self.assertEqual(
            record_count_by_stream,
            {'postgres_logical_replication_test': expected_record_count})

        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # if there are 2 records...
        if expected_record_count == 2:
            # the 1st message will be the previous insert
            insert_message = records_by_stream[
                'postgres_logical_replication_test']['messages'][0]['data']
            del insert_message['_sdc_lsn']

            self.assertDictEqual(insert_message, expected_inserted_record)

        #the 2nd message will be the delete
        delete_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][
                expected_record_count - 1]
        self.assertEqual(delete_message['action'], 'upsert')

        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 3)
        print("deleted record is correct")

        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(
            bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an lsn"
        )

        lsn_3 = bookmark['lsn']
        self.assertGreaterEqual(lsn_3, lsn_2)
        #----------------------------------------------------------------------
        # invoke the sync job again after deleting a record using the 'id IN (SELECT ...)' format
        #----------------------------------------------------------------------
        print("delete row from source db")
        with db_utils.get_test_connection(test_db) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(
                    "DELETE FROM {} WHERE id IN (SELECT id FROM {} WHERE id=2)"
                    .format(
                        canonicalized_table_name(test_schema_name,
                                                 test_table_name, cur),
                        canonicalized_table_name(test_schema_name,
                                                 test_table_name, cur)))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(record_count_by_stream,
                         {'postgres_logical_replication_test': 2})
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        #first record will be the previous delete
        delete_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][0]
        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 3)

        #the 2nd message will be the more recent delete
        delete_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][1]
        self.assertEqual(delete_message['action'], 'upsert')

        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 2)
        print("deleted record is correct")

        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(
            bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an lsn"
        )

        lsn_4 = bookmark['lsn']
        self.assertGreaterEqual(lsn_4, lsn_3)

        #table_version does NOT change
        self.assertEqual(
            bookmark['version'],
            table_version,
            msg=
            "expected bookmark for stream postgres_logical_replication_test to match version"
        )
        #----------------------------------------------------------------------
        # invoke the sync job again after deleting a record using the 'id IN (<id>, <id>)' format
        #----------------------------------------------------------------------
        print("delete row from source db")
        with db_utils.get_test_connection(test_db) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("DELETE FROM {} WHERE id IN (4, 5)".format(
                    canonicalized_table_name(test_schema_name, test_table_name,
                                             cur)))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(record_count_by_stream,
                         {'postgres_logical_replication_test': 3})
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        #first record will be the previous delete
        delete_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][0]
        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 2)

        #the 2nd message will be the more recent delete
        delete_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][1]
        self.assertEqual(delete_message['action'], 'upsert')

        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 4)
        print("deleted record is correct")

        #the 3rd message will be the more recent delete
        delete_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][2]
        self.assertEqual(delete_message['action'], 'upsert')

        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 5)
        print("deleted record is correct")

        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(
            bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an lsn"
        )

        lsn_5 = bookmark['lsn']
        self.assertGreaterEqual(lsn_5, lsn_4)

        #table_version does NOT change
        self.assertEqual(
            bookmark['version'],
            table_version,
            msg=
            "expected bookmark for stream postgres_logical_replication_test to match version"
        )

        #----------------------------------------------------------------------
        # invoke the sync job again after updating a record
        #----------------------------------------------------------------------
        print("updating row from source db")
        with db_utils.get_test_connection(test_db) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(
                    "UPDATE {} SET our_varchar = 'THIS HAS BEEN UPDATED', our_money = '$56.811', our_decimal = 'NaN', our_real = '+Infinity', our_double = 'NaN' WHERE id = 1"
                    .format(
                        canonicalized_table_name(test_schema_name,
                                                 test_table_name, cur)))

        sync_job_name = runner.run_sync_mode(self, conn_id)
        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        self.assertEqual(record_count_by_stream,
                         {'postgres_logical_replication_test': 3})
        records_by_stream = runner.get_records_from_target_output()
        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        self.assertEqual(
            len(records_by_stream['postgres_logical_replication_test']
                ['messages']), 3)
        #first record will be the previous first delete
        delete_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][0]
        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 4)

        #second record will be the previous second delete
        delete_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][1]
        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 5)

        #third record will be the new update
        updated_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][2]
        del updated_message['data']['_sdc_lsn']

        self.assertEqual(updated_message['action'], 'upsert')

        expected_updated_rec = {
            'our_varchar': 'THIS HAS BEEN UPDATED',
            'id': 1,
            'our_varchar_10': "varchar_10",
            'our_text': "some text",
            'our_integer': 44100,
            'our_smallint': 1,
            'our_bigint': 1000000,
            'our_decimal': None,
            'OUR TS': '1997-02-02T02:02:02.722184+00:00',
            'OUR TS TZ': '1997-02-02T07:02:02.722184+00:00',
            'OUR TIME': '12:11:10',
            'OUR TIME TZ': '12:11:10-04:00',
            'OUR DATE': '1998-03-04T00:00:00+00:00',
            'our_double': None,
            'our_real': None,
            'our_boolean': True,
            'our_bit': False,
            'our_json': '{"secret": 55}',
            'our_jsonb': self.rec_1['our_jsonb'],
            'our_uuid': self.rec_1['our_uuid'],
            '_sdc_deleted_at': None,
            'our_store': {
                'name': 'betty',
                'size': 'small'
            },
            'our_citext': 'maGICKal',
            'our_cidr': self.rec_1['our_cidr'],
            'our_inet': self.rec_1['our_inet'],
            'our_mac': self.rec_1['our_mac'],
            'our_alignment_enum': 'bad',
            'our_money': '$56.81'
        }
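        # Note: the UPDATE above set our_decimal, our_real and our_double to 'NaN'/'+Infinity',
        # which are expected to be replicated as None, and the money value is normalized to
        # two decimal places ('$56.811' -> '$56.81').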

        self.assertDictEqual(expected_updated_rec, updated_message['data'])
        print("updated record is correct")

        #check state again
        state = menagerie.get_state(conn_id)
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        chicken_bookmark = state['bookmarks'][
            'logical_1-public-postgres_logical_replication_test']
        self.assertIsNotNone(
            chicken_bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an lsn"
        )
        lsn_6 = chicken_bookmark['lsn']
        self.assertGreaterEqual(lsn_6, lsn_5)

        #table_version does NOT change
        self.assertEqual(
            chicken_bookmark['version'],
            table_version,
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to match version"
        )

        #----------------------------------------------------------------------
        # invoke the sync job one last time. should only get the PREVIOUS update
        #----------------------------------------------------------------------
        sync_job_name = runner.run_sync_mode(self, conn_id)
        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        # we will get the previous update record again
        self.assertEqual(record_count_by_stream,
                         {'postgres_logical_replication_test': 1})
        # TODO the next line is not grabbing the record from the latest sync, opening potential for false negatives
        update_message = records_by_stream[
            'postgres_logical_replication_test']['messages'][2]
        self.assertEqual(update_message['action'], 'upsert')

        self.assertEqual(
            set(update_message['data'].keys()),
            set(expected_updated_rec.keys()),
            msg="keys for expected_record_1 are wrong: {}".format(
                set(update_message['data'].keys()).symmetric_difference(
                    set(expected_updated_rec.keys()))))

        for k, v in update_message['data'].items():
            self.assertEqual(v,
                             expected_updated_rec[k],
                             msg="{} != {} for key {}".format(
                                 v, expected_updated_rec[k], k))

        #check state again
        state = menagerie.get_state(conn_id)
        chicken_bookmark = state['bookmarks'][
            'logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        self.assertIsNotNone(
            chicken_bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an lsn"
        )
        lsn_7 = chicken_bookmark['lsn']
        self.assertGreaterEqual(lsn_7, lsn_6)

        #table_version does NOT change
        self.assertEqual(
            chicken_bookmark['version'],
            table_version,
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to match version"
        )
Example #23
0
    def test_run(self):
        """
        Verify for each stream that you can do a sync which records bookmarks.
        Verify that the bookmark is the max value sent to the target for the `date` replication key field
        Verify that the 2nd sync respects the bookmark
        Verify that all data of the 2nd sync is >= the bookmark from the first sync
        Verify that the number of records in the 2nd sync is less than the first
        Verify inclusivity of bookmarks

        PREREQUISITE
        For EACH stream that is incrementally replicated there are multiple rows of data with
            different values for the replication key
        """
        untested_streams = self.child_streams().union({
            'transfers',
            'payout_transactions',  # BUG see create test
            'balance_transactions',  # join stream, can't be updated
            'disputes',
        })
        cannot_update_streams = {
            'invoice_line_items',  # updates not available via api
        }

        # Ensure tested streams have existing records
        expected_records_first_sync = {stream: [] for stream in self.streams_to_create}
        for _ in range(2): # create 3 records for each stream but only expect the 3rd
            for stream in self.streams_to_create:
                self.new_objects[stream].append(create_object(stream))
        for stream in self.streams_to_create:
            self.new_objects[stream].append(create_object(stream))
            expected_records_first_sync[stream].append({"id": self.new_objects[stream][-1]['id']})

        self.START_DATE = self.get_properties().get('start_date')

        # Instantiate connection with default start
        conn_id = connections.ensure_connection(self)

        # run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select all testable streams and all fields within streams
        streams_to_select = self.expected_incremental_streams().difference(untested_streams)
        our_catalogs = [catalog for catalog in found_catalogs
                        if catalog.get('tap_stream_id') in
                        streams_to_select]
        self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_start = self.local_to_utc(dt.utcnow())
        first_sync_record_count = self.run_and_verify_sync(conn_id)
        first_sync_end = self.local_to_utc(dt.utcnow())

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(
            streams_to_select, set(first_sync_record_count.keys()),
            msg="Expected only testable streams to be replicated: {}".format(first_sync_record_count)
        )

        first_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a first sync
        first_sync_records = runner.get_records_from_target_output()

        # Add data before next sync via insert and update, and set expectations
        created_records = {x: [] for x in self.expected_streams()}
        updated_records = {x: [] for x in self.expected_streams()}
        expected_records_second_sync = {x: [] for x in self.expected_streams()}


        # Update one record from each stream prior to 2nd sync
        first_sync_created, _ = self.split_records_into_created_and_updated(first_sync_records)
        for stream in self.streams_to_create.difference(cannot_update_streams):
            # There needs to be some test data for each stream, otherwise this will break
            record = expected_records_first_sync[stream][0]
            updated_record = update_object(stream, record["id"])
            updated_records[stream].append(updated_record)
            expected_records_second_sync[stream].append({"id": updated_record['id']})

        # Ensure different times between updates and inserts
        sleep(2)

        # Insert (create) one record for each stream prior to 2nd sync
        for stream in self.streams_to_create:
            created_record = create_object(stream)
            self.new_objects[stream].append(created_record)
            created_records[stream].append(created_record)
            expected_records_second_sync[stream].append({"id": created_record['id']})

        # ensure validity of expected_records_second_sync
        for stream in self.streams_to_create:
            if stream in self.expected_incremental_streams():
                if stream in cannot_update_streams:
                    # Some streams will have only 1 record from the Insert
                    self.assertEqual(1, len(expected_records_second_sync.get(stream)),
                                     msg="Expectations are invalid for incremental stream {}".format(stream)
                    )
                    continue
                # Most streams will have 2 records from the Update and Insert
                self.assertEqual(2, len(expected_records_second_sync.get(stream)),
                                 msg="Expectations are invalid for incremental stream {}".format(stream)
                )
            elif stream in self.expected_full_table_streams():
                self.assertEqual(
                    len(expected_records_second_sync.get(stream)),
                    len(expected_records_first_sync.get(stream)) + len(created_records[stream]),
                    msg="Expectations are invalid for full table stream {}".format(stream)
                )

            # created_records[stream] = self.records_data_type_conversions(created_records.get(stream))
            # updated_records[stream] = self.records_data_type_conversions(updated_records.get(stream))


        # Run a second sync job using orchestrator
        second_sync_start = self.local_to_utc(dt.utcnow())
        second_sync_record_count = self.run_and_verify_sync(conn_id)
        second_sync_end = self.local_to_utc(dt.utcnow())

        second_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a second sync
        second_sync_records = runner.get_records_from_target_output()
        second_sync_created, second_sync_updated = self.split_records_into_created_and_updated(second_sync_records)

        # Loop first_sync_records and compare against second_sync_records
        for stream in self.streams_to_create.difference(untested_streams):
            with self.subTest(stream=stream):

                second_sync_data = [record.get("data") for record
                                    in second_sync_records.get(stream, {}).get("messages", [])]
                stream_replication_keys = self.expected_replication_keys()
                stream_primary_keys = self.expected_primary_keys()

                # TESTING INCREMENTAL STREAMS
                if stream in self.expected_incremental_streams():

                    replication_keys = stream_replication_keys.get(stream)

                    # Verify both syncs write / keep the same bookmark keys
                    self.assertEqual(set(first_sync_state.get('bookmarks', {}).keys()),
                                     set(second_sync_state.get('bookmarks', {}).keys()))

                    # verify that there is more than 1 record of data - setup necessary
                    self.assertGreater(first_sync_record_count.get(stream, 0), 1,
                                       msg="Data isn't set up to be able to test full sync")

                    # verify that you get less data on the 2nd sync
                    self.assertGreater(
                        first_sync_record_count.get(stream, 0),
                        second_sync_record_count.get(stream, 0),
                        msg="first sync didn't have more records, bookmark usage not verified")

                    if stream in self.streams_to_create:
                        for replication_key in replication_keys:
                            updates_replication_key = "updates_created"
                            updates_stream = stream + "_events"

                            # Verify second sync's bookmarks move past the first sync's
                            self.assertGreater(
                                second_sync_state.get('bookmarks', {updates_stream: {}}).get(
                                    updates_stream, {updates_replication_key: -1}).get(updates_replication_key),
                                first_sync_state.get('bookmarks', {updates_stream: {}}).get(
                                    updates_stream, {updates_replication_key: -1}).get(updates_replication_key)
                            )

                            # Verify that all data of the 2nd sync is >= the bookmark from the first sync
                            first_sync_bookmark = dt.fromtimestamp(
                                first_sync_state.get('bookmarks').get(updates_stream).get(updates_replication_key)
                            )
                            for record in second_sync_data:
                                date_value = record["updated"]
                                self.assertGreaterEqual(date_value,
                                                        dt.strftime(first_sync_bookmark, self.COMPARISON_FORMAT),
                                                        msg="A 2nd sync record has a replication-key that is less than or equal to the 1st sync bookmark.")

                elif stream in self.expected_full_table_streams():
                    raise Exception("Expectations changed, but this test was not updated to reflect them.")

                # TESTING APPLICABLE TO ALL STREAMS

                # Verify that the expected records are replicated in the 2nd sync
                # For incremental streams we should see at least 2 records (a new record and an updated record)
                # but we may see more as the bookmark is inclusive and there are hidden creates/updates due to
                # dependencies between streams.
                # For full table streams we should see 1 more record than the first sync
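                # expected_records_second_sync holds minimal expectations keyed by stream,
                # e.g. {'customers': [{'id': '<updated id>'}, {'id': '<created id>'}]}
                # (stream name illustrative; only the 'id' values are asserted below).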
                expected_records = expected_records_second_sync.get(stream)
                primary_keys = stream_primary_keys.get(stream)

                updated_pk_values = {tuple([record.get(pk) for pk in primary_keys])
                                     for record in updated_records[stream]}
                self.assertLessEqual(
                    len(expected_records), len(second_sync_data),
                    msg="Expected number of records are not less than or equal to actual for 2nd sync.\n" +
                    "Expected: {}\nActual: {}".format(len(expected_records), len(second_sync_data))
                )
                if (len(second_sync_data) - len(expected_records)) > 0:
                    logging.warning('Second sync replicated %s more records than our create and update for %s',
                                    len(second_sync_data) - len(expected_records), stream)

                if not primary_keys:
                    raise NotImplementedError("PKs are needed for comparing records")

                # Verify that the inserted and updated records are replicated by the 2nd sync
                for expected_record in expected_records:
                    expected_pk_value = expected_record.get('id')
                    sync_pk_values = [sync_record.get('id')
                                      for sync_record in second_sync_data
                                      if sync_record.get('id') == expected_pk_value]
                    self.assertTrue(
                        len(sync_pk_values) > 0,
                        msg="A record is missing from our sync: \nSTREAM: {}\tPK: {}".format(stream, expected_pk_value)
                    )
                    self.assertIn(expected_pk_value, sync_pk_values)

                # Verify updated fields are replicated as expected
                for updated_record in updated_records[stream]:
                    expected_updated_key = 'metadata'
                    expected_updated_value_substring = 'bob'
                    updated_pk_value = updated_record.get('id')
                    sync_records_metadata = [sync_record.get('metadata')
                                             for sync_record in second_sync_data
                                             if sync_record.get('id') == updated_pk_value]
                    self.assertEqual(1, len(sync_records_metadata))
                    self.assertIn(expected_updated_value_substring,
                                  sync_records_metadata[0].get('test_value'))
Example #24
0
    def test_run(self):
        """
        Verify that for each stream you can do a sync which records bookmarks.
        That the bookmark is the maximum value sent to the target for the replication key.
        That a second sync respects the bookmark
            All data of the second sync is >= the bookmark from the first sync
            The number of records in the 2nd sync is less than the first (This assumes that
                new data added to the stream is done at a rate slow enough that you haven't
                doubled the amount of data from the start date to the first sync between
                the first sync and second sync run in this test)

        Verify that for full table stream, all data replicated in sync 1 is replicated again in sync 2.

        PREREQUISITE
        For EACH stream that is incrementally replicated there are multiple rows of data with
            different values for the replication key
        """

        expected_streams = self.expected_check_streams()

        expected_replication_keys = self.expected_replication_keys()
        expected_replication_methods = self.expected_replication_method()

        ##########################################################################
        # First Sync
        ##########################################################################
        conn_id = connections.ensure_connection(self)

        # Run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        catalog_entries = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id, catalog_entries)

        # Run a first sync job using orchestrator
        first_sync_record_count = self.run_and_verify_sync(conn_id)
        first_sync_records = runner.get_records_from_target_output()
        first_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        # Update State Between Syncs
        ##########################################################################

        new_states = {'bookmarks': dict()}
        simulated_states = self.calculated_states_by_stream(
            first_sync_bookmarks)
        for stream, new_state in simulated_states.items():
            new_states['bookmarks'][stream] = new_state
        menagerie.set_state(conn_id, new_states)

        ##########################################################################
        # Second Sync
        ##########################################################################

        second_sync_record_count = self.run_and_verify_sync(conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        # Test By Stream
        ##########################################################################

        for stream in expected_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_replication_method = expected_replication_methods[
                    stream]

                # collect information for assertions from syncs 1 & 2 based on expected values
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)
                first_sync_messages = [
                    record.get('data') for record in first_sync_records.get(
                        stream, {}).get('messages', [])
                    if record.get('action') == 'upsert'
                ]
                second_sync_messages = [
                    record.get('data') for record in second_sync_records.get(
                        stream, {}).get('messages', [])
                    if record.get('action') == 'upsert'
                ]
                first_bookmark_key_value = first_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                second_bookmark_key_value = second_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)

                if expected_replication_method == self.INCREMENTAL:

                    # collect information specific to incremental streams from syncs 1 & 2
                    replication_key = next(
                        iter(expected_replication_keys[stream]))
                    first_bookmark_value = first_bookmark_key_value.get(
                        replication_key)
                    second_bookmark_value = second_bookmark_key_value.get(
                        replication_key)
                    first_bookmark_value_utc = self.convert_state_to_utc(
                        first_bookmark_value)
                    second_bookmark_value_utc = self.convert_state_to_utc(
                        second_bookmark_value)

                    simulated_bookmark_value = self.convert_state_to_utc(
                        new_states['bookmarks'][stream][replication_key])

                    # Verify the first sync sets a bookmark of the expected form
                    self.assertIsNotNone(first_bookmark_key_value)
                    self.assertIsNotNone(first_bookmark_value)

                    # Verify the second sync sets a bookmark of the expected form
                    self.assertIsNotNone(second_bookmark_key_value)
                    self.assertIsNotNone(second_bookmark_value)

                    # Verify the second sync bookmark is Equal to the first sync bookmark
                    # assumes no changes to data during test
                    if not stream == "users":
                        self.assertEqual(second_bookmark_value,
                                         first_bookmark_value)
                    else:
                        # For the `users` stream the tap stores the bookmark as 1 minute less than the current
                        # time whenever the `updated_at` of the last record is older than that. So, if there is
                        # no data change, second_bookmark_value will be 1 minute less than the current time and
                        # therefore always greater than or equal to first_bookmark_value.
                        self.assertGreaterEqual(second_bookmark_value,
                                                first_bookmark_value)

                    for record in first_sync_messages:

                        # Verify the first sync bookmark value is the max replication key value for a given stream
                        replication_key_value = record.get(replication_key)
                        # The `tickets` stream stores its bookmark as an integer timestamp, so convert it to a string.
                        if stream == "tickets":
                            replication_key_value = datetime.utcfromtimestamp(
                                replication_key_value).strftime(
                                    '%Y-%m-%dT%H:%M:%SZ')

                        self.assertLessEqual(
                            replication_key_value,
                            first_bookmark_value_utc,
                            msg=
                            "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    for record in second_sync_messages:
                        # Verify the second sync replication key value is Greater or Equal to the first sync bookmark
                        replication_key_value = record.get(replication_key)

                        if stream == "tickets":
                            replication_key_value = datetime.utcfromtimestamp(
                                replication_key_value).strftime(
                                    '%Y-%m-%dT%H:%M:%SZ')

                        self.assertGreaterEqual(
                            replication_key_value,
                            simulated_bookmark_value,
                            msg=
                            "Second sync records do not repect the previous bookmark."
                        )

                        # Verify the second sync bookmark value is the max replication key value for a given stream
                        self.assertLessEqual(
                            replication_key_value,
                            second_bookmark_value_utc,
                            msg=
                            "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                elif expected_replication_method == self.FULL_TABLE:

                    # Verify the syncs do not set a bookmark for full table streams
                    self.assertIsNone(first_bookmark_key_value)
                    self.assertIsNone(second_bookmark_key_value)

                    # Verify the number of records in the second sync is the same as the first

                    # The streams below are child streams of the parent stream `tickets`, which is
                    # incremental. Child streams also behave like incremental streams but do not save
                    # their own state, so they are not guaranteed to have the same record count in the
                    # first and second syncs.
                    if stream not in [
                            "ticket_comments", "ticket_audits",
                            "ticket_metrics"
                    ]:
                        self.assertEqual(second_sync_count, first_sync_count)

                else:

                    raise NotImplementedError(
                        "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}"
                        .format(stream, expected_replication_method))

                # Verify at least 1 record was replicated in the second sync
                self.assertGreater(
                    second_sync_count,
                    0,
                    msg="We are not fully testing bookmarking for {}".format(
                        stream))
    def bookmarks_test(self, conn_id, testable_streams):

        # Select all streams and no fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        incremental_streams = {
            key
            for key, value in self.expected_replication_method().items()
            if value == self.INCREMENTAL and key in testable_streams
        }

        # Our test data sets for Shopify do not have any abandoned_checkouts
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in incremental_streams
        ]
        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=False)

        #################################
        # Run first sync
        #################################

        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(set(first_sync_record_count.keys()),
                         incremental_streams)

        first_sync_bookmark = menagerie.get_state(conn_id)
        first_sync_records = runner.get_records_from_target_output()
        # BUG:TDL-17087 : State has additional values which are not streams
        # Need to remove additional values from bookmark value
        extra_stuff = {
            'transaction_orders', 'metafield_products', 'refund_orders',
            'product_variants'
        }
        for key in list(first_sync_bookmark['bookmarks'].keys()):
            if key in extra_stuff:
                first_sync_bookmark['bookmarks'].pop(key)

        #######################
        # Update State between Syncs
        #######################

        new_state = {'bookmarks': dict()}
        #simulated_states = self.calculated_states_by_stream(first_sync_bookmark)

        # We are hardcoding the updated state to ensure that we get at least 1 record in the second
        # sync. These values were chosen after reviewing the max bookmark value for each of the streams.
        simulated_states = {
            'products': {
                'updated_at': '2021-12-20T05:10:05.000000Z'
            },
            'collects': {
                'updated_at': '2021-09-01T09:08:28.000000Z'
            },
            'abandoned_checkouts': {
                'updated_at': '2022-02-02T16:00:00.000000Z'
            },
            'inventory_levels': {
                'updated_at': '2021-12-20T05:09:34.000000Z'
            },
            'locations': {
                'updated_at': '2021-07-20T09:00:22.000000Z'
            },
            'events': {
                'created_at': '2021-12-20T05:09:01.000000Z'
            },
            'inventory_items': {
                'updated_at': '2021-09-15T19:44:11.000000Z'
            },
            'transactions': {
                'created_at': '2021-12-20T00:08:52-05:00'
            },
            'metafields': {
                'updated_at': '2021-09-07T21:18:05.000000Z'
            },
            'order_refunds': {
                'created_at': '2021-05-01T17:41:18.000000Z'
            },
            'customers': {
                'updated_at': '2021-12-20T05:08:17.000000Z'
            },
            'orders': {
                'updated_at': '2021-12-20T05:09:01.000000Z'
            },
            'custom_collections': {
                'updated_at': '2021-12-20T17:41:18.000000Z'
            }
        }

        for stream, updated_state in simulated_states.items():
            new_state['bookmarks'][stream] = updated_state
        menagerie.set_state(conn_id, new_state)

        ###############################
        # Run Second Sync
        ###############################

        second_sync_record_count = self.run_sync(conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_bookmark = menagerie.get_state(conn_id)

        for stream in testable_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_replication_method = self.expected_replication_method()
                expected_replication_keys = self.expected_replication_keys()
                # information required for assertions from sync 1 and 2 based on expected values
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)
                first_sync_messages = [
                    record.get('data') for record in first_sync_records.get(
                        stream, {}).get('messages', [])
                    if record.get('action') == 'upsert'
                ]
                second_sync_messages = [
                    record.get('data') for record in second_sync_records.get(
                        stream, {}).get('messages', [])
                    if record.get('action') == 'upsert'
                ]
                first_bookmark_value = first_sync_bookmark.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                first_bookmark_value = list(first_bookmark_value.values())[0]
                second_bookmark_value = second_sync_bookmark.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                second_bookmark_value = list(second_bookmark_value.values())[0]

                replication_key = next(iter(expected_replication_keys[stream]))
                first_bookmark_value_utc = self.convert_state_to_utc(
                    first_bookmark_value)
                second_bookmark_value_utc = self.convert_state_to_utc(
                    second_bookmark_value)
                simulated_bookmark = new_state['bookmarks'][stream]
                simulated_bookmark_value = list(simulated_bookmark.values())[0]

                # verify the syncs sets a bookmark of the expected form
                self.assertIsNotNone(first_bookmark_value)
                self.assertTrue(
                    self.is_expected_date_format(first_bookmark_value))
                self.assertIsNotNone(second_bookmark_value)
                self.assertTrue(
                    self.is_expected_date_format(second_bookmark_value))

                # verify the 2nd bookmark is equal to the 1st sync bookmark
                # NOT A BUG: this is expected behaviour for Shopify because the tap uses date windowing
                # (TDL-17096). The 2nd bookmark value gets assigned from the execution time rather than
                # the actual bookmark time, so this is an invalid assertion for Shopify.
                # self.assertEqual(first_bookmark_value, second_bookmark_value)

                for record in first_sync_messages:
                    replication_key_value = record.get(replication_key)
                    # verify 1st sync bookmark value is the max replication key value for a given stream
                    self.assertLessEqual(
                        replication_key_value,
                        first_bookmark_value_utc,
                        msg=
                        "First sync bookmark was set incorrectly, a record with a greater replication key value was synced"
                    )

                for record in second_sync_messages:
                    replication_key_value = record.get(replication_key)
                    # verify the 2nd sync replication key value is greater or equal to the 1st sync bookmarks
                    self.assertGreaterEqual(
                        replication_key_value,
                        simulated_bookmark_value,
                        msg=
                        "Second sync records do not respect the previous bookmark"
                    )
                    # verify the 2nd sync bookmark value is the max replication key value for a given stream
                    self.assertLessEqual(
                        replication_key_value,
                        second_bookmark_value_utc,
                        msg=
                        "Second sync bookmark was set incorrectly, a record with a greater replication key value was synced"
                    )

                # verify that we get less data in the 2nd sync
                # collects has all its records with the same replication key value, so it is excluded from this assertion
                if stream not in ('collects',):
                    self.assertLess(
                        second_sync_count,
                        first_sync_count,
                        msg=
                        "Second sync does not have less records, bookmark usage not verified"
                    )

                # verify that we get at least 1 record in the second sync
                if stream not in ('collects',):
                    self.assertGreater(
                        second_sync_count,
                        0,
                        msg="Second sync did not yield any records")
Example #26
0
    def binlog_edge_test(self, expected_records=None):
        """
        Test binlog replication edge cases
        • Verify an initial sync returns expected records of various datatypes
        • Verify we bookmark correctly when a transaction spans multiple files
        • Insert and delete a record prior to sync. Verify both events are replicated
        • Insert and update a record prior to sync. Verify both events are replicated
        • Verify a valid log_file and log_pos state are persisted after each sync
        """

        # avoid a mutable default argument; fall back to an empty list when none is provided
        expected_records = expected_records if expected_records is not None else []

        conn_id = connections.ensure_connection(self)

        # prior to first sync update a record...
        updated_timestamp = datetime.datetime.now()
        updated_id = 1
        expected_records[1]['our_timestamp_2'] = datetime.datetime.strftime(
            updated_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # insert a record and...
        inserted_record = self.generate_record_n(len(expected_records))
        expected_records += [inserted_record]  # TODO need to format

        # delete a record
        deleted_id = 2

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            cur.execute(
                "UPDATE {}.{} SET our_timestamp_2 = '{}' WHERE id = {}".format(
                    self.database_name(), self.table_name_1(),
                    updated_timestamp, updated_id))

            self.insert_record(cur, inserted_record, self.table_name_1())

            delete_time = datetime.datetime.now()
            cur.execute("DELETE FROM {}.{} WHERE id = {}".format(
                self.database_name(), self.table_name_1(), deleted_id))

        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
            "\nEVENTS: {} records updated".format(1) + \
            "\n        {} records deleted\n\n".format(1)
        )

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        t1 = self.table_name_1()
        t2 = self.table_name_2()
        expected_check_streams = {
            self.tap_stream_id(t1),
            self.tap_stream_id(t2)
        }
        expected_sync_streams = {t1, t2}
        expected_pks = {t1: {'id'}, t2: {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = {
            catalog['tap_stream_id'] for catalog in found_catalogs
        }
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        self.assertEqual(self.table_name_1(), found_catalogs[0]['stream_name'])
        self.assertEqual(self.table_name_2(), found_catalogs[1]['stream_name'])
        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
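        # select each discovered catalog with the LOG_BASED replication metadata applied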
        for catalog in found_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            _ = connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        # BUG missing deleted record | https://stitchdata.atlassian.net/browse/SRCE-4258
        # self.assertEqual({self.table_name_1(): len(expected_records)}, record_count_by_stream)
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name_1()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        # verify activate version messages are present
        self.assertEqual('activate_version', message_actions[0])
        self.assertEqual('activate_version', message_actions[-1])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab the table version from the target output to compare against state later
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]
        # we need to compare record by record since there are so many.
        # a failure comparing expected_records to upsert_records would result in
        # an output message greater in length than a standard tmux buffer
        # BUG missing datetime precision | https://stitchdata.atlassian.net/browse/SRCE-4257
        # for expected_record in expected_records:
        #     upsert_record = [rec for rec in upsert_records
        #                      if rec['id'] == expected_record['id']]
        #     self.assertEqual(1, len(upsert_record),
        #                      msg="multiple upsert_recs with same pk: {}".format(upsert_record))
        #     self.assertEqual(expected_record, upsert_record.pop())

        # TODO add check for _sdc_delete_at for deleted record once bug addressed

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version in state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)
        self.assertEqual(record_count_by_stream, {})

        # Create 1 more record prior to 2nd sync
        new_record = self.generate_record_n(len(expected_records))
        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            self.insert_record(cur, new_record, self.table_name_1())
        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
            "\nEVENTS: {} records inserted".format(1)
        )

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased or the log_file
        # has rotated and the numeric suffix has increased
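        # e.g. a rotation from 'mysql-bin.000003' to 'mysql-bin.000004' increases the
        # numeric suffix that is compared below (file names illustrative)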
        if expected_log_file == bookmark['log_file']:
            print("PATH A")
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                  expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]
            print("PATH B")
            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        # Execute delete across tables using join prior to 3rd sync
        deleted_id = 4
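        # this id is assumed to exist in both tables so the joined delete removes one row from each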

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:

            delete_time = datetime.datetime.now()
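            # (captured so _sdc_deleted_at values can be compared to the actual delete time below)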
            # DELETE T1, T2
            # FROM T1
            # INNER JOIN T2 ON T1.key = T2.key
            # WHERE condition;
            db = self.database_name()
            db_t1 = db + "." + t1
            db_t2 = db + "." + t2
            t1_key = db_t1 + ".id"
            t2_key = db_t2 + ".id"
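            # build a multi-table DELETE that removes the matching row from both tables in one statement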
            statement = "DELETE {}, {} ".format(db_t1, db_t2) + \
                "FROM {} ".format(db_t1) + \
                "INNER JOIN {} ON {} = {} ".format(db_t2, t1_key, t2_key) + \
                "WHERE {} = {}".format(t1_key, deleted_id)
            cur.execute(statement)

        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
            "\nTABLE: {}".format(self.table_name_2()) + \
            "\nEVENTS: {} record deleted from each table\n\n".format(1)
        )

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        target_records = runner.get_records_from_target_output()
        records_stream_1 = target_records[self.table_name_1()]
        upsert_records_1 = [
            m['data'] for m in records_stream_1['messages']
            if m['action'] == 'upsert'
        ]
        records_stream_2 = target_records[self.table_name_2()]
        upsert_records_2 = [
            m['data'] for m in records_stream_2['messages']
            if m['action'] == 'upsert'
        ]

        # make sure the record is in the target for both tables with a delete time
        deleted_at_t1 = upsert_records_1[0].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_t1)
        deleted_at_t1_timestamp = utils.strptime_to_utc(
            deleted_at_t1).timestamp()

        deleted_at_t2 = upsert_records_2[0].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_t2)
        deleted_at_t2_timestamp = utils.strptime_to_utc(
            deleted_at_t2).timestamp()

        # the delete times should be equal since it was a single transaction
        self.assertEqual(deleted_at_t1_timestamp, deleted_at_t2_timestamp)
        time_delta = delete_time.timestamp() - deleted_at_t1_timestamp
        print("Delete time vs record: difference in seconds", time_delta)
        self.assertLess(abs(time_delta),
                        3)  # delete and record times within 3 seconds of each other
Example #27
0
    def test_run(self):
        """
        Verify that a bookmark doesn't exist for the stream
        Verify that the second sync includes at least as many records as the first sync
        Verify that all records in the first sync are included in the second sync
        Verify that the sync only sent records to the target for selected streams (catalogs)

        PREREQUISITE
        For EACH stream that is fully replicated there are multiple rows of data with
            different values for the replication key
        """
        conn_id = self.create_connection_with_initial_discovery()

        # Select all streams and no fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        full_streams = {
            key
            for key, value in self.expected_replication_method().items()
            if value == self.FULL
        }
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in full_streams
        ]
        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(
            set(first_sync_record_count.keys()),
            full_streams,
            logging="verify only full table streams were replicated")

        first_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a first sync
        first_sync_records_by_stream = runner.get_records_from_target_output()

        # Run a second sync job using orchestrator
        second_sync_record_count = self.run_sync(conn_id)

        # Get the set of records from a second sync
        second_sync_records_by_stream = runner.get_records_from_target_output()
        for stream in full_streams:
            with self.subTest(stream=stream):

                # verify there is no bookmark values from state
                state_value = first_sync_state.get("bookmarks", {}).get(stream)
                self.assertIsNone(
                    state_value,
                    logging="verify no bookmark value is saved in state")

                # verify that there is more than 1 record of data - setup necessary
                self.assertGreater(
                    first_sync_record_count.get(stream, 0),
                    1,
                    logging="verify multiple records are replicated")

                # verify that you get the same or more data the 2nd time around
                self.assertGreaterEqual(
                    second_sync_record_count.get(stream, 0),
                    first_sync_record_count.get(stream, 0),
                    logging=
                    "verify the second full table sync replicates at least as many records as the first sync"
                )

                # verify all data from 1st sync included in 2nd sync
                first_sync_records = [
                    record["data"] for record in
                    first_sync_records_by_stream[stream]["messages"]
                ]
                second_sync_records = [
                    record["data"] for record in
                    second_sync_records_by_stream[stream]["messages"]
                ]
                LOGGER.info(
                    "verify all records from the first sync are replicated in the second sync"
                )
                for record in first_sync_records:
                    self.assertIn(record, second_sync_records)
Example #28
0
    def binlog_test(self):
        """
        Test binlog replication
        • Verify an initial sync returns expected records of various datatypes
        • Verify no changes and a subsequent sync results in no replicated records
        • Update, Delete, and Insert records then verify the next sync captures these changes
        • Verify some log_file and log_pos state was persisted after each sync
        """
        print("RUNNING {}\n\n".format(self.name()))

        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        expected_check_streams = {self.tap_stream_id()}
        expected_sync_streams = {self.table_name()}
        expected_pks = {self.table_name(): {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual(self.table_name(), test_catalog['stream_name'])

        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
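        # LOG_BASED replication-method metadata puts the stream into binlog replication after the initial full table sync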
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog,
            menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        self.maxDiff = None
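        # (maxDiff=None disables unittest's diff truncation so full schema mismatches are shown)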
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {self.table_name(): 2})
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]
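        # the initial full table sync should bracket the upserts with activate_version messages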

        self.assertEqual(
            message_actions,
            ['activate_version', 'upsert', 'upsert', 'activate_version'])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab version, log_file and log_pos from state to check later
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        self.assertEqual([expected_rec_1, expected_rec_2], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version in state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {})

        # run some inserts, updates, and deletes in source
        updated_rec_1_varchar = 'THIS HAS BEEN UPDATED'

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            cur.execute(
                "UPDATE {}.{} SET our_varchar = '{}' WHERE id = {}".format(
                    self.database_name(), self.table_name(),
                    updated_rec_1_varchar, rec_1['id']))

            delete_time = datetime.datetime.now()
            cur.execute("DELETE FROM {}.{} WHERE id = {}".format(
                self.database_name(), self.table_name(), rec_2['id']))

            self.insert_record(cur, rec_3)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased or the log_file
        # has rotated and the numeric suffix has increased
        if expected_log_file == bookmark['log_file']:
            print("PATH A")
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                 expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]
            print("PATH B")
            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']
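        # update the expected binlog coordinates to the latest bookmark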

        updated_expected_rec_1 = copy.deepcopy(expected_rec_1)
        updated_expected_rec_2 = copy.deepcopy(expected_rec_2)
        updated_expected_rec_3 = copy.deepcopy(expected_rec_3)

        updated_expected_rec_1['our_varchar'] = updated_rec_1_varchar

        # Floats that come back from binlog provide more precision
        # than from SELECT based queries
        updated_expected_rec_1['our_unsigned_float'] = Decimal(
            "1.2345000505447388")
        updated_expected_rec_1['our_signed_float'] = -Decimal(
            "1.2345000505447388")
        #        updated_expected_rec_1['_sdc_deleted_at'] = None
        updated_expected_rec_2['our_unsigned_float'] = Decimal(
            "2.4690001010894775")
        updated_expected_rec_2['our_signed_float'] = -Decimal(
            "2.4690001010894775")
        #        updated_expected_rec_2['_sdc_deleted_at'] = None
        #        updated_expected_rec_3['_sdc_deleted_at'] = None

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # check for expected records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {self.table_name(): 3})

        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]
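        # a binlog-only sync should emit just the upserts, with no new activate_version message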

        self.assertEqual(message_actions, ['upsert', 'upsert', 'upsert'])

        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        deleted_at_rec = upsert_records[1].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_rec)
        deleted_at_rec_timestamp = utils.strptime_to_utc(
            deleted_at_rec).timestamp()
        time_delta = delete_time.timestamp() - deleted_at_rec_timestamp
        print("Delete time vs record: difference in seconds", time_delta)
        self.assertLess(abs(time_delta),
                        3)  # delete and record times within 3 seconds of each other

        # since we don't know exactly what the _sdc_deleted_at value will be
        # we will make the assertions we can make on that field here
        # and then remove it from all records prior to doing a full
        # record-level comparison
        self.assertIn('_sdc_deleted_at', upsert_records[0])
        self.assertIn('_sdc_deleted_at', upsert_records[1])
        self.assertIn('_sdc_deleted_at', upsert_records[2])
        self.assertIsNone(upsert_records[0].get('_sdc_deleted_at'))
        self.assertIsNotNone(upsert_records[1].get('_sdc_deleted_at'))
        self.assertIsNone(upsert_records[2].get('_sdc_deleted_at'))
        del upsert_records[0]['_sdc_deleted_at']
        del upsert_records[1]['_sdc_deleted_at']
        del upsert_records[2]['_sdc_deleted_at']

        self.assertEqual([
            updated_expected_rec_1, updated_expected_rec_2,
            updated_expected_rec_3
        ], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {})
    def test_run(self):
        """
        Verify for each stream that you can do a sync which records bookmarks.
        Verify that the bookmark is the max value sent to the target for the `date` PK field
        Verify that the 2nd sync respects the bookmark
        Verify that all data of the 2nd sync is >= the bookmark from the first sync
        Verify that the number of records in the 2nd sync is less than the first
        Verify inclusivity of bookmarks

        PREREQUISITE
        For EACH stream that is incrementally replicated there are multiple rows of data with
            different values for the replication key
        """
        print("\n\nTESTING IN SQUARE_ENVIRONMENT: {}".format(
            os.getenv('TAP_SQUARE_ENVIRONMENT')))

        print("\n\nRUNNING {}\n\n".format(self.name()))

        # Instantiate static start date
        self.START_DATE = self.STATIC_START_DATE

        # Ensure tested streams have data
        expected_records_first_sync = self.create_test_data(
            self.testable_streams_static(), self.START_DATE)

        # Instantiate connection with default start
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Select all testable streams and no fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        streams_to_select = self.testable_streams_static()
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in streams_to_select
        ]
        self.select_all_streams_and_fields(conn_id, our_catalogs)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(
            streams_to_select,
            set(first_sync_record_count.keys()),
            msg=
            "Expect first_sync_record_count keys {} to equal testable streams {},"
            " first_sync_record_count was {}".format(
                first_sync_record_count.keys(), streams_to_select,
                first_sync_record_count))

        first_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a first sync
        runner.get_records_from_target_output()

        # Set expectations for 2nd sync
        expected_records_second_sync = {x: [] for x in self.expected_streams()}
        # adjust expectations for full table streams to include the expected records from sync 1
        for stream in self.testable_streams_static():
            if stream in self.expected_full_table_streams():
                for record in expected_records_first_sync.get(stream, []):
                    expected_records_second_sync[stream].append(record)

        # Run a second sync job using orchestrator
        second_sync_record_count = self.run_sync(conn_id)

        # Get the set of records from a second sync
        second_sync_records = runner.get_records_from_target_output()

        second_sync_state = menagerie.get_state(conn_id)

        # Loop first_sync_records and compare against second_sync_records
        for stream in self.testable_streams_static():
            with self.subTest(stream=stream):

                second_sync_data = [
                    record.get("data") for record in second_sync_records.get(
                        stream, {}).get("messages", [])
                ]

                # TESTING INCREMENTAL STREAMS
                if stream in self.expected_incremental_streams():

                    # Verify both syncs write / keep the same bookmark
                    self.assertEqual(
                        set(first_sync_state.get('bookmarks', {}).keys()),
                        set(second_sync_state.get('bookmarks', {}).keys()))

                    # Verify second sync's bookmarks move past the first sync's
                    self.assertGreater(
                        second_sync_state.get('bookmarks', {
                            stream: {}
                        }).get(stream, {
                            'updated_at': -1
                        }).get('updated_at'),
                        first_sync_state.get('bookmarks', {
                            stream: {}
                        }).get(stream, {
                            'updated_at': -1
                        }).get('updated_at'))

                    # verify that there is more than 1 record of data - setup necessary
                    self.assertGreater(
                        first_sync_record_count.get(stream, 0),
                        1,
                        msg="Data isn't set up to be able to test full sync")

                    # verify that you get no data on the 2nd sync
                    self.assertGreaterEqual(
                        0,
                        second_sync_record_count.get(stream, 0),
                        msg=
                        "first sync didn't have more records, bookmark usage not verified"
                    )

                elif stream in self.expected_full_table_streams():

                    # TESTING FULL TABLE STREAMS

                    # Verify no bookmarks are present
                    first_state = first_sync_state.get('bookmarks',
                                                       {}).get(stream)
                    self.assertEqual({}, first_state,
                                     msg="Unexpected state for {}\n".format(stream) + \
                                     "\tState: {}\n".format(first_sync_state) + \
                                     "\tBookmark: {}".format(first_state))
                    second_state = second_sync_state.get('bookmarks',
                                                         {}).get(stream)
                    self.assertEqual({}, second_state,
                                     msg="Unexpected state for {}\n".format(stream) + \
                                     "\tState: {}\n".format(second_sync_state) + \
                                     "\tBookmark: {}".format(second_state))

                # TESTING APPLICABLE TO ALL STREAMS

                # Verify that the expected records are replicated in the 2nd sync
                # For incremental streams we should see 0 records
                # For full table streams we should see the same records from the first sync
                expected_records = expected_records_second_sync.get(stream, [])
                self.assertEqual(
                    len(expected_records),
                    len(second_sync_data),
                    msg=
                    "Expected number of records do not match actual for 2nd sync.\n"
                    + "Expected: {}\nActual: {}".format(
                        len(expected_records), len(second_sync_data)))
Example #30
0
    def test_run(self):
        """
        Verify that a full table sync captures all data and sends it in the correct format
        for integer and boolean (bit) data.
        Verify that the first sync sends an activate_version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
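        # each stream's expected count is the number of value rows defined in its expected metadata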
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                # TODO - test schema matches expectations based on data type, nullable, not nullable, datetimes as string +, etc
                #   This needs to be consistent based on replication method so you can change replication methods
                table_version = records_by_stream[stream]['table_version']

                # verify on the first sync you get activate version message before and after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]
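                # expected_messages pairs each expected column name with its value for every seeded row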

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                # TODO - change this to something for mssql once binlog (cdc) is finalized and we know what it is
                self.assertIsNone(
                    bookmark.get('lsn'),
                    msg=
                    "expected bookmark for stream to have NO lsn because we are using full-table replication"
                )

                self.assertEqual(
                    bookmark['version'],
                    table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))