def do_test(self, conn_id):
        # Select our catalogs
        our_catalogs = [c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream,{}).get('messages',[])
            if stream in ['tickets', 'groups', 'users']:
                self.assertGreater(len(messages), 100, msg="Stream {} has fewer than 100 records synced".format(stream))
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk), msg="Missing primary-key for message {}".format(m))
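
These tap-tester examples lean on expectation helpers defined on the test class. A minimal sketch of what such helpers might return for the Zendesk-style streams referenced above ('tickets', 'groups', 'users') is below; the key names are an assumption for illustration, not the tap's actual catalog.

    # Hedged sketch: hypothetical expectation helpers assumed by do_test above.
    def expected_sync_streams(self):
        # streams this test expects the sync to emit
        return {'tickets', 'groups', 'users'}

    def expected_pks(self):
        # assume every stream keys on 'id'; the real tap may differ
        return {stream: {'id'} for stream in self.expected_sync_streams()}
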
    def test_run(self):

        self.setUpTestEnvironment(COMPRESSION_FOLDER_PATH)

        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(
                self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        self.select_specific_catalog(found_catalogs,
                                     "gz_file_having_empty_csv")

        runner.run_sync_job_and_check_status(self)

        expected_records = 0
        # Verify actual rows were synced
        records = runner.get_upserts_from_target_output()

        self.assertEqual(expected_records, len(records))
    def test_run(self):
        # Select our catalogs
        # found_catalogs = menagerie.get_catalogs(conn_id)
        # our_catalogs = [c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        # for c in our_catalogs:
        #     c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        #     c_metadata = metadata.to_map(c_annotated['metadata'])
        #     connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

        conn_id = self.create_connection()

        # Clear state before our run
        menagerie.set_state(conn_id, {})
        # Select a stream
        found_catalogs = menagerie.get_catalogs(conn_id)
        our_catalogs = [catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in self.expected_sync_streams()]
        self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream, {}).get('messages', [])
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk), msg="Missing primary-key for message {}".format(m))

        bookmarks = menagerie.get_state(conn_id)['bookmarks']

        replication_methods = self.expected_replication_method()

        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                replication_method = replication_methods.get(stream)
                if replication_method == self.INCREMENTAL:
                    self.assertIn(stream, bookmarks)

                elif replication_method == self.FULL_TABLE:
                    self.assertNotIn(stream, bookmarks)

                else:
                    raise NotImplementedError(
                        "stream {} has an invalid replication method {}".format(stream, replication_method)
                    )
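
The incremental/full-table assertions above assume a replication-method map on the test class. A hedged sketch of its shape, with placeholder stream names:

    # Hypothetical replication-method map; stream names are placeholders.
    def expected_replication_method(self):
        return {
            'orders': self.INCREMENTAL,   # bookmarked stream, must appear in state
            'products': self.FULL_TABLE,  # re-synced in full, must not be bookmarked
        }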
Example #4
    def first_sync_test(self, table_configs, conn_id):
        # run first full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        expected_pks = {}

        for config in table_configs:
            key = {config['HashKey']}
            if config.get('SortKey'):
                key |= {config.get('SortKey')}
            expected_pks[config['TableName']] = key

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, {x['TableName']
                            for x in table_configs}, expected_pks)

        state = menagerie.get_state(conn_id)
        state_version = menagerie.get_state_version(conn_id)

        first_versions = {}

        # assert that we get the correct number of records for each stream
        for config in table_configs:
            table_name = config['TableName']

            self.assertEqual(config['num_rows'],
                             record_count_by_stream[table_name])

            # assert that an activate_version_message is first and last message sent for each stream
            self.assertEqual(
                'activate_version',
                records_by_stream[table_name]['messages'][0]['action'])
            self.assertEqual(
                'activate_version',
                records_by_stream[table_name]['messages'][-1]['action'])

            # assert that the state has an initial_full_table_complete == True
            self.assertTrue(
                state['bookmarks'][table_name]['initial_full_table_complete'])
            # assert that there is a version bookmark in state
            first_versions[table_name] = state['bookmarks'][table_name][
                'version']
            self.assertIsNotNone(first_versions[table_name])

            # Write state with missing finished_shards so it
            # re-reads data from all shards
            # This should result in the next sync having same number of records
            # as the full table sync
            state['bookmarks'][table_name].pop('finished_shards')
            menagerie.set_state(conn_id, state, version=state_version)
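
For reference, the state this helper edits has roughly the shape sketched below (table name, version, and shard id are made-up values); popping 'finished_shards' forces the next sync to re-read every shard, so it should return the same row counts as the initial full table sync.

# Illustrative tap-dynamodb state shape; all values here are assumptions.
state = {
    'bookmarks': {
        'simple_table_1': {
            'initial_full_table_complete': True,          # asserted above
            'version': 1234567890,                        # stored in first_versions
            'finished_shards': ['shardId-000000000000'],  # removed before the next sync
        }
    }
}
state['bookmarks']['simple_table_1'].pop('finished_shards')
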
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        #select all catalogs
        for catalog in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        future_time = "2050-01-01T00:00:00.000000Z"

        #clear state
        future_bookmarks = {"currently_syncing" : None,
                            "bookmarks":  {"contacts" : {"offset" : {},
                                                         "versionTimestamp" :  future_time},
                                           "subscription_changes" : {"startTimestamp" : future_time,
                                                                     "offset" :  {}},
                                           "campaigns" :  {"offset" : {}},
                                           "forms" : {"updatedAt" :  future_time},
                                           "deals" :  {"offset" :  {},
                                                       "hs_lastmodifieddate" :  future_time},
                                           "workflows" :  {"updatedAt" : future_time},
                                           "owners" :  {"updatedAt" :  future_time},
                                           "contact_lists" :  {"updatedAt" :  future_time,
                                                               "offset" :  {}},
                                           "email_events" :  {"startTimestamp" : future_time,
                                                              "offset" : {}},
                                           "companies" :  {"offset" : {},
                                                           "hs_lastmodifieddate" :  future_time},
                                           "engagements" :  {"lastUpdated" :  future_time,
                                                             "offset" : {}}}}

        menagerie.set_state(conn_id, future_bookmarks)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        # Because the bookmarks were set into the future, we should NOT actually replicate any data,
        # except campaigns and deal_pipelines because those endpoints do NOT support bookmarks.
        streams_with_bookmarks = self.expected_sync_streams()
        streams_with_bookmarks.remove('campaigns')
        streams_with_bookmarks.remove('deal_pipelines')
        bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
        self.assertEqual(len(bad_streams), 0, msg="still pulled down records from {} despite future bookmarks".format(bad_streams))


        state = menagerie.get_state(conn_id)

        # NB: Companies and engagements won't set a bookmark in the future.
        state["bookmarks"].pop("companies")
        state["bookmarks"].pop("engagements")
        future_bookmarks["bookmarks"].pop("companies")
        future_bookmarks["bookmarks"].pop("engagements")

        self.assertEqual(state, future_bookmarks, msg="state should not have been modified because we didn't replicate any data")
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())
Example #6
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        #select all catalogs

        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
            if c['stream_name'] in self.expected_sync_streams().keys():
                stream = c['stream_name']
                pks = self.expected_sync_streams()[stream]

                for pk in pks:
                    mdata = next((m for m in catalog_entry['metadata']
                                  if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None)
                    print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                    self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

                connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams())
        replicated_row_count = sum(first_record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Verify that automatic fields are all emitted with records
        synced_records = runner.get_records_from_target_output()
        for stream_name, data in synced_records.items():
            record_messages = [set(row['data'].keys()) for row in data['messages']]
            self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name))
            for record_keys in record_messages:
                self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
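
Note that in this example expected_sync_streams() doubles as the primary-key map: it is passed both as the set of streams and as the expected pks to examine_target_output_file. A hedged sketch of the shape it would need (stream and field names are placeholders):

    # Hypothetical: each stream maps to its automatic / primary-key fields.
    def expected_sync_streams(self):
        return {
            'orders': {'id'},
            'order_items': {'id', 'order_id'},
        }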
Example #7
    def test_catalog_without_properties(self):

        self.setUpTestEnvironment()

        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(len(found_catalogs), 1,
                         msg="unable to locate schemas for connection {}".format(self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset, msg="Expected check streams are not subset of discovered catalog")

        our_catalogs = [c for c in found_catalogs if c.get(
            'tap_stream_id') in self.expected_streams()]

        # Select our catalogs
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                self.conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        synced_records = runner.get_records_from_target_output()
        upsert_messages = [m for m in synced_records.get(
            'csv_with_empty_lines').get('messages') if m['action'] == 'upsert']

        records = [message.get('data') for message in upsert_messages]

        # Empty lines should be ignored in emitted records.

        expected_records = [
            {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
                '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 2},
            {'id': 2, 'name': 'Bob', '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
                '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 3},
            {'id': 3, '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
                '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 4},
            {'id': 4, 'name': 'Alice', '_sdc_extra': [{'no_headers': ['Ben', '5']}, {
                'name': 'Barak'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 5}
        ]

        self.assertListEqual(expected_records, records)
Example #8
    def test_run(self):

        conn_id = connections.ensure_connection(self, payload_hook=None)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")
        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], [])

        # Verify that all streams sync at least one row for the initial sync.
        # This test also verifies access-token expiration handling: if the test fails with an
        # authentication error, the refresh token was not replaced after expiring.
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        zero_count_streams = {
            k
            for k, v in record_count_by_stream.items() if v == 0
        }
        self.assertFalse(
            zero_count_streams,
            msg="The following streams did not sync any rows {}".format(
                zero_count_streams))
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        all_excluded_fields = {}
        # select all catalogs
        for c in found_catalogs:
            if c['stream_name'] == 'ads':
                continue

            discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema']
            all_excluded_fields[c['stream_name']] = list(set(discovered_schema.keys()) - self.expected_automatic_fields().get(c['stream_name'], set()))[:5]
            connections.select_catalog_and_fields_via_metadata(
                conn_id,
                c,
                discovered_schema,
                non_selected_fields=all_excluded_fields[c['stream_name']])

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        synced_records = runner.get_records_from_target_output()
        self.assertNotIn('ads', synced_records)
        for stream_name, data in synced_records.items():
            record_messages = [set(row['data'].keys()) for row in data['messages']]
            for record_keys in record_messages:
                # The intersection should be empty
                self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
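
The exclusion list above is the discovered schema minus the automatic fields, capped at five per stream, so the final assertion only holds if expected_automatic_fields() really covers every field the tap force-emits. A sketch of that helper under assumed stream and field names:

    # Hedged sketch of the automatic-field map this test relies on.
    def expected_automatic_fields(self):
        return {
            'campaigns': {'id', 'updated_time'},
            'adsets': {'id', 'updated_time'},
            'adcreative': {'id'},
        }
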
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        # select all catalogs
        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))
Example #11
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema)

        # Clear State and run sync
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))
Example #12
    def test_run(self):

        # sync 1
        conn_id = connections.ensure_connection(self)

        found_catalogs_1 = self.run_and_verify_check_mode(conn_id)

        self.perform_and_verify_table_and_field_selection(conn_id,found_catalogs_1)

        record_count_by_stream_1 = self.run_and_verify_sync(conn_id)

        # checking if we got any data from sync 1
        self.assertGreater(sum(record_count_by_stream_1.values()), 0)

        for tap_stream_id in self.expected_first_sync_streams():
            self.assertEqual(self.expected_first_sync_row_counts()[tap_stream_id],
                             record_count_by_stream_1[tap_stream_id])

        # getting state
        state = menagerie.get_state(conn_id)

        # creating file "table_1_fileB"
        with self.get_test_connection() as client:
            root_dir = os.getenv('TAP_SFTP_ROOT_DIR')
            client.chdir(root_dir + '/tap_tester/folderA')

            file_group = self.get_files()[0]
            with client.open('table_1_fileB.csv', 'w') as f:
                writer = csv.writer(f)
                lines = [file_group['headers']] + file_group['generator'](file_group['num_rows'])
                writer.writerows(lines)

        # adding some data to file "table_1_fileA" and "table_3_fileA"
        self.append_to_files()

        # setting state
        menagerie.set_state(conn_id, state)

        # sync 2
        record_count_by_stream_2 = self.run_and_verify_sync(conn_id, second_sync=True)

        # checking if we got any data from sync 2
        self.assertGreater(sum(record_count_by_stream_2.values()), 0)

        # checking that the data from the 2nd sync is as expected:
        # since state was carried over from the 1st sync, we should receive only the new data,
        # i.e. the rows appended to existing files and the newly created file
        for tap_stream_id in self.expected_second_sync_streams():
            self.assertEqual(self.expected_second_sync_row_counts()[tap_stream_id],
                             record_count_by_stream_2[tap_stream_id])
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        #select all catalogs
        #selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
        #menagerie.post_annotated_catalogs(conn_id, selected_catalogs)

        for c in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, c,
                                                               menagerie.get_annotated_schema(conn_id, c['stream_id']))

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # bookmarks for the 4 streams should be 2015-03-16
        states = menagerie.get_state(conn_id)["bookmarks"]
        end_date = self.get_properties()["end_date"].split()[0]
        for k, v in states.items():
            if "insights" in k:
                bm_date = v.get("date_start")
                self.assertEqual(end_date, bm_date)
        print("bookmarks match end_date of {}".format(end_date))
Example #14
    def test_run(self):
        conn_id = self.create_connection()

        # Select our catalogs
        our_catalogs = [
            c for c in self.found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream, {}).get('messages', [])
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk),
                                         msg="Missing primary-key for message {}".format(m))

        bookmarks = menagerie.get_state(conn_id)['bookmarks']

        self.assertTrue('orders' in bookmarks)
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select all Catalogs
        for catalog in found_catalogs:
            if catalog['tap_stream_id'] in self.expected_sync_streams():
                connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        #clear state
        menagerie.set_state(conn_id, {})

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        max_bookmarks_from_records = runner.get_most_recent_records_from_target(self, self.expected_bookmarks(), self.get_properties()['start_date'])

        utc_now = datetime.datetime.utcnow()
        start_of_today = utils.strftime(datetime.datetime(
            utc_now.year, utc_now.month, utc_now.day, 0, 0, 0, 0, datetime.timezone.utc))
        max_bookmarks_from_records['subscription_changes'] = start_of_today
        max_bookmarks_from_records['email_events'] = start_of_today


        #if we didn't replicate data, the bookmark should be the start_date
        for k in self.expected_bookmarks().keys():
            if max_bookmarks_from_records.get(k) is None:
                max_bookmarks_from_records[k] = utils.strftime(datetime.datetime(2017, 5, 1, 0, 0, 0, 0, datetime.timezone.utc))

        state = menagerie.get_state(conn_id)
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())

        #verify bookmarks and offsets
        for k,v in sorted(list(self.expected_bookmarks().items())):
            for w in v:
                bk_value = bookmarks.get(k,{}).get(w)
                self.assertEqual(utils.strptime_with_tz(bk_value),
                                 utils.strptime_with_tz(max_bookmarks_from_records[k]),
                                 "Bookmark {} ({}) for stream {} should have been updated to {}".format(bk_value, w, k, max_bookmarks_from_records[k]))
                print("bookmark {}({}) updated to {} from max record value {}".format(k, w, bk_value, max_bookmarks_from_records[k]))

        for k,v in self.expected_offsets().items():
            self.assertEqual(bookmarks.get(k,{}).get('offset', {}), v, msg="unexpected offset found for stream {} {}. state: {}".format(k, v, state))
            print("offsets {} cleared".format(k))

        diff = bookmark_streams.difference(self.acceptable_bookmarks())
        self.assertEqual(len(diff), 0, msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(diff, self.acceptable_bookmarks(), bookmarks))

        self.assertEqual(state.get('currently_syncing'), None,"Unexpected `currently_syncing` bookmark value: {} Expected: None".format(state.get('currently_syncing')))
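
The bookmark and offset assertions above assume expectation helpers shaped roughly like the sketch below; the streams and bookmark keys mirror the HubSpot state fields used earlier in this file, but treat the exact contents as assumptions.

    # Hedged sketch of the helpers consumed by the bookmark/offset checks.
    def expected_bookmarks(self):
        # stream -> list of bookmark keys whose values should equal the max record value
        return {
            'forms': ['updatedAt'],
            'workflows': ['updatedAt'],
            'email_events': ['startTimestamp'],
        }

    def expected_offsets(self):
        # offsets should be cleared back to an empty dict once a stream finishes
        return {'contacts': {}, 'email_events': {}}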
Example #16
    def test_run(self):
        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.check_all_streams_in_catalogs(found_catalogs)
        self.select_found_catalogs(found_catalogs)

        # clear state and run the actual sync
        menagerie.set_state(self.conn_id, {})
        runner.run_sync_job_and_check_status(self)
        self.check_output_record_counts()

        max_bookmarks_from_records = runner.get_max_bookmarks_from_target(self)
        state = menagerie.get_state(self.conn_id)
        bookmarks = state.get("bookmarks", {})
        self.check_bookmarks(bookmarks, max_bookmarks_from_records)
        self.check_offsets(bookmarks)
        self.look_for_unexpected_bookmarks(bookmarks)
        self.assertIsNone(state.get("currently_syncing"))
Example #17
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        #select certain... catalogs
        expected_streams = self.expected_sync_streams()
        allowed_catalogs = [
            catalog for catalog in found_catalogs
            if not self.is_unsupported_by_bulk_api(catalog['stream_name'])
            and catalog['stream_name'] in expected_streams
        ]

        self.select_all_streams_and_fields(conn_id, allowed_catalogs)

        # Run sync
        menagerie.set_state(conn_id, {})
        _ = self.run_and_verify_sync(conn_id)
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        #select certain... catalogs
        # TODO: This might need to exclude Datacloud objects. So we don't blow up on permissions issues
        expected_streams = self.expected_sync_streams()
        allowed_catalogs = [
            catalog for catalog in found_catalogs
            if not self.is_unsupported_by_bulk_api(catalog['stream_name'])
            and catalog['stream_name'] in expected_streams
        ]

        self.select_all_streams_and_fields(conn_id, allowed_catalogs)

        # Run sync
        menagerie.set_state(conn_id, {})
        _ = self.run_and_verify_sync(conn_id)
Example #19
    def run_and_verify_sync(self, conn_id, state):
        """
        Run a sync job and make sure it exited properly.
        Return a dictionary with keys of streams synced
        and values of records synced for each stream
        """
        # reset state to the state at the start of the sync in case we got interrupted
        menagerie.set_state(conn_id, state)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        try:
            menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        except AssertionError as e:
            if exit_status['discovery_error_message'] or exit_status[
                    'tap_error_message']:
                print(
                    "*******************RETRYING SYNC FOR TAP/DISCOVERY FAILURE*******************"
                )
                raise RetryableTapError(e)

            raise

        # Verify actual rows were synced
        sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        self.assertGreater(
            sum(sync_record_count.values()),
            0,
            msg="failed to replicate any data: {}".format(sync_record_count))
        print("total replicated row count: {}".format(
            sum(sync_record_count.values())))

        return sync_record_count
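
A caller would typically wrap this helper in a bounded retry loop so a transient tap or discovery failure re-runs the sync from the same starting state; a usage sketch (the retry count and method name are arbitrary):

    # Hedged usage sketch: retry the sync on RetryableTapError, re-passing
    # the same starting state on every attempt.
    def sync_with_retries(self, conn_id, state, max_attempts=3):
        for attempt in range(1, max_attempts + 1):
            try:
                return self.run_and_verify_sync(conn_id, state)
            except RetryableTapError:
                if attempt == max_attempts:
                    raise
                print("sync attempt {} failed, retrying".format(attempt))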
Example #20
    def run_and_verify_sync(self, conn_id):
        """
        Clear the connection's state in menagerie and run a sync.
        Verify the exit code following the sync.

        Return the record count by stream.
        """
        # clear state
        menagerie.set_state(conn_id, {})

        # run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # read target output
        first_record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())

        return first_record_count_by_stream
Example #21
    def test_future_date_in_state(self):
        conn_id = connections.ensure_connection(self)

        expected_streams = self.streams_to_select()

        future_date = datetime.datetime.strftime(
            datetime.datetime.today() + datetime.timedelta(days=1),
            "%Y-%m-%dT00:00:00Z")

        state = {'bookmarks': dict()}
        replication_keys = self.expected_replication_keys()
        for stream in expected_streams:
            if self.is_incremental(stream):
                state['bookmarks'][stream] = dict()
                state['bookmarks'][stream]['field'] = next(
                    iter(replication_keys[stream]))
                state['bookmarks'][stream]['last_record'] = future_date

        # set state for running sync mode
        menagerie.set_state(conn_id, state)

        runner.run_check_mode(self, conn_id)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.select_found_catalogs(conn_id,
                                   found_catalogs,
                                   only_streams=expected_streams)

        # run sync mode
        self.run_and_verify_sync(conn_id)

        # get the state after running sync mode
        latest_state = menagerie.get_state(conn_id)

        # verify that the state passed before the sync
        # and the state we got after the sync are the same
        self.assertEqual(latest_state, state)
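
For a single incremental stream the future-dated state written above would look like the sketch below; the stream name and replication key are placeholders.

# Illustrative state for one incremental stream, bookmarked one day ahead.
state = {
    'bookmarks': {
        'invoices': {                               # hypothetical stream
            'field': 'updated_at',                  # its replication key
            'last_record': '2099-01-01T00:00:00Z',  # a date in the future
        }
    }
}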
Example #22
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # select all catalogs
        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            for k in self.expected_primary_keys()[c['stream_name']]:
                mdata = next(
                    (m for m in catalog_entry['metadata']
                     if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                    None)
                print("Validating inclusion on {}: {}".format(
                    c['stream_name'], mdata))
                self.assertTrue(
                    mdata and mdata['metadata']['inclusion'] == 'automatic')
            connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

        # clear state
        menagerie.set_state(conn_id, {})

        # run a sync
        _ = self.run_and_verify_sync(conn_id)

        synced_records = runner.get_records_from_target_output()
        for stream_name, data in synced_records.items():
            record_messages = [
                set(row['data'].keys()) for row in data['messages']
            ]
            for record_keys in record_messages:
                # The symmetric difference should be empty
                self.assertEqual(
                    record_keys,
                    self.expected_automatic_fields().get(stream_name, set()))
Example #23
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify discovery produced (at least) 1 expected catalog
        found_catalogs = [
            found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
            if found_catalog['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreaterEqual(len(found_catalogs), 1)

        # verify the tap discovered the expected streams
        found_catalog_names = {
            catalog['tap_stream_id']
            for catalog in found_catalogs
        }
        self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]
        self.assertEqual(test_table_name, test_catalog['stream_name'])
        print("discovered streams are correct")

        # perform table selection
        print('selecting {} and all fields within the table'.format(
            test_table_name))
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, test_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        _ = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog, schema_and_metadata, additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run sync job 1 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_1 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(5, len(messages))
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])
        self.assertEqual('activate_version', messages[4]['action'])

        # verify the persisted schema matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records match expectations
        self.assertDictEqual(self.expected_records[0], messages[1]['data'])
        self.assertDictEqual(self.expected_records[1], messages[2]['data'])
        self.assertDictEqual(self.expected_records[2], messages[3]['data'])

        print("records are correct")

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_1, bookmark['version'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN and get the same 3 records
        #----------------------------------------------------------------------

        # run sync job 2 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_2 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(4, len(messages))
        self.assertEqual('upsert', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('activate_version', messages[3]['action'])

        # verify the new table version increased on the second sync
        self.assertGreater(table_version_2, table_version_1)

        # verify the persisted schema still matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records still match expectations
        self.assertDictEqual(self.expected_records[0], messages[0]['data'])
        self.assertDictEqual(self.expected_records[1], messages[1]['data'])
        self.assertDictEqual(self.expected_records[2], messages[2]['data'])

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_2, bookmark['version'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN following various manipulations to the data
        #----------------------------------------------------------------------

        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

                # NB | We will perform the following actions prior to the next sync:
                #      [Action (EXPECTED RESULT)]

                #      Insert a record
                #      Insert a record to be updated prior to sync
                #      Insert a record to be deleted prior to sync (NOT REPLICATED)

                #      Update an existing record
                #      Update a newly inserted record

                #      Delete an existing record
                #      Delete a newly inserted record

                # inserting...
                # a new record
                nyc_tz = pytz.timezone('America/New_York')
                our_time_offset = "-04:00"
                our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(6, 6, 6)
                our_time_tz = our_time.isoformat() + our_time_offset
                our_date = datetime.date(1970, 7, 1)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar': "our_varchar 2",
                    'our_varchar_10': "varchar_10",
                    'our_text': "some text 2",
                    'our_integer': 44101,
                    'our_smallint': 2,
                    'our_bigint': 1000001,
                    'our_decimal': decimal.Decimal('9876543210.02'),
                    quote_ident('OUR TS', cur): our_ts,
                    quote_ident('OUR TS TZ', cur): our_ts_tz,
                    quote_ident('OUR TIME', cur): our_time,
                    quote_ident('OUR TIME TZ', cur): our_time_tz,
                    quote_ident('OUR DATE', cur): our_date,
                    'our_double': decimal.Decimal('1.1'),
                    'our_real': decimal.Decimal('1.2'),
                    'our_boolean': True,
                    'our_bit': '1',
                    'our_json': json.dumps({'nymn': 77}),
                    'our_jsonb': json.dumps({'burgers': 'good++'}),
                    'our_uuid': my_uuid,
                    'our_citext': 'cyclops 2',
                    'our_store': 'dances=>"floor",name=>"betty"',
                    'our_cidr': '192.168.101.128/25',
                    'our_inet': '192.168.101.128/24',
                    'our_mac': '08:00:2b:01:02:04',
                    'our_money': '$0.98789'
                })
                self.expected_records.append({
                    'id': 4,
                    'our_varchar': "our_varchar 2",
                    'our_varchar_10': "varchar_10",
                    'our_text': "some text 2",
                    'our_integer': 44101,
                    'our_smallint': 2,
                    'our_bigint': 1000001,
                    'our_decimal': decimal.Decimal('9876543210.02'),
                    'OUR TS': self.expected_ts(our_ts),
                    'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                    'OUR TIME': str(our_time),
                    'OUR TIME TZ': str(our_time_tz),
                    'OUR DATE': '1970-07-01T00:00:00+00:00',
                    'our_double': decimal.Decimal('1.1'),
                    'our_real': decimal.Decimal('1.2'),
                    'our_boolean': True,
                    'our_bit': True,
                    'our_json': '{"nymn": 77}',
                    'our_jsonb': '{"burgers": "good++"}',
                    'our_uuid': self.inserted_records[-1]['our_uuid'],
                    'our_citext': self.inserted_records[-1]['our_citext'],
                    'our_store': {"name": "betty", "dances": "floor"},
                    'our_cidr': self.inserted_records[-1]['our_cidr'],
                    'our_inet': self.inserted_records[-1]['our_inet'],
                    'our_mac': self.inserted_records[-1]['our_mac'],
                    'our_money': '$0.99',
                    'our_alignment_enum': None,
                })
                # a new record which we will then update prior to sync
                our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar': "our_varchar 4",
                    'our_varchar_10': "varchar_3",
                    'our_text': "some text 4",
                    'our_integer': 55200,
                    'our_smallint': 1,
                    'our_bigint': 100000,
                    'our_decimal': decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur): our_ts,
                    quote_ident('OUR TS TZ', cur): our_ts_tz,
                    quote_ident('OUR TIME', cur): our_time,
                    quote_ident('OUR TIME TZ', cur): our_time_tz,
                    quote_ident('OUR DATE', cur): our_date,
                    'our_double': decimal.Decimal('1.1'),
                    'our_real': decimal.Decimal('1.2'),
                    'our_boolean': True,
                    'our_bit': '0',
                    'our_json': json.dumps('some string'),
                    'our_jsonb': json.dumps(['burgers are good']),
                    'our_uuid': my_uuid,
                    'our_store': 'size=>"small",name=>"betty"',
                    'our_citext': 'cyclops 3',
                    'our_cidr': '192.168.101.128/25',
                    'our_inet': '192.168.101.128/24',
                    'our_mac': '08:00:2b:01:02:04',
                    'our_money': None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    5,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                    'our_alignment_enum':
                    None,
                })
                # a new record to be deleted prior to sync
                our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    6,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                    'our_alignment_enum':
                    None,
                })

                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[3])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[4])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[5])

                # updating ...
                # an existing record
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 1
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[0]["our_money"] = "$0.00"

                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # a newly inserted record
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 5
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[4]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[4]["our_money"] = "$0.00"

                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # deleting
                # an existing record
                record_pk = 2
                db_utils.delete_record(cur, canon_table_name, record_pk)

                # a newly inserted record
                record_pk = 6
                db_utils.delete_record(cur, canon_table_name, record_pk)

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN after various manipulations
        #----------------------------------------------------------------------

        # run sync job 3 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_3 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records was replicated
        self.assertEqual(4, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(5, len(messages))
        self.assertEqual('upsert', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])
        self.assertEqual('activate_version', messages[4]['action'])

        # verify the new table version increased on the second sync
        self.assertGreater(table_version_3, table_version_2)

        # verify the persisted schema still matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # NB | This is a little tough to track mentally so here's a breakdown of
        #      the order of operations by expected-record indexes:

        #      Prior to Sync 1
        #        insert 0, 1, 2

        #      Prior to Sync 2
        #        No db changes

        #      Prior to Sync 3
        #        insert 3, 4, 5
        #        update 0, 4
        #        delete 1, 5

        #      Resulting Synced Records: 2, 3, 0, 4
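        # A minimal illustration (not part of the original test) of how the order
        # above follows from the operations: the untouched rows come back first in
        # insertion order, then the updated rows, likely because Postgres rewrites
        # updated rows so they surface later in a full-table scan.
        #   inserted expected-record indexes: [0, 1, 2, 3, 4, 5]
        #   deleted before sync 3:            {1, 5}
        #   updated before sync 3:            [0, 4]
        #   synced order:                     [2, 3] (untouched) + [0, 4] (updated)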

        # verify replicated records still match expectations
        self.assertDictEqual(self.expected_records[2],
                             messages[0]['data'])  # existing insert
        self.assertDictEqual(self.expected_records[3],
                             messages[1]['data'])  # new insert
        self.assertDictEqual(self.expected_records[0],
                             messages[2]['data'])  # existing update
        self.assertDictEqual(self.expected_records[4],
                             messages[3]['data'])  # new insert / update

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_3, bookmark['version'])
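    # A hypothetical sketch (not shown in this excerpt) of the expected_ts and
    # expected_ts_tz helpers referenced above: the expected records compare the
    # replicated timestamps against ISO-8601 strings, with tz-aware values
    # normalized to UTC first. The exact format string is an assumption.
    def expected_ts(self, our_ts):
        # naive timestamp -> ISO-8601 string with microseconds and a UTC offset
        return datetime.datetime.strftime(our_ts, "%Y-%m-%dT%H:%M:%S.%f+00:00")
    def expected_ts_tz(self, our_ts_tz):
        # tz-aware timestamp -> convert to UTC, then format the same way
        our_ts_tz_utc = our_ts_tz.astimezone(pytz.utc)
        return datetime.datetime.strftime(our_ts_tz_utc, "%Y-%m-%dT%H:%M:%S.%f+00:00")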
    def bookmarks_test(self, conn_id, testable_streams):

        # Select all streams and no fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        incremental_streams = {
            key
            for key, value in self.expected_replication_method().items()
            if value == self.INCREMENTAL and key in testable_streams
        }

        # Our test data sets for Shopify do not have any abandoned_checkouts
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in incremental_streams
        ]
        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=False)

        #################################
        # Run first sync
        #################################

        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(set(first_sync_record_count.keys()),
                         incremental_streams)

        first_sync_bookmark = menagerie.get_state(conn_id)
        first_sync_records = runner.get_records_from_target_output()
        # BUG: TDL-17087 : State has additional values which are not streams,
        # so remove those values from the bookmark before making assertions
        extra_stuff = {
            'transaction_orders', 'metafield_products', 'refund_orders',
            'product_variants'
        }
        for keys in list(first_sync_bookmark['bookmarks'].keys()):
            if keys in extra_stuff:
                first_sync_bookmark['bookmarks'].pop(keys)

        #######################
        # Update State between Syncs
        #######################

        new_state = {'bookmarks': dict()}
        #simulated_states = self.calculated_states_by_stream(first_sync_bookmark)

        # We are hardcoding the updated state to ensure that we get at least 1 record in the
        # second sync. These values were chosen after reviewing the max bookmark value for each stream.
        simulated_states = {
            'products': {
                'updated_at': '2021-12-20T05:10:05.000000Z'
            },
            'collects': {
                'updated_at': '2021-09-01T09:08:28.000000Z'
            },
            'abandoned_checkouts': {
                'updated_at': '2022-02-02T16:00:00.000000Z'
            },
            'inventory_levels': {
                'updated_at': '2021-12-20T05:09:34.000000Z'
            },
            'locations': {
                'updated_at': '2021-07-20T09:00:22.000000Z'
            },
            'events': {
                'created_at': '2021-12-20T05:09:01.000000Z'
            },
            'inventory_items': {
                'updated_at': '2021-09-15T19:44:11.000000Z'
            },
            'transactions': {
                'created_at': '2021-12-20T00:08:52-05:00'
            },
            'metafields': {
                'updated_at': '2021-09-07T21:18:05.000000Z'
            },
            'order_refunds': {
                'created_at': '2021-05-01T17:41:18.000000Z'
            },
            'customers': {
                'updated_at': '2021-12-20T05:08:17.000000Z'
            },
            'orders': {
                'updated_at': '2021-12-20T05:09:01.000000Z'
            },
            'custom_collections': {
                'updated_at': '2021-12-20T17:41:18.000000Z'
            }
        }

        for stream, updated_state in simulated_states.items():
            new_state['bookmarks'][stream] = updated_state
        menagerie.set_state(conn_id, new_state)

        ###############################
        # Run Second Sync
        ###############################

        second_sync_record_count = self.run_sync(conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_bookmark = menagerie.get_state(conn_id)

        for stream in testable_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_replication_method = self.expected_replication_method(
                )
                expected_replication_keys = self.expected_replication_keys()
                # information required for assertions from sync 1 and 2 based on expected values
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)
                first_sync_messages = [
                    record.get('data') for record in first_sync_records.get(
                        stream, {}).get('messages', [])
                    if record.get('action') == 'upsert'
                ]
                second_sync_messages = [
                    record.get('data') for record in second_sync_records.get(
                        stream, {}).get('messages', [])
                    if record.get('action') == 'upsert'
                ]
                first_bookmark_value = first_sync_bookmark.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                first_bookmark_value = list(first_bookmark_value.values())[0]
                second_bookmark_value = second_sync_bookmark.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                second_bookmark_value = list(second_bookmark_value.values())[0]

                replication_key = next(iter(expected_replication_keys[stream]))
                first_bookmark_value_utc = self.convert_state_to_utc(
                    first_bookmark_value)
                second_bookmark_value_utc = self.convert_state_to_utc(
                    second_bookmark_value)
                simulated_bookmark = new_state['bookmarks'][stream]
                simulated_bookmark_value = list(simulated_bookmark.values())[0]

                # verify the syncs set a bookmark of the expected form
                self.assertIsNotNone(first_bookmark_value)
                self.assertTrue(
                    self.is_expected_date_format(first_bookmark_value))
                self.assertIsNotNone(second_bookmark_value)
                self.assertTrue(
                    self.is_expected_date_format(second_bookmark_value))

                # verify the 2nd bookmark is equal to 1st sync bookmark
                # NOT A BUG (TDL-17096): this is the expected behaviour for Shopify because the tap
                # uses date windowing, so the 2nd bookmark value is assigned from the execution time
                # rather than the actual bookmark time. The equality assertion below is therefore
                # invalid for Shopify and is left commented out.
                # self.assertEqual(first_bookmark_value, second_bookmark_value)

                for record in first_sync_messages:
                    replication_key_value = record.get(replication_key)
                    # verify 1st sync bookmark value is the max replication key value for a given stream
                    self.assertLessEqual(
                        replication_key_value,
                        first_bookmark_value_utc,
                        msg=
                        "First sync bookmark was set incorrectly, a record with a greater replication key value was synced"
                    )

                for record in second_sync_messages:
                    replication_key_value = record.get(replication_key)
                    # verify the 2nd sync replication key value is greater than or equal to the simulated bookmark
                    self.assertGreaterEqual(
                        replication_key_value,
                        simulated_bookmark_value,
                        msg=
                        "Second sync records do not respect the previous bookmark"
                    )
                    # verify the 2nd sync bookmark value is the max replication key value for a given stream
                    self.assertLessEqual(
                        replication_key_value,
                        second_bookmark_value_utc,
                        msg=
                        "Second sync bookmark was set incorrectly, a record with a greater replication key value was synced"
                    )

                # verify that we get less data in the 2nd sync
                # collects has all records with the same replication key value, so it is excluded from this assertion
                if stream not in ('collects',):
                    self.assertLess(
                        second_sync_count,
                        first_sync_count,
                        msg=
                        "Second sync does not have fewer records, bookmark usage not verified"
                    )

                # verify that we get at least 1 record in the second sync
                if stream not in ('collects',):
                    self.assertGreater(
                        second_sync_count,
                        0,
                        msg="Second sync did not yield any records")
    def bookmarks_test(self, expected_streams):
        """A Parametrized Bookmarks Test"""
        expected_replication_keys = self.expected_replication_keys()
        expected_replication_methods = self.expected_replication_method()
        expected_insights_buffer = -1 * int(
            self.get_properties()['insights_buffer_days'])  # lookback window

        ##########################################################################
        ### First Sync
        ##########################################################################

        conn_id = connections.ensure_connection(self,
                                                original_properties=False)

        # Run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select only the expected streams tables
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]
        self.perform_and_verify_table_and_field_selection(
            conn_id, catalog_entries, select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_and_verify_sync(conn_id)
        first_sync_records = runner.get_records_from_target_output()
        first_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        ### Update State Between Syncs
        ##########################################################################

        new_states = {'bookmarks': dict()}
        simulated_states = self.calculated_states_by_stream(
            first_sync_bookmarks)
        for stream, new_state in simulated_states.items():
            new_states['bookmarks'][stream] = new_state
        menagerie.set_state(conn_id, new_states)

        ##########################################################################
        ### Second Sync
        ##########################################################################

        second_sync_record_count = self.run_and_verify_sync(conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        ### Test By Stream
        ##########################################################################

        for stream in expected_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_replication_method = expected_replication_methods[
                    stream]

                # collect information for assertions from syncs 1 & 2 based on expected values
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)
                first_sync_messages = [
                    record.get('data') for record in first_sync_records.get(
                        stream).get('messages')
                    if record.get('action') == 'upsert'
                ]
                second_sync_messages = [
                    record.get('data') for record in second_sync_records.get(
                        stream).get('messages')
                    if record.get('action') == 'upsert'
                ]
                first_bookmark_key_value = first_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                second_bookmark_key_value = second_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)

                if expected_replication_method == self.INCREMENTAL:

                    # collect information specific to incremental streams from syncs 1 & 2
                    replication_key = next(
                        iter(expected_replication_keys[stream]))
                    first_bookmark_value = first_bookmark_key_value.get(
                        replication_key)
                    second_bookmark_value = second_bookmark_key_value.get(
                        replication_key)
                    first_bookmark_value_utc = self.convert_state_to_utc(
                        first_bookmark_value)
                    second_bookmark_value_utc = self.convert_state_to_utc(
                        second_bookmark_value)
                    simulated_bookmark_value = new_states['bookmarks'][stream][
                        replication_key]
                    simulated_bookmark_minus_lookback = self.timedelta_formatted(
                        simulated_bookmark_value,
                        days=expected_insights_buffer) if self.is_insight(
                            stream) else simulated_bookmark_value

                    # Verify the first sync sets a bookmark of the expected form
                    self.assertIsNotNone(first_bookmark_key_value)
                    self.assertIsNotNone(
                        first_bookmark_key_value.get(replication_key))

                    # Verify the second sync sets a bookmark of the expected form
                    self.assertIsNotNone(second_bookmark_key_value)
                    self.assertIsNotNone(
                        second_bookmark_key_value.get(replication_key))

                    # Verify the second sync bookmark is Equal to the first sync bookmark
                    self.assertEqual(
                        second_bookmark_value, first_bookmark_value
                    )  # assumes no changes to data during test

                    for record in second_sync_messages:

                        # Verify the second sync records respect the previous (simulated) bookmark value
                        replication_key_value = record.get(replication_key)
                        if stream == 'ads_insights_age_and_gender':  # BUG | https://stitchdata.atlassian.net/browse/SRCE-4873
                            replication_key_value = datetime.datetime.strftime(
                                dateutil.parser.parse(replication_key_value),
                                self.BOOKMARK_COMPARISON_FORMAT)
                        self.assertGreaterEqual(
                            replication_key_value,
                            simulated_bookmark_minus_lookback,
                            msg=
                            "Second sync records do not repect the previous bookmark."
                        )

                        # Verify the second sync bookmark value is the max replication key value for a given stream
                        self.assertLessEqual(
                            replication_key_value,
                            second_bookmark_value_utc,
                            msg=
                            "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    for record in first_sync_messages:

                        # Verify the first sync bookmark value is the max replication key value for a given stream
                        replication_key_value = record.get(replication_key)
                        self.assertLessEqual(
                            replication_key_value,
                            first_bookmark_value_utc,
                            msg=
                            "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    # Verify the number of records in the 2nd sync is less than the first
                    self.assertLess(second_sync_count, first_sync_count)

                elif expected_replication_method == self.FULL_TABLE:

                    # Verify the syncs do not set a bookmark for full table streams
                    self.assertIsNone(first_bookmark_key_value)
                    self.assertIsNone(second_bookmark_key_value)

                    # Verify the number of records in the second sync is the same as the first
                    self.assertEqual(second_sync_count, first_sync_count)

                else:

                    raise NotImplementedError(
                        "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}"
                        .format(stream, expected_replication_method))

                # Verify at least 1 record was replicated in the second sync
                self.assertGreater(
                    second_sync_count,
                    0,
                    msg="We are not fully testing bookmarking for {}".format(
                        stream))
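    # A hypothetical sketch (not shown in this excerpt) of the timedelta_formatted
    # helper used above to shift a date string by N days (negative values wind it
    # back, e.g. for the insights lookback window). The "%Y-%m-%dT%H:%M:%SZ" format
    # is an assumption; the real helper likely uses the tap's start-date format.
    def timedelta_formatted(self, dtime, days=0):
        date_stripped = datetime.datetime.strptime(dtime, "%Y-%m-%dT%H:%M:%SZ")
        return_date = date_stripped + datetime.timedelta(days=days)
        return datetime.datetime.strftime(return_date, "%Y-%m-%dT%H:%M:%SZ")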
    def test_run(self):
        """
        Verify that for each stream you can do a sync which records bookmarks.
        That the bookmark is the maximum value sent to the target for the replication key.
        That a second sync respects the bookmark
            All data of the second sync is >= the bookmark from the first sync
            The number of records in the 2nd sync is less than the first (this assumes that
                new data is added to the stream slowly enough that the amount of data has not
                doubled between the start date, the first sync, and the second sync run in
                this test)

        Verify that for full table streams, all data replicated in sync 1 is replicated again in sync 2.

        PREREQUISITE
        For EACH stream that is incrementally replicated there are multiple rows of data with
            different values for the replication key
        """

        expected_streams = self.expected_check_streams()

        expected_replication_keys = self.expected_replication_keys()
        expected_replication_methods = self.expected_replication_method()

        ##########################################################################
        # First Sync
        ##########################################################################
        conn_id = connections.ensure_connection(self)

        # Run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        catalog_entries = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id, catalog_entries)

        # Run a first sync job using orchestrator
        first_sync_record_count = self.run_and_verify_sync(conn_id)
        first_sync_records = runner.get_records_from_target_output()
        first_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        # Update State Between Syncs
        ##########################################################################

        new_states = {'bookmarks': dict()}
        simulated_states = self.calculated_states_by_stream(
            first_sync_bookmarks)
        for stream, new_state in simulated_states.items():
            new_states['bookmarks'][stream] = new_state
        menagerie.set_state(conn_id, new_states)

        ##########################################################################
        # Second Sync
        ##########################################################################

        second_sync_record_count = self.run_and_verify_sync(conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        # Test By Stream
        ##########################################################################

        for stream in expected_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_replication_method = expected_replication_methods[
                    stream]

                # collect information for assertions from syncs 1 & 2 based on expected values
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)
                first_sync_messages = [
                    record.get('data') for record in first_sync_records.get(
                        stream, {}).get('messages', [])
                    if record.get('action') == 'upsert'
                ]
                second_sync_messages = [
                    record.get('data') for record in second_sync_records.get(
                        stream, {}).get('messages', [])
                    if record.get('action') == 'upsert'
                ]
                first_bookmark_key_value = first_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                second_bookmark_key_value = second_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)

                if expected_replication_method == self.INCREMENTAL:

                    # collect information specific to incremental streams from syncs 1 & 2
                    replication_key = next(
                        iter(expected_replication_keys[stream]))
                    first_bookmark_value = first_bookmark_key_value.get(
                        replication_key)
                    second_bookmark_value = second_bookmark_key_value.get(
                        replication_key)
                    first_bookmark_value_utc = self.convert_state_to_utc(
                        first_bookmark_value)
                    second_bookmark_value_utc = self.convert_state_to_utc(
                        second_bookmark_value)

                    simulated_bookmark_value = self.convert_state_to_utc(
                        new_states['bookmarks'][stream][replication_key])

                    # Verify the first sync sets a bookmark of the expected form
                    self.assertIsNotNone(first_bookmark_key_value)
                    self.assertIsNotNone(first_bookmark_value)

                    # Verify the second sync sets a bookmark of the expected form
                    self.assertIsNotNone(second_bookmark_key_value)
                    self.assertIsNotNone(second_bookmark_value)

                    # Verify the second sync bookmark is Equal to the first sync bookmark
                    # assumes no changes to data during test
                    if stream != "users":
                        self.assertEqual(second_bookmark_value,
                                         first_bookmark_value)
                    else:
                        # The `users` stream stores its bookmark as 1 minute less than the current
                        # time whenever the `updated_at` of the last record is older than that. So,
                        # if there is no data change, second_bookmark_value will be 1 minute less
                        # than the current time and therefore always greater than or equal to
                        # first_bookmark_value.
                        self.assertGreaterEqual(second_bookmark_value,
                                                first_bookmark_value)
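                    # Illustrative only (an assumption about the tap's behaviour, not part of
                    # this test): the stored users bookmark is effectively
                    #   bookmark = max(last_record_updated_at, utcnow() - timedelta(minutes=1))
                    # which is why a >= comparison is used above instead of equality.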

                    for record in first_sync_messages:

                        # Verify the first sync bookmark value is the max replication key value for a given stream
                        replication_key_value = record.get(replication_key)
                        # The `tickets` stream stores its bookmark as an integer timestamp, so convert it to a string for comparison.
                        if stream == "tickets":
                            replication_key_value = datetime.utcfromtimestamp(
                                replication_key_value).strftime(
                                    '%Y-%m-%dT%H:%M:%SZ')

                        self.assertLessEqual(
                            replication_key_value,
                            first_bookmark_value_utc,
                            msg=
                            "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    for record in second_sync_messages:
                        # Verify the second sync replication key value is Greater or Equal to the first sync bookmark
                        replication_key_value = record.get(replication_key)

                        if stream == "tickets":
                            replication_key_value = datetime.utcfromtimestamp(
                                replication_key_value).strftime(
                                    '%Y-%m-%dT%H:%M:%SZ')

                        self.assertGreaterEqual(
                            replication_key_value,
                            simulated_bookmark_value,
                            msg=
                            "Second sync records do not repect the previous bookmark."
                        )

                        # Verify the second sync bookmark value is the max replication key value for a given stream
                        self.assertLessEqual(
                            replication_key_value,
                            second_bookmark_value_utc,
                            msg=
                            "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                elif expected_replication_method == self.FULL_TABLE:

                    # Verify the syncs do not set a bookmark for full table streams
                    self.assertIsNone(first_bookmark_key_value)
                    self.assertIsNone(second_bookmark_key_value)

                    # Verify the number of records in the second sync is the same as the first

                    # The streams below are children of the incremental parent stream `tickets`.
                    # Child streams also behave like incremental streams but do not save their own
                    # state, so they do not have the same record count in the first and second syncs.
                    if stream not in [
                            "ticket_comments", "ticket_audits",
                            "ticket_metrics"
                    ]:
                        self.assertEqual(second_sync_count, first_sync_count)

                else:

                    raise NotImplementedError(
                        "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}"
                        .format(stream, expected_replication_method))

                # Verify at least 1 record was replicated in the second sync
                self.assertGreater(
                    second_sync_count,
                    0,
                    msg="We are not fully testing bookmarking for {}".format(
                        stream))
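    # A hypothetical sketch (not shown in this excerpt) of calculated_states_by_stream
    # as used above: it takes the bookmarks from the first sync and winds each one back
    # (here by one day, an arbitrary choice) so the second sync has records to pick up.
    # Assumes single-key bookmarks with ISO-8601 values; real implementations vary by tap.
    def calculated_states_by_stream(self, current_state):
        stream_to_calculated_state = {}
        for stream, state in current_state['bookmarks'].items():
            state_key, state_value = next(iter(state.items()))
            state_as_datetime = dateutil.parser.parse(state_value)
            calculated_state = state_as_datetime - datetime.timedelta(days=1)
            stream_to_calculated_state[stream] = {
                state_key:
                datetime.datetime.strftime(calculated_state, "%Y-%m-%dT%H:%M:%SZ")
            }
        return stream_to_calculated_state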
    def test_run(self):
        expected_streams = self.expected_streams()

        expected_replication_keys = self.expected_replication_keys()
        expected_replication_methods = self.expected_replication_method()

        ##########################################################################
        ### First Sync
        ##########################################################################
        self.start_date_1 = self.get_properties().get("start_date")
        self.start_date_2 = self.timedelta_formatted(self.start_date_1, days=3)

        self.start_date = self.start_date_1
        conn_id = connections.ensure_connection(self,
                                                original_properties=False)

        # Run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select only the expected streams tables
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]
        self.perform_and_verify_table_and_field_selection(
            conn_id, catalog_entries, select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_and_verify_sync(conn_id)
        first_sync_records = runner.get_records_from_target_output()
        first_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        ### Update State Between Syncs
        ##########################################################################

        new_states = {'bookmarks': dict()}
        simulated_states = self.calculated_states_by_stream(
            first_sync_bookmarks)
        for stream, new_state in simulated_states.items():
            new_states['bookmarks'][stream] = new_state
        menagerie.set_state(conn_id, new_states)

        for stream in simulated_states.keys():
            for state_key, state_value in simulated_states[stream].items():
                if stream not in new_states['bookmarks']:
                    new_states['bookmarks'][stream] = {}
                if state_key not in new_states['bookmarks'][stream]:
                    new_states['bookmarks'][stream][state_key] = state_value

        ##########################################################################
        ### Second Sync
        ##########################################################################
        self.start_date = self.start_date_2

        # run check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        test_catalogs_2_all_fields = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in expected_streams
        ]
        self.perform_and_verify_table_and_field_selection(
            conn_id, test_catalogs_2_all_fields, select_all_fields=True)

        second_sync_record_count = self.run_and_verify_sync(conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_bookmarks = menagerie.get_state(conn_id)

        ##########################################################################
        ### Test By Stream
        ##########################################################################

        for stream in expected_streams:
            with self.subTest(stream=stream):
                expected_replication_method = expected_replication_methods[
                    stream]
                first_bookmark_key_value = first_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)
                second_bookmark_key_value = second_sync_bookmarks.get(
                    'bookmarks', {
                        stream: None
                    }).get(stream)

                # expected values
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)

                # collect information for assertions from syncs 1 & 2 based on expected values
                first_sync_messages = [
                    record.get('data') for record in first_sync_records.get(
                        stream).get('messages')
                    if record.get('action') == 'upsert'
                ]
                second_sync_messages = [
                    record.get('data') for record in second_sync_records.get(
                        stream).get('messages')
                    if record.get('action') == 'upsert'
                ]

                if expected_replication_method == self.INCREMENTAL:

                    replication_key = next(
                        iter(expected_replication_keys[stream]))

                    if stream != 'forms':
                        for form_key in self.get_forms():
                            first_bookmark_value = first_bookmark_key_value.get(
                                form_key, {}).get(replication_key)
                            second_bookmark_value = second_bookmark_key_value.get(
                                form_key, {}).get(replication_key)
                            first_bookmark_value_utc = self.convert_state_to_utc(
                                first_bookmark_value)
                            second_bookmark_value_utc = self.convert_state_to_utc(
                                second_bookmark_value)
                            simulated_bookmark_value = new_states['bookmarks'][
                                stream][form_key]
                            simulated_bookmark_minus_lookback = simulated_bookmark_value

                            # Verify the first sync sets a bookmark of the expected form
                            self.assertIsNotNone(first_bookmark_key_value)

                            # Verify the second sync sets a bookmark of the expected form
                            self.assertIsNotNone(second_bookmark_key_value)

                            # Verify the second sync bookmark is Greater or Equal to the first sync bookmark
                            self.assertGreaterEqual(
                                second_bookmark_value, first_bookmark_value
                            )  # new responses could be picked up for the form in the second sync

                            for record in second_sync_messages:

                                # Verify the second sync records respect the previous (simulated) bookmark value
                                replication_key_value = record.get(
                                    replication_key)
                                self.assertGreaterEqual(
                                    replication_key_value,
                                    simulated_bookmark_minus_lookback,
                                    msg=
                                    "Second sync records do not repect the previous bookmark."
                                )

                                # Verify the second sync bookmark value is the max replication key value for a given stream
                                self.assertLessEqual(
                                    replication_key_value,
                                    second_bookmark_value_utc,
                                    msg=
                                    "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                                )

                            for record in first_sync_messages:
                                # Verify the first sync bookmark value is the max replication key value for a given stream
                                replication_key_value = record.get(
                                    replication_key)
                                self.assertLessEqual(
                                    replication_key_value,
                                    first_bookmark_value_utc,
                                    msg=
                                    "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                                )

                            # Verify the number of records in the 2nd sync is less than the first
                            self.assertLess(second_sync_count,
                                            first_sync_count)

                    else:
                        # collect information specific to incremental streams from syncs 1 & 2
                        first_bookmark_value = first_bookmark_key_value.get(
                            replication_key)
                        second_bookmark_value = second_bookmark_key_value.get(
                            replication_key)
                        first_bookmark_value_utc = self.convert_state_to_utc(
                            first_bookmark_value)
                        second_bookmark_value_utc = self.convert_state_to_utc(
                            second_bookmark_value)
                        simulated_bookmark_value = new_states['bookmarks'][
                            stream][replication_key]
                        simulated_bookmark_minus_lookback = simulated_bookmark_value

                    # Verify the first sync sets a bookmark of the expected form
                    self.assertIsNotNone(first_bookmark_key_value)

                    # Verify the second sync sets a bookmark of the expected form
                    self.assertIsNotNone(second_bookmark_key_value)

                    # Verify the second sync bookmark is Greater or Equal to the first sync bookmark
                    self.assertGreaterEqual(
                        second_bookmark_value, first_bookmark_value
                    )  # new responses could be picked up for the form in the second sync

                    for record in second_sync_messages:

                        # Verify the second sync records respect the previous (simulated) bookmark value
                        replication_key_value = record.get(replication_key)
                        self.assertGreaterEqual(
                            replication_key_value,
                            simulated_bookmark_minus_lookback,
                            msg=
                            "Second sync records do not repect the previous bookmark."
                        )

                        # Verify the second sync bookmark value is the max replication key value for a given stream
                        self.assertLessEqual(
                            replication_key_value,
                            second_bookmark_value_utc,
                            msg=
                            "Second sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    for record in first_sync_messages:

                        # Verify the first sync bookmark value is the max replication key value for a given stream
                        replication_key_value = record.get(replication_key)
                        self.assertLessEqual(
                            replication_key_value,
                            first_bookmark_value_utc,
                            msg=
                            "First sync bookmark was set incorrectly, a record with a greater replication-key value was synced."
                        )

                    # Verify the number of records in the 2nd sync is less than the first
                    self.assertLess(second_sync_count, first_sync_count)

                elif expected_replication_method == self.FULL_TABLE:

                    # Verify the syncs do not set a bookmark for full table streams
                    self.assertIsNone(first_bookmark_key_value)
                    self.assertIsNone(second_bookmark_key_value)

                    # Verify the number of records in the second sync is the same as the first
                    self.assertEqual(second_sync_count, first_sync_count)

                else:

                    raise NotImplementedError(
                        "INVALID EXPECTATIONS\t\tSTREAM: {} REPLICATION_METHOD: {}"
                        .format(stream, expected_replication_method))

                # Verify at least 1 record was replicated in the second sync
                self.assertGreater(
                    second_sync_count,
                    0,
                    msg="We are not fully testing bookmarking for {}".format(
                        stream))
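    # A hypothetical sketch (not shown in this excerpt) of the get_forms helper used
    # above: typeform bookmarks are keyed per form id, so the helper just returns the
    # form ids the connection was configured with. Reading them from a comma-separated
    # 'forms' property is an assumption about the config shape.
    def get_forms(self):
        return self.get_properties()['forms'].split(',')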
    def binlog_json_test(self):
        print("RUNNING {}\n\n".format(self.name()))

        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        expected_check_streams = {self.tap_stream_id()}
        expected_sync_streams = {self.table_name()}
        expected_pks = {self.table_name(): {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual(self.table_name(), test_catalog['stream_name'])

        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog,
            menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

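        # the initial full-table sync should emit exactly one upsert for the
        # single seeded row, wrapped in activate_version messages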
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {self.table_name(): 1})
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions,
                         ['activate_version', 'upsert', 'activate_version'])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab the table version from the synced records to compare against the bookmark
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        self.assertEqual([expected_rec_1], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version in state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {})

        # insert a new huge row
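        # build ~2560 key/value pairs plus a 'literal' key so the resulting
        # JSON document is large enough to exercise binlog replication of big rows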
        data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)],
                    literal=True)
        rec = {'id': 2, 'our_json': json.dumps(data)}

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            self.insert_record(cur, rec)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased or the log_file
        # has rotated and the numeric suffix has increased
        if expected_log_file == bookmark['log_file']:
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                 expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]

            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        expected_rec_2 = copy.deepcopy(rec)

        # check for expected records
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions, ['upsert'])

        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]
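        # log-based replication adds an _sdc_deleted_at metadata column;
        # drop it before comparing the JSON payloads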
        del upsert_records[0]['_sdc_deleted_at']

        expected_json = json.loads(expected_rec_2.get('our_json', '{}'))
        actual_json = json.loads(upsert_records[0].get('our_json', '{}'))

        self.assertTrue(len(actual_json.keys()) > 0)
        self.assertEqual(expected_json, actual_json)
Example #29
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]

        self.assertEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        chicken_catalog = found_catalogs[0]

        self.assertEqual('chicken_view', chicken_catalog['stream_name'])
        print("discovered streams are correct")

        print('checking discovered metadata for ROOT-CHICKEN_VIEW')
        md = menagerie.get_annotated_schema(
            conn_id, chicken_catalog['stream_id'])['metadata']

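        # discovery should mark the stream as a view, expose every column as
        # available and selected-by-default, and report no table-key-properties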
        self.assertEqual(
            {
                (): {
                    'database-name': 'postgres',
                    'is-view': True,
                    'row-count': 0,
                    'schema-name': 'public',
                    'table-key-properties': []
                },
                ('properties', 'fk_id'): {
                    'inclusion': 'available',
                    'sql-datatype': 'bigint',
                    'selected-by-default': True
                },
                ('properties', 'name'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'age'): {
                    'inclusion': 'available',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                },
                ('properties', 'size'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'id'): {
                    'inclusion': 'available',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                }
            }, metadata.to_map(md))

        # 'id' selected as view-key-properties
        replication_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-key': None,
                "replication-method": "FULL_TABLE",
                'view-key-properties': ["id"]
            }
        }]

        connections.select_catalog_and_fields_via_metadata(
            conn_id, chicken_catalog,
            menagerie.get_annotated_schema(conn_id,
                                           chicken_catalog['stream_id']),
            replication_md)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

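        # the view holds a single row, so a FULL_TABLE sync should produce one
        # upsert bounded by activate_version messages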
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(record_count_by_stream, {'chicken_view': 1})
        records_by_stream = runner.get_records_from_target_output()

        table_version = records_by_stream['chicken_view']['table_version']
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][0]['action'],
            'activate_version')
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][1]['action'],
            'upsert')
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][2]['action'],
            'activate_version')

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        actual_chicken_record = records_by_stream['chicken_view']['messages'][
            1]['data']

        expected_chicken_record = {
            'id': 1,
            'fk_id': 1,
            'name': 'fred',
            'age': 99,
            'size': 'big'
        }
        self.assertEqual(
            actual_chicken_record,
            expected_chicken_record,
            msg=
            "Expected `chicken_view` upsert record data to be {}, but target output was {}"
            .format(expected_chicken_record, actual_chicken_record))

        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)

        chicken_bookmark = state['bookmarks']['postgres-public-chicken_view']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        self.assertEqual(
            chicken_bookmark['version'],
            table_version,
            msg="expected bookmark for stream ROOT-CHICKEN to match version")
Example #30
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        for c in found_catalogs:
            catalog_props_to_check = ['stream_name', 'tap_stream_id']
            stream = c['stream_name']

            for prop in catalog_props_to_check:
                self.assertEqual(
                    c[prop],
                    expected_catalogs[stream][prop],
                    msg=
                    "unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`"
                    .format(prop, stream, expected_catalogs[stream][prop],
                            c[prop]))

        print("discovered streams are correct")

        print('checking discovered metadata for tap_tester_mysql_0-incremental')
        incremental_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental'
        ][0]
        md = menagerie.get_annotated_schema(
            conn_id, incremental_catalog['stream_id'])['metadata']

        incremental_stream_metadata = {
            'database-name': 'tap_tester_mysql_0',
            'row-count': 3,
            'is-view': False,
            'selected-by-default': False,
            'table-key-properties': ['c_pk']
        }

        self.assertEqual(
            sorted(md, key=lambda x: x['breadcrumb']),
            [{
                'breadcrumb': [],
                'metadata': incremental_stream_metadata
            }, {
                'breadcrumb': ['properties', 'c_dt'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'datetime'
                }
            }, {
                'breadcrumb': ['properties', 'c_pk'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'int(11)'
                }
            }, {
                'breadcrumb': ['properties', 'c_varchar'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'varchar(255)'
                }
            }, {
                'breadcrumb': ['properties', 'c_varchar_to_deselect'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'varchar(255)'
                }
            }])

        print('checking discovered metadata for tap_tester_mysql_1-view')
        view_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_1-view'
        ][0]
        view_catalog_key_properties_md = [{
            'breadcrumb': [],
            'metadata': {
                'view-key-properties': ['c_pk']
            }
        }]

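        # views expose no primary key during discovery, so set
        # view-key-properties explicitly as non-discoverable metadata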
        connections.set_non_discoverable_metadata(
            conn_id, view_catalog,
            menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']),
            view_catalog_key_properties_md)
        md = menagerie.get_annotated_schema(
            conn_id, view_catalog['stream_id'])['metadata']

        view_stream_metadata = {
            'database-name': 'tap_tester_mysql_1',
            'is-view': True,
            'selected-by-default': False,
            'view-key-properties': ['c_pk']
        }

        self.assertEqual(sorted(md, key=lambda x: x['breadcrumb']),
                         [{
                             'breadcrumb': [],
                             'metadata': view_stream_metadata
                         }, {
                             'breadcrumb': ['properties', 'c_pk'],
                             'metadata': {
                                 'selected-by-default': True,
                                 'sql-datatype': 'int(11)'
                             }
                         }, {
                             'breadcrumb': ['properties', 'c_varchar'],
                             'metadata': {
                                 'selected-by-default': True,
                                 'sql-datatype': 'varchar(255)'
                             }
                         }])

        # No selected-by-default metadata for c_year because it is an unsupported type
        various_types_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types'
        ][0]
        md = menagerie.get_annotated_schema(
            conn_id, various_types_catalog['stream_id'])['metadata']
        c_year_md = [
            x for x in md if x['breadcrumb'] == ['properties', 'c_year']
        ]
        self.assertEqual(c_year_md, [{
            'breadcrumb': ['properties', 'c_year'],
            'metadata': {
                'selected-by-default': False,
                'sql-datatype': 'year(4)'
            }
        }])

        # select every catalog except simple_example (it stays unselected)
        catalogs_to_select = [
            c for c in found_catalogs
            if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example'
        ]

        for a_catalog in catalogs_to_select:
            additional_md = []
            unselected_fields = []
            if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental':
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-key': 'c_dt',
                        'replication-method': 'INCREMENTAL'
                    }
                }]
                unselected_fields = ['c_varchar_to_deselect']

            elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view':
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'view-key-properties': ['c_pk'],
                        'replication-method': 'FULL_TABLE'
                    }
                }]
            else:
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-method': 'FULL_TABLE'
                    }
                }]

            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, a_catalog,
                menagerie.get_annotated_schema(conn_id,
                                               a_catalog['stream_id']),
                additional_md, unselected_fields)
        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        expected_row_count = 8  # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1}
        self.assertEqual(
            replicated_row_count,
            expected_row_count,
            msg="failed to replicate correct number of rows: {}".format(
                record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        records_by_stream = runner.get_records_from_target_output()

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify that activate version messages were sent in the proper position
            self.assertEqual(
                recs['messages'][0]['action'],
                'activate_version',
                msg=
                "Expected first message sent for stream `{}` to have action `activate_version`"
                .format(stream))

            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # verify that the target output contains the proper numeric and date representations
        expected_various_types_records = [{
            'c_time':
            '1970-01-01T12:34:56.000000Z',
            'c_mediumint':
            8388607,
            'c_smallint':
            32767,
            'c_tinyint':
            127,
            'c_date':
            '2017-09-13T00:00:00.000000Z',
            'c_bigint':
            9223372036854775807,
            'c_decimal':
            -1,
            'c_int':
            2147483647,
            'c_bit':
            True,
            'c_decimal_2':
            Decimal('123456789.0'),
            'c_pk':
            1,
            'c_double':
            Decimal("1.234"),
            'c_float':
            Decimal("1.234"),
            'c_decimal_2_unsigned':
            Decimal("1.23"),
            'c_tinyint_1':
            True
        }, {
            'c_time':
            '1970-01-01T12:34:57.000000Z',
            'c_mediumint':
            -8388608,
            'c_smallint':
            -32768,
            'c_tinyint':
            -128,
            'c_date':
            '2017-09-14T00:00:00.000000Z',
            'c_bigint':
            -9223372036854775808,
            'c_decimal':
            0,
            'c_int':
            -2147483648,
            'c_bit':
            False,
            'c_decimal_2':
            Decimal("123456790.0"),
            'c_pk':
            2,
            'c_double':
            Decimal("2.234"),
            'c_float':
            Decimal("2.234"),
            'c_decimal_2_unsigned':
            Decimal("0.23"),
            'c_tinyint_1':
            False
        }, {
            'c_time':
            '1970-01-01T12:34:57.000000Z',
            'c_mediumint':
            -8388608,
            'c_smallint':
            -32768,
            'c_tinyint':
            -128,
            'c_date':
            '2017-09-14T00:00:00.000000Z',
            'c_bigint':
            -9223372036854775808,
            'c_decimal':
            0,
            'c_int':
            -2147483648,
            'c_bit':
            None,
            'c_decimal_2':
            Decimal("123456790.0"),
            'c_pk':
            3,
            'c_double':
            Decimal("2.234"),
            'c_float':
            Decimal("2.234"),
            'c_decimal_2_unsigned':
            Decimal("0.23"),
            'c_tinyint_1':
            None
        }]

        actual_various_types_records = [
            r['data']
            for r in records_by_stream['various_types']['messages'][1:4]
        ]

        self.assertEqual(
            actual_various_types_records,
            expected_various_types_records,
            msg=
            "Expected `various_types` upsert record data to be {}, but target output was {}"
            .format(expected_various_types_records,
                    actual_various_types_records))

        # verify that deselected property was not output
        expected_incremental_record = {
            'c_pk': 1,
            'c_dt': '2017-01-01T00:00:00.000000Z',
            'c_varchar': 'a'
        }

        actual_incremental_record = records_by_stream['incremental'][
            'messages'][1]['data']

        self.assertEqual(
            actual_incremental_record,
            expected_incremental_record,
            msg=
            "Expected first `incremental` upsert record data to be {}, but target output was {}"
            .format(expected_incremental_record, actual_incremental_record))

        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)
        bookmarks = state['bookmarks']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        for k, v in bookmarks.items():
            if k == 'tap_tester_mysql_0-incremental':
                self.assertIsNotNone(
                    v['version'],
                    msg="expected bookmark for stream `{}` to have a version set"
                    .format(k))
                self.assertEqual(
                    v['replication_key_value'],
                    '2017-01-01T00:00:02.000000Z',
                    msg=
                    "incorrect replication_key_value in bookmark for stream `{}`"
                    .format(k))
                self.assertEqual(
                    v['replication_key'],
                    'c_dt',
                    msg=
                    "incorrect replication_key specified in bookmark for stream `{}`"
                    .format(k))
            else:
                self.assertFalse(
                    'version' in v,
                    msg=
                    "expected bookmark for stream `{}` to not have a version key"
                    .format(k))
                self.assertTrue(
                    'initial_full_table_complete' in v,
                    msg=
                    "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                    .format(k))
        print("state and bookmarks are correct")

        incremental_table_initial_table_version = bookmarks[
            'tap_tester_mysql_0-incremental']['version']

        #----------------------------------------------------------------------
        # invoke the sync job again after some modifications
        #----------------------------------------------------------------------

        print("adding a column to an existing table in the source db")
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

        with connection.cursor() as cursor:
            add_column_sql = '''
                ALTER TABLE tap_tester_mysql_0.incremental
                  ADD COLUMN favorite_number INTEGER;
                INSERT INTO tap_tester_mysql_0.incremental VALUES (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999);
            '''
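            # NOTE: this assumes the test connection permits multi-statement
            # execution, since the ALTER and INSERT are sent in one call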
            cursor.execute(add_column_sql)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        expected_row_count = 7  # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1}
        self.assertEqual(
            replicated_row_count,
            expected_row_count,
            msg="failed to replicate correct number of rows: {}".format(
                record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        records_by_stream = runner.get_records_from_target_output()

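        # an INTEGER column maps to a nullable JSON schema integer bounded by
        # the signed 32-bit range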
        expected_schema_of_new_column = {
            'maximum': 2147483647,
            'selected': True,
            'inclusion': 'available',
            'type': ['null', 'integer'],
            'minimum': -2147483648
        }

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify that activate_version messages were sent in the proper positions
            if stream == 'incremental':
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version',
                    msg=
                    "Expected first message sent for stream `{}` to have action `activate_version`"
                    .format(stream))
                expected_schema_of_new_column = {
                    'maximum': 2147483647,
                    'inclusion': 'available',
                    'type': ['null', 'integer'],
                    'minimum': -2147483648
                }
                self.assertEqual(
                    records_by_stream[stream]['schema']['properties']
                    ['favorite_number'],
                    expected_schema_of_new_column,
                    msg=
                    "Expected newly-added column to be present in schema for stream `{}`, but it was not."
                    .format(stream))
            else:
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'upsert',
                    msg=
                    "Expected first message sent for stream `{}` to have action `upsert`"
                    .format(stream))
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version',
                    msg=
                    "Expected last message sent for stream `{}` to have action `activate_version`"
                    .format(stream))

        state = menagerie.get_state(conn_id)
        bookmarks = state['bookmarks']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        for k, v in bookmarks.items():
            if k == 'tap_tester_mysql_0-incremental':
                self.assertIsNotNone(
                    v['version'],
                    msg="expected bookmark for stream `{}` to have a version set"
                    .format(k))
                self.assertEqual(
                    v['replication_key_value'],
                    '2017-01-01T00:00:03.000000Z',
                    msg=
                    "incorrect replication_key_value in bookmark for stream `{}`"
                    .format(k))
                self.assertEqual(
                    v['replication_key'],
                    'c_dt',
                    msg=
                    "incorrect replication_key specified in bookmark for stream `{}`"
                    .format(k))
            else:
                self.assertFalse(
                    'version' in v,
                    msg=
                    "expected bookmark for stream `{}` to not have a version key"
                    .format(k))
                self.assertTrue(
                    'initial_full_table_complete' in v,
                    msg=
                    "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                    .format(k))

        print("state and bookmarks are correct")

        # verify incremental table_version didn't change
        incremental_table_new_table_version = bookmarks[
            'tap_tester_mysql_0-incremental']['version']

        self.assertEqual(
            incremental_table_initial_table_version,
            incremental_table_new_table_version,
            msg=
            "Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations."
        )