Example #1
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in self.tap_stream_ids()
        ]

        self.assertEqual(
            len(found_catalogs),
            2,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.tap_stream_ids().symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        found_table_names = set(map(lambda c: c['stream_name'],
                                    found_catalogs))
        tables_diff = self.table_names().symmetric_difference(
            found_table_names)
        self.assertEqual(
            len(tables_diff),
            0,
            msg="discovered schemas do not match: {}".format(tables_diff))

        print("discovered streams are correct")

        # Select all catalogs for full table sync
        for test_catalog in found_catalogs:
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'FULL_TABLE'
                }
            }]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, test_catalog,
                menagerie.get_annotated_schema(conn_id,
                                               test_catalog['stream_id']),
                additional_md)

        # Set state to mimic that a full table sync did not complete
        menagerie.set_state(conn_id, self.get_interrupted_state())
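        # For reference, the interrupted state set above could look roughly like this
        # (values illustrative; the real shape comes from self.get_interrupted_state()).
        # last_pk_fetched / max_pk_values record where the interrupted full table sync
        # stopped and are expected to be cleared once the sync completes:
        #
        #     {"bookmarks": {"<tap_stream_id>": {"initial_full_table_complete": False,
        #                                        "last_pk_fetched": {"id": 2},
        #                                        "max_pk_values": {"id": 4}}}}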

        # run full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the target output (schema, record count, message actions)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.table_names(), self.expected_pks())
        self.assertEqual(record_count_by_stream, {
            'full_table': 4,
            'full_table_composite_key': 4
        })

        records_by_stream = runner.get_records_from_target_output()
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                self.expected_schemas()[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

            messages_for_stream = recs['messages']
            message_actions = [rec['action'] for rec in messages_for_stream]

            self.assertEqual(
                message_actions,
                ['upsert', 'upsert', 'upsert', 'upsert', 'activate_version'])

        state = menagerie.get_state(conn_id)
        for tap_stream_id in self.tap_stream_ids():
            bookmark = state['bookmarks'][tap_stream_id]

            # last_pk_fetched and max_pk_values are cleared after success
            self.assertEqual(bookmark, {'initial_full_table_complete': True})
Example #2
    def test_zzzu_run_sync_mode(self):
        # Select our catalogs
        our_catalogs = menagerie.get_catalogs(S3TypesAndData.conn_id)
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                self.conn_id, c['stream_id'])
            metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated)

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, self.conn_id, self.expected_sync_streams(),
            self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())

        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                self.assertEqual(
                    record_count_by_stream.get(stream, 0),
                    S3TypesAndData.expected_stream_row_counts()[stream],
                    msg=
                    "actual rows: {}, expected_rows: {} for stream {} don't match"
                    .format(
                        record_count_by_stream.get(stream, 0),
                        S3TypesAndData.expected_stream_row_counts()[stream],
                        stream))

        print("total replicated row count: {}".format(replicated_row_count))

        synced_records = runner.get_records_from_target_output()

        # verify that when header is longer, the end columns have null values
        upsert_message_header_longer = [
            m for m in synced_records.get('header_longer').get('messages')
            if m['action'] == 'upsert'
        ]
        data_null = [
            d for d in upsert_message_header_longer
            if d["data"]["aa0"] == d["data"]["ab0"] == d["data"]["ac0"] ==
            d["data"]["ad0"] == d["data"]["ae0"] is None
        ]
        self.assertEqual(
            S3TypesAndData.expected_stream_row_counts()['header_longer'],
            len(data_null))

        # verify that when header is shorter, the _sdc_extra has the values
        upsert_message_header_shorter = [
            m for m in synced_records.get('header_shorter').get('messages')
            if m['action'] == 'upsert'
        ]
        s3_extra = [
            d for d in upsert_message_header_shorter
            if len(d["data"]["_sdc_extra"]) == 5
        ]
        self.assertEqual(
            S3TypesAndData.expected_stream_row_counts()['header_shorter'],
            len(s3_extra))

        # verify that when one row is longer and one is shorter, the longer row has _sdc_extra and the shorter row has null values
        upsert_message_rows_longer_shorter = [
            m for m in synced_records.get('rows_longer_and_shorter').get(
                'messages') if m['action'] == 'upsert'
        ]
        data_null = [
            d for d in upsert_message_rows_longer_shorter
            if d["data"]["v0"] == d["data"]["w0"] == d["data"]["x0"] ==
            d["data"]["y0"] == d["data"]["z0"] is None
        ]
        s3_extra = [
            d for d in upsert_message_rows_longer_shorter
            if len(d["data"].get("_sdc_extra", [])) == 5
        ]
        self.assertTrue(len(data_null) == len(s3_extra) == 1)
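    # A minimal sketch (not tap-s3-csv's actual code) of the header/row reconciliation the
    # assertions above rely on: rows shorter than the header get None for the trailing
    # columns, and rows longer than the header push the overflow values into "_sdc_extra".
    @staticmethod
    def reconcile_row_with_header(header, row):
        record = dict(zip(header, row))    # pairs values up to the shorter of the two
        for column in header[len(row):]:   # header longer than row -> trailing nulls
            record[column] = None
        if len(row) > len(header):         # row longer than header -> overflow bucket
            record["_sdc_extra"] = row[len(header):]
        return record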
Example #3
    def test_run(self):
        """
        Verify that for each stream you can get multiple pages of data
        when no fields are selected and only the automatic fields are replicated.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """
        print("\n\nRUNNING {}\n\n".format(self.name()))

        # Resetting tracked parent objects prior to test
        utils.reset_tracked_parent_objects()

        # ensure data exists for sync streams and set expectations
        _, existing_boards = utils.get_total_record_count_and_objects('boards')
        custom_fields_dict = {x: []
                              for x in self.expected_custom_fields()
                              }  # ids by stream
        custom_fields_by_board = {
            x.get('id'): copy.deepcopy(custom_fields_dict)
            for x in existing_boards
        }  # ids by stream

        # get existing custom fields for each board
        print("Getting objects on baord with static custom field set")
        for board_id, board_cfields in custom_fields_by_board.items():
            cfields = utils.get_custom_fields('boards', board_id)
            for field in self.expected_custom_fields():
                cfields_type_field = [f for f in cfields if f['type'] == field]
                if cfields_type_field:
                    board_cfields[field] += cfields_type_field

        # get expected cards with custom fields
        expected_records_cfields = list()
        board_id = utils.NEVER_DELETE_BOARD_ID
        all_cards_on_board = utils.get_objects('cards', parent_id=board_id)
        print("Setting custom fields expectations based on static data")
        for card in all_cards_on_board:
            card_with_cfields = utils.get_objects('cards',
                                                  obj_id=card.get('id'),
                                                  parent_id=board_id,
                                                  custom_fields=True)

            if card_with_cfields:
                expected_records_cfields += card_with_cfields

        # verify at least 1 record exists for each custom field type or else our assertions are invalid
        fields_exist = {x: False for x in self.expected_custom_fields()}
        for record in expected_records_cfields:
            if all(v for _, v in fields_exist.items()):
                break
            value = record.get('value')
            if value:
                key = next(iter(value))
                if key in self.expected_custom_fields(
                ) and not fields_exist.get(key):
                    fields_exist[key] = True
                elif key == 'checked':
                    fields_exist['checkbox'] = True
                elif key == 'option':
                    fields_exist['list'] = True

        self.assertTrue(all(v for _, v in fields_exist.items()),
                        msg="Not all custom field types have data. Data must be restored manually on Trello account" +\
                        "\nCurrent data: {}".format(fields_exist))

        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        # Select all streams and all fields
        self.select_all_streams_and_fields(conn_id,
                                           found_catalogs,
                                           select_all_fields=True)

        for cat in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, cat['stream_id'])
            for k in self.expected_automatic_fields()[cat['stream_name']]:
                mdata = next(
                    (m for m in catalog_entry['metadata']
                     if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                    None)
                print("Validating inclusion on {}: {}".format(
                    cat['stream_name'], mdata))
                self.assertTrue(
                    mdata and mdata['metadata']['inclusion'] == 'automatic')

        catalogs = menagerie.get_catalogs(conn_id)

        #clear state
        menagerie.set_state(conn_id, {})

        # run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # read target output
        first_record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      first_record_count_by_stream.values())
        synced_records = runner.get_records_from_target_output()

        # Verify target has records for all synced streams
        for stream, count in first_record_count_by_stream.items():
            assert stream in self.expected_sync_streams()
            self.assertGreater(
                count,
                0,
                msg="failed to replicate any data for: {}".format(stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Testing streams with custom fields
        for stream in self.testable_streams():
            with self.subTest(stream=stream):

                data = synced_records.get(stream)
                record_messages = [row['data'] for row in data['messages']]
                record_ids = [message.get('id') for message in record_messages]

                record_custom_fields = [
                    message.get('customFieldItems')
                    for message in record_messages
                    if message.get('customFieldItems', None)
                ]
                record_cfield_ids = []
                for record in record_custom_fields:
                    for cfield in record:
                        record_cfield_ids.append(cfield.get('id'))

                # Verify that we replicated the records with custom_fields
                for card in all_cards_on_board:
                    if card.get('id') in expected_records_cfields:
                        self.assertIn(
                            card.get('id'),
                            record_ids,
                            msg="Missing a record that has custom fields:\n{}".
                            format(card.get('id')))

                # Verify that we replicated the expected custom fields on those records
                for expected_cfield in expected_records_cfields:
                    self.assertIn(
                        expected_cfield.get('id'),
                        record_cfield_ids,
                        msg="Missing custom field from expected {} record id={}"
                        .format(stream, expected_cfield.get('id')))

                    # Verify the expected custom field attributes match the replicated data
                    expected_cfield_replicated = False
                    for actual_cfields in record_custom_fields:
                        expected_cfield_replicated = expected_cfield in actual_cfields
                        if expected_cfield_replicated:
                            break
                    self.assertTrue(expected_cfield_replicated)

        # Reset the parent objects that we have been tracking
        utils.reset_tracked_parent_objects()
Example #4
    def test_run(self):
        """Test we get a lot of data back based on the start date configured in base"""

        streams_under_test = self.streams_under_test()

        conn_id = self.create_connection_with_initial_discovery()

        # Select streams and all fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        our_catalogs = [catalog for catalog in found_catalogs if
                        catalog.get('tap_stream_id') in streams_under_test]
        self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_sync(conn_id)

        # get results
        first_sync_records = runner.get_records_from_target_output()
        state = menagerie.get_state(conn_id)

        # set the start date for a new connection based off state
        bookmarked_values = []
        expected_replication_keys_by_stream = self.expected_replication_keys()
        for stream in streams_under_test:
            replication_key = list(expected_replication_keys_by_stream[stream])[0]
            bookmarked_values.append(state['bookmarks'][stream][replication_key])

        # grab the most recent bookmark from state
        greatest_bookmark_value = sorted(bookmarked_values)[-1].split("T")[0]
        start_date = self.timedelta_formatted(greatest_bookmark_value, days=-1, str_format="%Y-%m-%d") # BUG_TDL-19582
        # start_date = self.timedelta_formatted(greatest_bookmark_value, days=0, str_format="%Y-%m-%d")  # BUG_TDL-19582
        self.start_date = start_date + "T00:00:00Z"

        # create a new connection with the new, more recent start_date
        conn_id = self.create_connection_with_initial_discovery(original_properties=False)

        # Select all streams and all fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        our_catalogs = [catalog for catalog in found_catalogs if
                        catalog.get('tap_stream_id') in streams_under_test]
        self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

        # Run a sync job using orchestrator with a more recent start date
        second_sync_record_count = self.run_sync(conn_id)

        # get results
        second_sync_records = runner.get_records_from_target_output()

        for stream in streams_under_test:
            with self.subTest(stream=stream):

                # gather expectations
                replication_key = list(expected_replication_keys_by_stream[stream])[0]

                # get results
                record_messages = [message['data']
                                   for message in second_sync_records[stream]['messages']
                                   if message.get('action') == 'upsert']
                if stream == 'issues':
                    replication_key_values = [record_message['fields'][replication_key] for record_message in record_messages]
                else:
                    replication_key_values = [record_message[replication_key] for record_message in record_messages]
                max_replication_key_value = sorted(replication_key_values)[-1]

                # verify that each stream replicated fewer records than in the first connection's sync
                self.assertGreater(
                    first_sync_record_count.get(stream, 0),
                    second_sync_record_count.get(stream, 0),
                    msg="second had more records, start_date usage not verified",
                    logging="verify less records are replicated with a more recent start date"
                )

                # verify all data from 2nd sync >= start_date
                self.assertGreaterEqual(
                    parse(max_replication_key_value), parse(self.start_date),
                    logging="verify on second sync no records are replicated prior to the start date"
                )
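    # A minimal sketch of the timedelta_formatted helper used above, assuming the test base
    # class implements it roughly as "parse the date string, shift it by `days`, and re-emit
    # it in str_format" (the name and signature are inferred from the calls in this test).
    @staticmethod
    def timedelta_formatted(date_str, days=0, str_format="%Y-%m-%d"):
        from datetime import datetime, timedelta
        parsed = datetime.strptime(date_str, str_format)
        return (parsed + timedelta(days=days)).strftime(str_format)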
Example #5
    def test_discovery(self):
        """
        Verify that discover creates the appropriate catalog, schema, metadata, etc.
        """
        found_catalogs = menagerie.get_catalogs(S3TypesAndData.conn_id)

        # verify that the number of streams is correct based on the configuration
        self.assertEqual(
            len(found_catalogs), len(self.expected_streams()),
            "The number of catalogs doesn't match "
            "the number of tables in the configuration")

        # verify the stream names are the names in the config file (with " " converted to "_")
        found_stream_names = {x["stream_name"] for x in found_catalogs}
        self.assertEqual(found_stream_names, self.expected_stream_names())

        # verify the number of top level objects in the schema is correct
        for catalog in found_catalogs:
            with self.subTest(c=catalog):
                stream_name = catalog["stream_name"]
                files_for_stream = list(
                    EXPECTED_STREAMS_TO_RESOURCES[stream_name])
                expected_properties = S3TypesAndData.columns_in_header_of_csv_file(
                    files_for_stream).union(
                        S3TypesAndData.stitch_added_columns())

                metadata_and_annotated_schema = menagerie.get_annotated_schema(
                    S3TypesAndData.conn_id, catalog['stream_id'])
                annotated_schema = metadata_and_annotated_schema[
                    "annotated-schema"]
                metadata = metadata_and_annotated_schema["metadata"]

                # verify that the annotated schema has the correct number of properties
                self.assertEqual(
                    len(expected_properties),
                    len(annotated_schema.get("properties").keys()))

                # verify that the metadata has the correct number of breadcrumbs with properties
                properties_metadata = [
                    x for x in metadata if "properties" in x.get("breadcrumb")
                ]
                self.assertEqual(len(expected_properties),
                                 len(properties_metadata))

                # verify that all non-pk properties are given an inclusion of available in the annotated schema.
                expected_key_properties = \
                    S3TypesAndData.expected_pks()[stream_name]
                expected_not_pk_properties = expected_properties.difference(
                    expected_key_properties)
                actual_available_properties = {
                    k
                    for k, v in annotated_schema["properties"].items()
                    if v.get("inclusion") == "available"
                }
                self.assertEqual(actual_available_properties,
                                 expected_not_pk_properties)

                # verify that all non-pk properties are given an inclusion of available in metadata.
                # make sure that we use problematic characters for header names
                #   - space" ", dash"-", underscore"_", comma"," etc.
                actual_available_properties = \
                    {item.get("breadcrumb", ["", ""])[1]
                     for item in metadata
                     if item.get("metadata").get("inclusion") == "available"}
                self.assertEqual(actual_available_properties,
                                 expected_not_pk_properties)
Example #6
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        #select all catalogs
        for catalog in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog,
                menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        future_time = "2050-01-01T00:00:00.000000Z"

        #clear state
        future_bookmarks = {
            "currently_syncing": None,
            "bookmarks": {
                "contacts": {
                    "offset": {},
                    "versionTimestamp": future_time
                },
                "subscription_changes": {
                    "startTimestamp": future_time,
                    "offset": {}
                },
                "campaigns": {
                    "offset": {}
                },
                "forms": {
                    "updatedAt": future_time
                },
                "deals": {
                    "offset": {},
                    "hs_lastmodifieddate": future_time
                },
                "workflows": {
                    "updatedAt": future_time
                },
                "owners": {
                    "updatedAt": future_time
                },
                "contact_lists": {
                    "updatedAt": future_time,
                    "offset": {}
                },
                "email_events": {
                    "startTimestamp": future_time,
                    "offset": {}
                },
                "companies": {
                    "offset": {},
                    "hs_lastmodifieddate": future_time
                },
                "engagements": {
                    "lastUpdated": future_time,
                    "offset": {}
                }
            }
        }

        menagerie.set_state(conn_id, future_bookmarks)

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        #because the bookmarks were set into the future, we should NOT actually replicate any data.
        #minus campaigns and deal_pipelines, because those endpoints do NOT support bookmarks
        streams_with_bookmarks = self.expected_sync_streams()
        streams_with_bookmarks.remove('campaigns')
        streams_with_bookmarks.remove('deal_pipelines')
        bad_streams = streams_with_bookmarks.intersection(
            record_count_by_stream.keys())
        self.assertEqual(
            len(bad_streams),
            0,
            msg="still pulled down records from {} despite future bookmarks".
            format(bad_streams))

        state = menagerie.get_state(conn_id)

        # NB: Companies and engagements won't set a bookmark in the future.
        state["bookmarks"].pop("companies")
        state["bookmarks"].pop("engagements")
        future_bookmarks["bookmarks"].pop("companies")
        future_bookmarks["bookmarks"].pop("engagements")

        self.assertEqual(
            state,
            future_bookmarks,
            msg=
            "state should not have been modified because we didn't replicate any data"
        )
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())
Example #7
    def run_single_projection(self, projection_mapping):
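        # The projection_mapping argument is assumed to look roughly like this
        # (illustrative values; only the keys used below are shown):
        #
        #     {"projection": {"name": 1},           # Mongo projection document, or None
        #      "expected_keys": [{"_id", "name"}]}  # acceptable key sets for upserted records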
        self.setUpDatabase()
        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id']
                          for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in found_catalogs
                if c['tap_stream_id'] == tap_stream_id
            ][0]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[found_stream['stream_name']],
                set(
                    found_stream.get('metadata',
                                     {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(
                self.expected_row_counts()[found_stream['stream_name']],
                found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Initial Full Table ---------
        #  -----------------------------------
        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'LOG_BASED'
                }
            }]
            if projection_mapping['projection'] is not None:
                additional_md[0]['metadata'][
                    'tap_mongodb.projection'] = json.dumps(
                        projection_mapping['projection'])
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        for stream_name in self.expected_sync_streams():
            stream_records = [
                x for x in messages_by_stream[stream_name]['messages']
                if x.get('action') == 'upsert'
            ]
            #actual_keys = set()

            for record in stream_records:
                self.assertIn(record['data'].keys(),
                              projection_mapping['expected_keys'])
                #actual_keys = actual_keys.union(set(record['data'].keys()))

            #self.assertTrue(actual_keys.issubset(projection_mapping['expected_keys']))

        self.modify_database()

        #  -----------------------------------
        # ----------- Subsequent Oplog Sync ---------
        #  -----------------------------------

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        for stream_name in self.expected_sync_streams():
            stream_records = [
                x for x in messages_by_stream[stream_name]['messages']
                if x.get('action') == 'upsert'
            ]
            #actual_keys = set()
            for record in stream_records:
                self.assertIn(record['data'].keys(),
                              projection_mapping['expected_keys'])
Example #8
    def test_run(self):
        """
        Verify that a bookmark doesn't exist for the stream
        Verify that the second sync includes the same number or more records than the first sync
        Verify that all records in the first sync are included in the second sync
        Verify that the sync only sent records to the target for selected streams (catalogs)

        PREREQUISITE
        For EACH stream that is fully replicated there are multiple rows of data with
            different values for the replication key
        """
        conn_id = self.create_connection()

        # Select all streams and all fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        full_streams = {
            key
            for key, value in self.expected_replication_method().items()
            if value == self.FULL
        }
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in full_streams
        ]
        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(set(first_sync_record_count.keys()), full_streams)

        first_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a first sync
        first_sync_records = runner.get_records_from_target_output()

        # Run a second sync job using orchestrator
        second_sync_record_count = self.run_sync(conn_id)

        # Get the set of records from a second sync
        second_sync_records = runner.get_records_from_target_output()

        # THIS MAKES AN ASSUMPTION THAT CHILD STREAMS DO NOT NEED TESTING.
        # ADJUST IF NECESSARY
        for stream in full_streams.difference(self.child_streams()):
            with self.subTest(stream=stream):

                # verify there are no bookmark values from state
                state_value = first_sync_state.get("bookmarks", {}).get(stream)
                self.assertIsNone(state_value)

                # verify that there is more than 1 record of data - setup necessary
                self.assertGreater(
                    first_sync_record_count.get(stream, 0),
                    1,
                    msg="Data isn't set up to be able to test full sync")

                # verify that you get the same or more data the 2nd time around
                self.assertGreaterEqual(
                    second_sync_record_count.get(stream, 0),
                    first_sync_record_count.get(stream, 0),
                    msg=
                    "second syc didn't have more records, full sync not verified"
                )

                # verify all data from 1st sync included in 2nd sync
                first_data = [
                    record["data"] for record in first_sync_records.get(
                        stream, {}).get("messages", [])
                ]
                second_data = [
                    record["data"] for record in second_sync_records.get(
                        stream, {}).get("messages", [])
                ]

                same_records = 0
                for first_record in first_data:
                    first_value = json.dumps(first_record, sort_keys=True)

                    for compare_record in second_data:
                        compare_value = json.dumps(compare_record,
                                                   sort_keys=True)

                        if first_value == compare_value:
                            second_data.remove(compare_record)
                            same_records += 1
                            break

                self.assertEqual(
                    len(first_data),
                    same_records,
                    msg=
                    "Not all data from the first sync was in the second sync")
Example #9
    def test_run(self):
        (table_configs, conn_id, expected_streams) = self.pre_sync_test()
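        # Each table config returned by pre_sync_test() is assumed to look roughly like
        # this (illustrative values; only the keys used below are shown):
        #
        #     {"TableName": "simple_table_1",
        #      "ProjectionExpression": "int_id, map_field, list_field",
        #      "top_level_keys": {"int_id", "map_field", "list_field"},
        #      "top_level_list_keys": {"list_field"},
        #      "nested_map_keys": {"map_field": {"key_1", "key_2"}}}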

        # Select the discovered streams and add replication-method metadata
        found_catalogs = menagerie.get_catalogs(conn_id)
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method':
                    'LOG_BASED',
                    'tap-mongodb.projection':
                    table_configs[0]['ProjectionExpression']
                }
            }]
            connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        self.first_sync_test(table_configs, conn_id, expected_streams)

        ################################
        # Run sync SECOND TIME and check that no records came through
        ################################
        # Disabling streams forces shards to close
        self.disableStreams(expected_streams)
        sync_job_name = runner.run_sync_mode(self, conn_id)
        self.enableStreams(expected_streams)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        # Check that we only have 1 message (activate_version) on syncing
        # a stream without changes
        for stream in messages_by_stream.values():
            self.assertEqual(1, len(stream['messages']))

        menagerie.get_state(conn_id)

        # Add 10 rows to the DB
        self.addMoreData(10)
        # Delete some rows
        self.deleteData(range(40, 50))
        # Change some rows
        self.updateData(10, 60, 'boolean_field', False)

        ################################
        # Run sync THIRD TIME and check that records did come through
        ################################
        # Disabling streams forces shards to close
        self.disableStreams(expected_streams)
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        for config in table_configs:
            table_name = config['TableName']

            for message in messages_by_stream[table_name]['messages']:
                if message['action'] == 'upsert':
                    if not message['data'].get('_sdc_deleted_at'):
                        top_level_keys = {*message['data'].keys()}
                        self.assertEqual(config['top_level_keys'],
                                         top_level_keys)
                        for list_key in config['top_level_list_keys']:
                            self.assertTrue(
                                isinstance(message['data'][list_key], list))
                        self.assertEqual(
                            config['nested_map_keys']['map_field'],
                            {*message['data']['map_field'].keys()})

        # Check that 31 messages came through (10 upserts, 10 deletes, 10 updated records, and 1 activate_version)
        for stream in messages_by_stream.values():
            self.assertEqual(31, len(stream['messages']))

        menagerie.get_state(conn_id)
Example #10
    def discovery_test(self, conn_id):
        """
        Basic Discovery Test for a database tap.

        Test Description:
          Ensure discovery runs without errors and generates a catalog of the expected form

        Test Cases:
            - Verify discovery generated the expected catalogs by name.
            - Verify that the table_name is in the format <collection_name> for each stream.
            - Verify the catalog is found for a given stream.
            - Verify there is only 1 top level breadcrumb in metadata for a given stream.
            - Verify replication key(s) match expectations for a given stream.
            - Verify primary key(s) match expectations for a given stream.
            - Verify the replication method matches our expectations for a given stream.
            - Verify that only primary keys are given the inclusion of automatic in metadata
              for a given stream.
            - Verify expected unsupported fields are given the inclusion of unsupported in
              metadata for a given stream.
            - Verify that all fields for a given stream which are not unsupported or automatic
              have inclusion of available.
            - Verify row-count metadata matches expectations for a given stream.
            - Verify selected metadata is None for all streams.
            - Verify is-view metadata is False for a given stream.
            - Verify no forced-replication-method is present in metadata for a given stream.
            - Verify schema and db match expectations for a given stream.
            - Verify schema types match expectations for a given stream.
        """
        ##########################################################################
        ### TODO
        ###   [] Generate multiple tables (streams) and maybe dbs too?
        ###   [] Investigate potential bug, see DOCS_BUG_1
        ##########################################################################

        # run discovery (check mode)
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify discovery generated a catalog
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0)

        # Verify discovery generated the expected catalogs by name
        found_catalog_names = {
            catalog['stream_name']
            for catalog in found_catalogs
        }
        self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

        # Verify that the table_name is in the format <collection_name> for each stream
        found_catalog_stream_ids = {
            catalog['tap_stream_id']
            for catalog in found_catalogs
        }
        self.assertSetEqual(self.expected_check_stream_ids(),
                            found_catalog_stream_ids)

        # Test by stream
        for stream in self.expected_check_streams():
            with self.subTest(stream=stream):

                # Verify the catalog is found for a given stream
                catalog = next(
                    iter([
                        catalog for catalog in found_catalogs
                        if catalog["stream_name"] == stream
                    ]))
                self.assertTrue(isinstance(catalog, dict))

                # collecting expected values
                expected_primary_keys = self.expected_primary_keys()[stream]
                expected_replication_keys = set()
                expected_unsupported_fields = self.expected_unsupported_fields(
                )
                expected_fields_to_datatypes = self.expected_schema_types()
                expected_row_count = len(self.recs)

                # collecting actual values...
                schema_and_metadata = menagerie.get_annotated_schema(
                    conn_id, catalog['stream_id'])
                stream_metadata = schema_and_metadata["metadata"]
                top_level_metadata = [
                    item for item in stream_metadata
                    if item.get("breadcrumb") == []
                ]
                stream_properties = top_level_metadata[0]['metadata']
                actual_primary_keys = set(
                    stream_properties.get(self.PRIMARY_KEYS, []))
                actual_replication_keys = set(
                    stream_properties.get(self.REPLICATION_KEYS, []))
                actual_replication_method = stream_properties.get(
                    self.REPLICATION_METHOD)
                actual_automatic_fields = set(
                    item.get("breadcrumb", ["properties", None])[1]
                    for item in stream_metadata
                    if item.get("metadata").get("inclusion") == "automatic")
                actual_unsupported_fields = set(
                    item.get("breadcrumb", ["properties", None])[1]
                    for item in stream_metadata
                    if item.get("metadata").get("inclusion") == "unsupported")
                actual_fields_to_datatypes = {
                    item['breadcrumb'][1]: item['metadata'].get('sql-datatype')
                    for item in stream_metadata[1:]
                }

                # Verify there is only 1 top level breadcrumb in metadata
                self.assertEqual(1, len(top_level_metadata))

                # Verify replication key(s) match expectations
                self.assertSetEqual(expected_replication_keys,
                                    actual_replication_keys)

                # NB | We expect primary keys and replication keys to have inclusion automatic for
                #      key-based incremental replication. But that is only true for primary keys here.
                #      This BUG should not be carried over into hp-postgres, but will not be fixed for this tap.

                # Verify primary key(s) match expectations
                self.assertSetEqual(
                    expected_primary_keys,
                    actual_primary_keys,
                )

                # Verify the replication method matches our expectations
                self.assertIsNone(actual_replication_method)

                # Verify that only primary keys
                # are given the inclusion of automatic in metadata.
                self.assertSetEqual(expected_primary_keys,
                                    actual_automatic_fields)

                # DOCS_BUG_1 ? | The following types were converted and selected, but docs say unsupported.
                #                Still need to investigate how the tap handles values of these datatypes
                #                during sync.
                KNOWN_MISSING = {
                    'invalid_bigserial',  # BIGSERIAL -> bigint
                    'invalid_serial',  # SERIAL -> integer
                    'invalid_smallserial',  # SMALLSERIAL -> smallint
                }
                # Verify expected unsupported fields
                # are given the inclusion of unsupported in metadata.
                self.assertSetEqual(expected_unsupported_fields,
                                    actual_unsupported_fields | KNOWN_MISSING)

                # Verify that all other fields have inclusion of available
                # This assumes there are no unsupported fields for SaaS sources
                self.assertTrue(
                    all(
                        {
                            item.get("metadata").get(
                                "inclusion") == "available"
                            for item in stream_metadata
                            if item.get("breadcrumb", []) != []
                            and item.get("breadcrumb", ["properties", None])[1]
                            not in actual_automatic_fields
                            and item.get("breadcrumb", ["properties", None])
                            [1] not in actual_unsupported_fields
                        }),
                    msg=
                    "Not all non key properties are set to available in metadata"
                )

                # Verify row-count metadata matches expectations
                self.assertEqual(expected_row_count,
                                 stream_properties['row-count'])

                # Verify selected metadata is None for all streams
                self.assertNotIn('selected', stream_properties.keys())

                # Verify is-view metadata is False
                self.assertFalse(stream_properties['is-view'])

                # Verify no forced-replication-method is present in metadata
                self.assertNotIn(self.REPLICATION_METHOD,
                                 stream_properties.keys())

                # Verify schema and db match expectations
                self.assertEqual(test_schema_name,
                                 stream_properties['schema-name'])
                self.assertEqual(test_db, stream_properties['database-name'])

                # Verify schema types match expectations
                self.assertDictEqual(expected_fields_to_datatypes,
                                     actual_fields_to_datatypes)
Example #11
    def test_run(self):
        """
        Verify that for each stream you can get data when no fields are selected
        and only the automatic fields are replicated.
        """

        print("\n\nRUNNING {}\n\n".format(self.name()))

        # Initialize start_date state to capture ad_reports records
        self.START_DATE = self.timedelta_formatted(self.REPORTS_START_DATE, -1)
        self.END_DATE = self.REPORTS_END_DATE
        print(
            "INCREMENTAL STREAMS RELY ON A STATIC DATA SET. SO WE TEST WITH:\n"
            + "  START DATE 1 | {}\n".format(self.START_DATE) +
            "  END DATE 2 | {}".format(self.END_DATE))

        # ensure data exists for sync streams and set expectations
        expected_records_all = {x: []
                                for x in self.expected_streams()
                                }  # all fields selected
        expected_records_auto = {x: []
                                 for x in self.expected_streams()
                                 }  # no fields selected
        for stream in self.testable_streams():
            start_date = self.parse_date(self.START_DATE)
            end_date = self.parse_date(self.END_DATE)
            existing_objects = self.client.get_all(stream, start_date,
                                                   end_date)

            assert existing_objects, "Test data is not properly set for {}, test will fail.".format(
                stream)
            print("Data exists for stream: {}".format(stream))
            for obj in existing_objects:
                expected_records_all[stream].append(obj)
                expected_records_auto[stream].append({
                    field: obj.get(field)
                    for field in self.expected_automatic_fields().get(stream)
                })

        # format expected data to match expected output of tap
        self.format_expected_data(expected_records_all)

        # Instantiate connection with default start/end dates
        conn_id = connections.ensure_connection(self,
                                                original_properties=False)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        ##########################################################################
        ### ALL FIELDS SYNC
        ##########################################################################

        # Select all available fields from all streams
        exclude_streams = self.expected_streams().difference(
            self.testable_streams())
        self.select_all_streams_and_fields(conn_id=conn_id,
                                           catalogs=found_catalogs,
                                           select_all_fields=True,
                                           exclude_streams=exclude_streams)

        catalogs = menagerie.get_catalogs(conn_id)

        # Ensure our selection worked
        for cat in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, cat['stream_id'])
            # Verify only testable streams are selected
            selected = catalog_entry.get('annotated-schema').get('selected')
            print("Validating selection on {}: {}".format(
                cat['stream_name'], selected))
            if not cat['stream_name'] in self.testable_streams():
                # 'selected' is expected to be None when menagerie "deselects" a stream whose fields are {'inclusion': 'available'}
                self.assertTrue(selected is None,
                                msg="Stream is selected, but shouldn't be.")
                continue
            self.assertTrue(selected, msg="Stream not selected.")

            # Verify all fields within each selected stream are selected
            for field, field_props in catalog_entry.get(
                    'annotated-schema').get('properties').items():
                field_selected = field_props.get('selected')
                print("\tValidating selection on {}.{}: {}".format(
                    cat['stream_name'], field, field_selected))
                self.assertTrue(field_selected, msg="Field not selected.")

        #clear state
        menagerie.set_state(conn_id, {})

        # run sync with all fields selected
        sync_job_name_all = runner.run_sync_mode(self, conn_id)

        # Verify tap exit codes
        exit_status_all = menagerie.get_exit_status(conn_id, sync_job_name_all)
        menagerie.verify_sync_exit_status(self, exit_status_all,
                                          sync_job_name_all)

        # read target output
        record_count_by_stream_all = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        replicated_row_count_all = reduce(lambda accum, c: accum + c,
                                          record_count_by_stream_all.values())
        synced_records_all = runner.get_records_from_target_output()

        # Verify target has records for all synced streams
        for stream, count in record_count_by_stream_all.items():
            assert stream in self.expected_streams()
            if stream in self.testable_streams():
                self.assertGreater(
                    count,
                    0,
                    msg="failed to replicate any data for: {}".format(stream))
        print(
            "total replicated row count: {}".format(replicated_row_count_all))

        ##########################################################################
        ### AUTOMATIC FIELDS SYNC
        ##########################################################################

        # Instantiate connection with default start/end dates
        conn_id = connections.ensure_connection(self,
                                                original_properties=False)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        # Select no available fields (only automatic) for all testable streams
        self.select_all_streams_and_fields(conn_id=conn_id,
                                           catalogs=found_catalogs,
                                           select_all_fields=False,
                                           exclude_streams=exclude_streams)

        catalogs = menagerie.get_catalogs(conn_id)

        # Ensure our selection worked
        for cat in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, cat['stream_id'])
            # Verify all streams are selected
            selected = catalog_entry.get('annotated-schema').get('selected')
            print("Validating selection on {}: {}".format(
                cat['stream_name'], selected))
            if cat['stream_name'] not in self.testable_streams():
                self.assertTrue(selected is None,
                                msg="Stream is selected, but shouldn't be.")
                continue
            self.assertTrue(selected, msg="Stream not selected.")

            # Verify only automatic fields are selected
            for field, field_props in catalog_entry.get(
                    'annotated-schema').get('properties').items():
                field_selected = field_props.get('selected')
                print("\tValidating selection on {}.{}: {}".format(
                    cat['stream_name'], field, field_selected))

                if field in self.expected_automatic_fields().get(
                        cat['stream_name']):
                    # NOTE: AUTOMATIC FIELDS IGNORE THE SELECTED md {'selected': None}
                    print(
                        "NOTE: selection for {} is ignored by the Transformer "
                        .format(field) +
                        "so long as 'inclusion' = 'automatic'")
                else:
                    self.assertFalse(
                        field_selected,
                        msg="Field is selected but not automatic.")

        # run sync with no fields selected (only automatic)
        sync_job_name_auto = runner.run_sync_mode(self, conn_id)

        # Verify tap exit codes
        exit_status_auto = menagerie.get_exit_status(conn_id,
                                                     sync_job_name_auto)
        menagerie.verify_sync_exit_status(self, exit_status_auto,
                                          sync_job_name_auto)

        # read target output
        record_count_by_stream_auto = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        replicated_row_count_auto = reduce(
            lambda accum, c: accum + c, record_count_by_stream_auto.values())
        synced_records_auto = runner.get_records_from_target_output()

        # Verify target has records for all synced streams
        for stream, count in record_count_by_stream_auto.items():
            assert stream in self.expected_streams()
            if stream in self.testable_streams():
                self.assertGreater(
                    count,
                    0,
                    msg="failed to replicate any data for: {}".format(stream))
        print(
            "total replicated row count: {}".format(replicated_row_count_auto))

        # Test by Stream
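        # Compare the output of the all-fields sync and the automatic-fields sync stream by stream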
        for stream in self.testable_streams():
            with self.subTest(stream=stream):
                ##########################################################################
                ### TESTING ALL FIELDS
                ##########################################################################

                data = synced_records_all.get(stream)
                record_messages_keys = [
                    set(row['data'].keys()) for row in data['messages']
                ]
                expected_keys = expected_records_all.get(stream)[0].keys()

                # Verify schema covers all fields
                schema_keys = set(self.expected_schema_keys(stream))
                self.assertEqual(
                    set(),
                    set(expected_keys).difference(schema_keys),
                    msg="\nFields missing from schema: {}\n".format(
                        set(expected_keys).difference(schema_keys)))

                # not a test, just logging the fields that are included in the schema but not in the expectations
                if schema_keys.difference(set(expected_keys)):
                    print(
                        "WARNING Fields missing from expectations: {}".format(
                            schema_keys.difference(set(expected_keys))))

                # Verify that all fields are sent to the target
                for actual_keys in record_messages_keys:
                    self.assertEqual(
                        actual_keys.symmetric_difference(schema_keys),
                        set(),
                        msg="Expected all fields, as defined by schemas/{}.json"
                        .format(stream))

                actual_records = [row['data'] for row in data['messages']]
                expected_records = expected_records_all.get(stream)

                # Verify the number of records match expectations
                self.assertEqual(len(expected_records),
                                 len(actual_records),
                                 msg="Number of actual records do match expectations. " +\
                                 "Check expectations, check for duplicate records in Target.")

                # verify there are no dup records in the target
                already_tracked = []
                for actual_record in actual_records:
                    if actual_record in already_tracked:
                        continue
                    already_tracked.append(actual_record)
                self.assertEqual(len(already_tracked),
                                 len(actual_records),
                                 msg="DUPLICATES PRESENT")

                # verify by values, that we replicated the expected records
                for actual_record in actual_records:
                    self.assertTrue(
                        actual_record in expected_records,
                        msg="Actual record missing from expectations\n" +
                        "Actual Record: {}".format(actual_record))
                for expected_record in expected_records:
                    self.assertTrue(
                        expected_record in actual_records,
                        msg="Expected record missing from target." +
                        "Expected Record: {}".format(expected_record))

                ##########################################################################
                ### TESTING AUTOMATIC FIELDS
                ##########################################################################

                data = synced_records_auto.get(stream)
                record_messages_keys = [
                    set(row['data'].keys()) for row in data['messages']
                ]
                expected_keys = self.expected_automatic_fields().get(stream)

                # Verify that only the automatic fields are sent to the target
                for actual_keys in record_messages_keys:
                    self.assertEqual(
                        actual_keys.symmetric_difference(expected_keys),
                        set(),
                        msg="Expected automatic fields and nothing else.")

                actual_records = [row['data'] for row in data['messages']]
                expected_records = expected_records_auto.get(stream)

                #Verify the number of records match expectations
                self.assertEqual(len(expected_records),
                                 len(actual_records),
                                 msg="Number of actual records do match expectations. " +\
                                 "We probably have duplicate records.")

                # verify there are no dup records in the target
                already_tracked = []
                for actual_record in actual_records:
                    if actual_record in already_tracked:
                        continue
                    already_tracked.append(actual_record)
                self.assertEqual(len(already_tracked),
                                 len(actual_records),
                                 msg="DUPLICATES PRESENT")

                # verify by values, that we replicated the expected records
                for actual_record in actual_records:
                    self.assertTrue(
                        actual_record in expected_records,
                        msg="Actual record missing from expectations\n" +
                        "Actual Record: {}".format(actual_record))
                for expected_record in expected_records:
                    self.assertTrue(
                        expected_record in actual_records,
                        msg="Expected record missing from target." +
                        "Expected Record: {}".format(expected_record))
Ejemplo n.º 12
0
    def test_run(self):
        """
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """

        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
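                # last_row_data is True when the final message is a data record emitted by the log-based portion of the sync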
                last_row_data = bool(
                    records_by_stream[stream]['messages'][-1].get("data"))

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([
                        m for m in records_by_stream[stream]['messages'][1:]
                        if m["action"] == "activate_version"
                    ]),
                    2,
                    msg=
                    "Expect 2 more activate version messages for end of full table and beginning of log based"
                )

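                # Build the expected upsert messages by pairing each column name with its expected value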
                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]
                            ['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # dates are emitted as datetimes at UTC midnight
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))

                # Verify all data is correct for the log replication part if sent
                if records_by_stream[stream]['messages'][-1].get("data"):
                    # compare the final (log-based) message against the last expected row
                    actual_row = records_by_stream[stream]['messages'][-1]
                    for column_name, expected_value in expected_messages[-1][
                            "data"].items():
                        if isinstance(expected_value, datetime):
                            # sql server only keeps milliseconds not microseconds
                            self.assertEqual(
                                expected_value.isoformat().replace(
                                    '000+00:00', 'Z').replace('+00:00', 'Z'),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, time):
                            # sql server time has second resolution only
                            self.assertEqual(
                                expected_value.replace(
                                    microsecond=0).isoformat().replace(
                                        '+00:00', ''),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, date):
                            # dates are emitted as datetimes at UTC midnight
                            self.assertEqual(
                                expected_value.isoformat() + 'T00:00:00+00:00',
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name]))
                        else:
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value,
                                    actual_row["data"][column_name]))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                initial_log_version = bookmark['current_log_version']

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

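        # Apply an insert (pk 6), an update (pk 2), and a delete (pk 3) so the log-based
        # portion of the next sync has changes to capture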
        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "dates_and_times"
        column_name = [
            "pk", "just_a_date", "date_and_time",
            "bigger_range_and_precision_datetime", "datetime_with_timezones",
            "datetime_no_seconds", "its_time"
        ]
        new_date_value = datetime(2019, 7, 22, 21, 11, 40, 573000)
        insert_value = [
            (6, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(hours=10,
                                                minutes=5))).isoformat(),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        update_value = [
            (2, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(hours=10,
                                                minutes=5))).isoformat(),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        delete_value = [(3, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [
            (6, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(
                          hours=10, minutes=5))).astimezone(timezone.utc),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        update_value = [
            (2, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(
                          hours=10, minutes=5))).astimezone(timezone.utc),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
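        # Pad the expected rows with an _sdc_deleted_at value: None for the inserted and
        # updated rows, an approximate deletion timestamp for the deleted row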
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [(3, None, None, None, None, None, None,
                         datetime.utcnow())]
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"]["values"] =  \
            [self.expected_metadata()["data_types_database_dbo_dates_and_times"]["values"][-1]] + \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

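                # Expected upserts for this sync, built from the updated expected metadata values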
                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if column_name != "_sdc_deleted_at":
                                if isinstance(expected_value, datetime):
                                    # sql server only keeps milliseconds not microseconds
                                    self.assertEqual(
                                        expected_value.isoformat().replace(
                                            '000+00:00', 'Z').replace(
                                                '+00:00',
                                                'Z').replace('000', 'Z'),
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.isoformat().replace(
                                                '000+00:00', 'Z').replace(
                                                    '+00:00',
                                                    'Z').replace('000', 'Z'),
                                            actual_row["data"][column_name]))
                                elif isinstance(expected_value, time):
                                    # sql server time has second resolution only
                                    self.assertEqual(
                                        expected_value.replace(
                                            microsecond=0).isoformat().replace(
                                                '+00:00', ''),
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.isoformat().replace(
                                                '+00:00', 'Z'),
                                            actual_row["data"][column_name]))
                                elif isinstance(expected_value, date):
                                    # dates are emitted as datetimes at UTC midnight
                                    self.assertEqual(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.isoformat() +
                                            'T00:00:00+00:00',
                                            actual_row["data"][column_name]))
                                else:
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value,
                                            actual_row["data"][column_name]))

                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(
                                    actual_value,
                                    expected_value - timedelta(seconds=15))
                                self.assertLessEqual(
                                    actual_value,
                                    expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted so we can either not pass the column or it can be None
                                self.assertIsNone(
                                    actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version,
                                   initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
Ejemplo n.º 13
0
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'INCREMENTAL',
                                                         'replication-key': 'replication_key_column'}}]

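        # No columns are deselected in this test, so every column should be replicated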
        non_selected_properties = []

        BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream]['table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1]]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]
                replication_column = column_names.index("replication_key_column")
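                # Sort the expected rows the way the test expects the tap to emit them:
                # NULL replication keys first, then ascending replication key values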
                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, row_values))
                            if column not in non_selected_properties
                        }
                    } for row_values in sorted(stream_expected_data[self.VALUES],
                                               key=lambda row: (row[replication_column] is not None, row[replication_column]))
                ]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            self.assertEqual(expected_value, actual_row["data"][column_name],
                                             msg="expected: {} != actual {}".format(
                                                 expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(bookmark['replication_key_value'],
                                 max([row[replication_column] for row in stream_expected_data[self.VALUES]
                                      if row[replication_column] is not None]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_column')

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

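        # Insert, update, and delete rows in each table; the expected values below reflect
        # what the next incremental sync should replicate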
        database_name = "constraints_database"
        schema_name = "dbo"
        table_name = "no_constraints"
        column_name = ["replication_key_column"]
        insert_value = [(49, )]
        update_value = [(3, )]
        delete_value = [(0, )]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name))
        query_list.extend([
            "UPDATE constraints_database.dbo.no_constraints "
            "SET replication_key_column = 3 "
            "WHERE replication_key_column = 1"])
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_no_constraints"]["values"] = \
            [(2, )] + insert_value + update_value

        database_name = "constraints_database"
        schema_name = "dbo"
        table_name = "multiple_column_pk"
        column_name = ["first_name", "last_name", "replication_key_column"]
        insert_value = [("Brian", "Lampkin", 72)]
        update_value = [("Sergey", "Brin", 65)]
        delete_value = [("Larry", "Page")]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:2]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_multiple_column_pk"]["values"] = \
            [("Tim", "Berners-Lee", 64)] + insert_value + update_value

        # duplicative of other testing
        # table_name = "single_column_pk"
        # column_name = ["pk", "replication_key_column"]
        # insert_value = [(3, 49)]
        # update_value = [(1, 65)]
        # delete_value = [(0,)]
        # query_list = (insert(database_name, schema_name, table_name, insert_value))
        # query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        # query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        # mssql_cursor_context_manager(*query_list)
        # insert_value = [insert_value[0] + (None,)]
        # update_value = [update_value[0] + (None,)]
        # delete_value = [delete_value[0] + (None, datetime.utcnow())]
        # self.EXPECTED_METADATA["constraints_database_dbo_single_column_pk"]["values"] = \
        #     insert_value + delete_value + update_value

        table_name = "pk_with_fk"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(5, 2), (6, None)]
        delete_value = [(1,), (2,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_pk_with_fk"]["values"] = \
           [(0, 1), (3, 1)] + insert_value[:-1]

        table_name = "pk_with_unique_not_null"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49)]
        update_value = [(1, 65)]
        delete_value = [(0,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_pk_with_unique_not_null"]["values"] = \
            [(2, 5)] + insert_value + update_value

        # update expected data for the VIEW_WITH_JOIN view
        self.EXPECTED_METADATA["constraints_database_dbo_view_with_join"]["values"] = \
            [(None, None, 4), (2, 5, 5), (None, None, 6)]

        table_name = "default_column"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49), (4, None), (5, )]
        update_value = [(1, 65)]
        query_list = (insert(database_name, schema_name, table_name, insert_value[:2]))
        query_list.extend(insert(database_name, schema_name, table_name, insert_value[-1:], column_names=column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_default_column"]["values"] = [
                (0, -1)] + [(3, 49), (5, -1)] + update_value

        table_name = "check_constraint"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49)]
        update_value = [(1, 65)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_check_constraint"]["values"] = \
            [(0, 37)] + insert_value + update_value

        table_name = "even_identity"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3,)]
        update_value = [(2,)]
        delete_value = [(1,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_names=column_name[:1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [insert_value[0] + (6, )]
        update_value = [update_value[0] + (4, )]
        self.EXPECTED_METADATA["constraints_database_dbo_even_identity"]["values"] = \
            insert_value + update_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(records_by_stream[stream]['messages'][0]['action'], 'activate_version')
                self.assertEqual(records_by_stream[stream]['messages'][-1]['action'], 'activate_version')
                self.assertTrue(all(
                    [message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1]]
                ))
                self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]
                replication_column = column_names.index("replication_key_column")
                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, row_values))
                            if column not in non_selected_properties
                        }
                    } for row_values in sorted(stream_expected_data[self.VALUES],
                                               key=lambda row: (row[replication_column] is not None, row[replication_column]))
                ]

                # remove sequences from actual values for comparison
                [message.pop("sequence") for message
                 in records_by_stream[stream]['messages'][1:-1]]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            self.assertEqual(expected_value, actual_row["data"][column_name],
                                             msg="expected: {} != actual {}".format(
                                                 expected_row, actual_row))
                        print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(bookmark['replication_key_value'],
                                 max([row[replication_column] for row in stream_expected_data[self.VALUES]
                                      if row[replication_column] is not None]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_column')

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")
                self.assertEqual(bookmark['version'], new_table_version,
                                 msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        # select all catalogs
        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            for k in self.expected_pks()[c['stream_name']]:
                mdata = next(
                    (m for m in catalog_entry['metadata']
                     if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                    None)
                print("Validating inclusion on {}: {}".format(
                    c['stream_name'], mdata))
                self.assertTrue(
                    mdata and mdata['metadata']['inclusion'] == 'automatic')
            connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        synced_records = runner.get_records_from_target_output()
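        # Each replicated record should contain exactly the automatic fields for its stream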
        for stream_name, data in synced_records.items():
            record_messages = [
                set(row['data'].keys()) for row in data['messages']
            ]
            for record_keys in record_messages:
                # The symmetric difference should be empty
                self.assertEqual(
                    record_keys,
                    self.expected_automatic_fields().get(stream_name, set()))