Example #1
    def test_run(self):
        """
        Verify that a full sync captures all data and sends it in the correct format
        for integer and boolean (bit) data.
        Verify that the first sync sends an activate version message immediately.
        Verify that the table version is incremented.
        """

        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information from discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify that on the first sync you get an activate version message
                # before and after all of the full-table data
                # and before the log-based replication part
                if records_by_stream[stream]['messages'][-1].get("data"):
                    last_row_data = True
                else:
                    last_row_data = False

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([
                        m for m in records_by_stream[stream]['messages'][1:]
                        if m["action"] == "activate_version"
                    ]),
                    2,
                    msg=
                    "Expect 2 more activate version messages for end of full table and beginning of log based"
                )

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]
                            ['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # sql server dates are emitted as datetimes at midnight UTC
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))

                # Verify all data is correct for the log replication part if sent
                if records_by_stream[stream]['messages'][-1].get("data"):
                    # compare against the final (log-based) upsert message, not the
                    # actual_row left over from the full-table loop above
                    actual_row = records_by_stream[stream]['messages'][-1]
                    for column_name, expected_value in expected_messages[-1][
                            "data"].items():
                        if isinstance(expected_value, datetime):
                            # sql server only keeps milliseconds not microseconds
                            self.assertEqual(
                                expected_value.isoformat().replace(
                                    '000+00:00', 'Z').replace('+00:00', 'Z'),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, time):
                            # sql server time has second resolution only
                            self.assertEqual(
                                expected_value.replace(
                                    microsecond=0).isoformat().replace(
                                        '+00:00', ''),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, date):
                            # sql server dates are emitted as datetimes at midnight UTC
                            self.assertEqual(
                                expected_value.isoformat() + 'T00:00:00+00:00',
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name]))
                        else:
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value,
                                    actual_row["data"][column_name]))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                initial_log_version = bookmark['current_log_version']

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "dates_and_times"
        column_name = [
            "pk", "just_a_date", "date_and_time",
            "bigger_range_and_precision_datetime", "datetime_with_timezones",
            "datetime_no_seconds", "its_time"
        ]
        new_date_value = datetime(2019, 7, 22, 21, 11, 40, 573000)
        insert_value = [
            (6, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(hours=10,
                                                minutes=5))).isoformat(),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        update_value = [
            (2, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(hours=10,
                                                minutes=5))).isoformat(),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        delete_value = [(3, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [
            (6, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(
                          hours=10, minutes=5))).astimezone(timezone.utc),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        update_value = [
            (2, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(
                          hours=10, minutes=5))).astimezone(timezone.utc),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [(3, None, None, None, None, None, None,
                         datetime.utcnow())]
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"]["values"] =  \
            [self.expected_metadata()["data_types_database_dbo_dates_and_times"]["values"][-1]] + \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify that on a subsequent sync you get an activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if column_name != "_sdc_deleted_at":
                                if isinstance(expected_value, datetime):
                                    # sql server only keeps milliseconds not microseconds
                                    self.assertEqual(
                                        expected_value.isoformat().replace(
                                            '000+00:00', 'Z').replace(
                                                '+00:00',
                                                'Z').replace('000', 'Z'),
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.isoformat().replace(
                                                '000+00:00', 'Z').replace(
                                                    '+00:00',
                                                    'Z').replace('000', 'Z'),
                                            actual_row["data"][column_name]))
                                elif isinstance(expected_value, time):
                                    # sql server time has second resolution only
                                    self.assertEqual(
                                        expected_value.replace(
                                            microsecond=0).isoformat().replace(
                                                '+00:00', ''),
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.isoformat().replace(
                                                '+00:00', 'Z'),
                                            actual_row["data"][column_name]))
                                elif isinstance(expected_value, date):
                                    # sql server dates are emitted as datetimes at midnight UTC
                                    self.assertEqual(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.isoformat() +
                                            'T00:00:00+00:00',
                                            actual_row["data"][column_name]))
                                else:
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value,
                                            actual_row["data"][column_name]))

                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(
                                    actual_value,
                                    expected_value - timedelta(seconds=15))
                                self.assertLessEqual(
                                    actual_value,
                                    expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted, so the column is either omitted or None
                                self.assertIsNone(
                                    actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version,
                                   initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
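
The datetime, time, and date branches above inline the same value normalization three times. A minimal sketch of that normalization as a standalone helper, assuming the target output keeps millisecond-precision datetimes with a trailing 'Z', second-resolution times, and dates as midnight UTC; the name normalize_expected is illustrative and not part of the tap or the test harness:

from datetime import date, datetime, time

def normalize_expected(value):
    # Hypothetical helper mirroring the inline normalization in the assertions above.
    # Check datetime before date, since datetime is a subclass of date.
    if isinstance(value, datetime):
        # sql server keeps milliseconds, not microseconds; emit a trailing 'Z'
        return value.isoformat().replace('000+00:00', 'Z').replace('+00:00', 'Z')
    if isinstance(value, time):
        # time columns are compared at second resolution, with no UTC offset
        return value.replace(microsecond=0).isoformat().replace('+00:00', '')
    if isinstance(value, date):
        # date columns come back as midnight UTC timestamps
        return value.isoformat() + 'T00:00:00+00:00'
    return value
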
Example #2
    def test_run(self):
        """
        Verify that a full sync captures all data and sends it in the correct format
        for integer and boolean (bit) data.
        Verify that the first sync sends an activate version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information from discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'LOG_BASED'}}]

        # Don't select unsupported data types
        non_selected_properties = ["nvarchar_text", "varchar_text", "varbinary_data",
                                   "geospacial", "geospacial_map", "markup", "tree",
                                   "variant", "SpecialPurposeColumns", "started_at", "ended_at"]
        BaseTapTest.select_all_streams_and_fields(
            conn_id, found_catalogs, additional_md=additional_md,
            non_selected_properties=non_selected_properties)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream]['table_version']

                # verify that on the first sync you get an activate version message
                # before and after all of the full-table data
                # and before the log-based replication part
                if records_by_stream[stream]['messages'][-1].get("data"):
                    last_row_data = True
                else:
                    last_row_data = False

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([m for m in records_by_stream[stream]['messages'][1:] if m["action"] == "activate_version"]),
                    2,
                    msg="Expect 2 more activate version messages for end of full table and beginning of log based")

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, stream_expected_data[self.VALUES][row]))
                            if column not in non_selected_properties
                        }
                    } for row in range(len(stream_expected_data[self.VALUES]))
                ]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            self.assertEqual(expected_value, actual_row["data"][column_name],
                                             msg="expected: {} != actual {}".format(
                                                 expected_row, actual_row))

                # Verify all data is correct for the log replication part if sent
                if records_by_stream[stream]['messages'][-1].get("data"):
                    actual_data = records_by_stream[stream]['messages'][-1]["data"]
                    for column_name, expected_value in expected_messages[-1]["data"].items():
                        self.assertEqual(expected_value, actual_data[column_name],
                                         msg="expected: {} != actual {}".format(
                                             expected_value, actual_data[column_name]))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg="expected bookmark to have current_log_version because we are using log replication")
                self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete")
                initial_log_version = bookmark['current_log_version']

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "text_and_image_deprecated_soon"
        column_name = ["pk", "nvarchar_text", "varchar_text", "varbinary_data",
                       "rowversion_synonym_timestamp"]
        insert_value = [(2, "JKL", "MNO", "PQR".encode('utf-8'))]
        update_value = [(1, "JKL", "MNO", "PQR".encode('utf-8'))]
        delete_value = [(0, )]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        values = insert_value + update_value
        rows = mssql_cursor_context_manager(*[
            "select rowversion_synonym_timestamp from data_types_database.dbo.text_and_image_deprecated_soon "
            "where pk in (0,1,2) order by pk desc"])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] + (None, ) for x in zip(values, rows)]
        row_with_version.append((0, None, None, None, None, datetime.utcnow()))
        row_with_version[1], row_with_version[2] = row_with_version[2], row_with_version[1]
        self.EXPECTED_METADATA['data_types_database_dbo_text_and_image_deprecated_soon']['values'] = row_with_version
        self.EXPECTED_METADATA["data_types_database_dbo_text_and_image_deprecated_soon"]["fields"].append(
            {"_sdc_deleted_at": {
                'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}}
        )

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "weirdos"
        column_name = [
            "pk", "geospacial", "geospacial_map", "markup", "guid", "tree",
            "variant", "SpecialPurposeColumns", "version"]
        insert_value = [(3, None, None, None, str(uuid.uuid1()).upper(), None, None, None)]
        update_value = [(1, None, None, None, str(uuid.uuid1()).upper(), None, None, None)]
        delete_value = [(0,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        values = insert_value + update_value
        rows = mssql_cursor_context_manager(*[
            "select version from data_types_database.dbo.weirdos "
            "where pk in (0,1,3) order by pk desc"])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] + (None, ) for x in zip(values, rows)]
        row_with_version.append((0, None, None, None, None, None, None, None, None, datetime.utcnow()))
        row_with_version[1], row_with_version[2] = row_with_version[2], row_with_version[1]
        self.EXPECTED_METADATA['data_types_database_dbo_weirdos']['values'] = row_with_version
        self.EXPECTED_METADATA["data_types_database_dbo_weirdos"]["fields"].append(
            {"_sdc_deleted_at": {
                'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}}
        )

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "computed_columns"
        column_name = ["pk", "started_at", "ended_at", "durations_days"]
        insert_value = [(2, datetime(1980, 5, 30, 16), datetime.now())]
        update_value = [(1, datetime(1942, 11, 30), datetime(2017, 2, 12))]
        delete_value = [(0,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        values = insert_value + update_value  # + [delete_value[0] + (None, None)]
        rows = mssql_cursor_context_manager(
            *["select durations_days from data_types_database.dbo.computed_columns "
              "where pk in (0,1,2) order by pk desc"])
        rows = [tuple(row) for row in rows]
        row_with_duration = [x[0] + x[1] + (None, ) for x in zip(values, rows)]
        row_with_duration.append((0, None, None, None, datetime.utcnow()))
        row_with_duration[1], row_with_duration[2] = row_with_duration[2], row_with_duration[1]
        self.EXPECTED_METADATA['data_types_database_dbo_computed_columns']['values'] = row_with_duration
        self.EXPECTED_METADATA["data_types_database_dbo_computed_columns"]["fields"].append(
            {"_sdc_deleted_at": {
                'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}}
        )

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify that on a subsequent sync you get an activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(all(
                    [message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:]]
                ))

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, stream_expected_data[self.VALUES][row]))
                            if column not in non_selected_properties
                        }
                    } for row in range(len(stream_expected_data[self.VALUES]))
                ]

                # remove sequences from actual values for comparison
                [message.pop("sequence") for message
                 in records_by_stream[stream]['messages'][1:]]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row["data"].items():
                            if column_name != "_sdc_deleted_at":
                                self.assertEqual(expected_value, actual_row["data"][column_name],
                                                 msg="expected: {} != actual {}".format(
                                                     expected_row, actual_row))
                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(actual_row["data"][column_name],
                                                                     "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(actual_row["data"][column_name],
                                                                     "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(actual_value, expected_value - timedelta(seconds=15))
                                self.assertLessEqual(actual_value, expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted, so the column is either omitted or None
                                self.assertIsNone(actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg="expected bookmark to have current_log_version because we are using log replication")
                self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version, initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")
                self.assertEqual(bookmark['version'], new_table_version,
                                 msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))
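
Example #2 builds its expected rowversion/timestamp values by reading the raw bytes back from SQL Server and re-encoding them as uppercase "0x..." hex literals. A minimal sketch of that step, assuming the driver returns the column as bytes; format_rowversion is an illustrative name, not part of the test harness:

def format_rowversion(raw_bytes):
    # Render an 8-byte rowversion/timestamp value as the uppercase hex literal
    # the tap emits, e.g. b'\x00\x00\x00\x00\x00\x00\x07\xd1' -> '0x00000000000007D1'
    return "0x{}".format(raw_bytes.hex().upper())
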
Example #3
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information from discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'INCREMENTAL',
                                                         'replication-key': 'replication_key_column'}}]

        non_selected_properties = []

        BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream]['table_version']

                # verify that on the first sync you get an activate version message
                # before and after all of the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1]]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]
                replication_column = column_names.index("replication_key_column")
                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, row_values))
                            if column not in non_selected_properties
                        }
                    } for row_values in sorted(stream_expected_data[self.VALUES],
                                               key=lambda row: (row[replication_column] is not None, row[replication_column]))
                ]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            self.assertEqual(expected_value, actual_row["data"][column_name],
                                             msg="expected: {} != actual {}".format(
                                                 expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(bookmark['replication_key_value'],
                                 max([row[replication_column] for row in stream_expected_data[self.VALUES]
                                      if row[replication_column] is not None]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_column')

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "constraints_database"
        schema_name = "dbo"
        table_name = "no_constraints"
        column_name = ["replication_key_column"]
        insert_value = [(49, )]
        update_value = [(3, )]
        delete_value = [(0, )]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name))
        query_list.extend([
            "UPDATE constraints_database.dbo.no_constraints "
            "SET replication_key_column = 3 "
            "WHERE replication_key_column = 1"])
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_no_constraints"]["values"] = \
            [(2, )] + insert_value + update_value

        database_name = "constraints_database"
        schema_name = "dbo"
        table_name = "multiple_column_pk"
        column_name = ["first_name", "last_name", "replication_key_column"]
        insert_value = [("Brian", "Lampkin", 72)]
        update_value = [("Sergey", "Brin", 65)]
        delete_value = [("Larry", "Page")]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:2]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_multiple_column_pk"]["values"] = \
            [("Tim", "Berners-Lee", 64)] + insert_value + update_value

        # duplicative of other testing
        # table_name = "single_column_pk"
        # column_name = ["pk", "replication_key_column"]
        # insert_value = [(3, 49)]
        # update_value = [(1, 65)]
        # delete_value = [(0,)]
        # query_list = (insert(database_name, schema_name, table_name, insert_value))
        # query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        # query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        # mssql_cursor_context_manager(*query_list)
        # insert_value = [insert_value[0] + (None,)]
        # update_value = [update_value[0] + (None,)]
        # delete_value = [delete_value[0] + (None, datetime.utcnow())]
        # self.EXPECTED_METADATA["constraints_database_dbo_single_column_pk"]["values"] = \
        #     insert_value + delete_value + update_value

        table_name = "pk_with_fk"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(5, 2), (6, None)]
        delete_value = [(1,), (2,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_pk_with_fk"]["values"] = \
            [(0, 1), (3, 1)] + insert_value[:-1]

        table_name = "pk_with_unique_not_null"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49)]
        update_value = [(1, 65)]
        delete_value = [(0,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_pk_with_unique_not_null"]["values"] = \
            [(2, 5)] + insert_value + update_value

        # update expected data for the VIEW_WITH_JOIN view
        self.EXPECTED_METADATA["constraints_database_dbo_view_with_join"]["values"] = \
            [(None, None, 4), (2, 5, 5), (None, None, 6)]

        table_name = "default_column"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49), (4, None), (5, )]
        update_value = [(1, 65)]
        query_list = (insert(database_name, schema_name, table_name, insert_value[:2]))
        query_list.extend(insert(database_name, schema_name, table_name, insert_value[-1:], column_names=column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_default_column"]["values"] = [
                (0, -1)] + [(3, 49), (5, -1)] + update_value

        table_name = "check_constraint"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49)]
        update_value = [(1, 65)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_check_constraint"]["values"] = \
            [(0, 37)] + insert_value + update_value

        table_name = "even_identity"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3,)]
        update_value = [(2,)]
        delete_value = [(1,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_names=column_name[:1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [insert_value[0] + (6, )]
        update_value = [update_value[0] + (4, )]
        self.EXPECTED_METADATA["constraints_database_dbo_even_identity"]["values"] = \
            insert_value + update_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get an activate version message only after all data
                self.assertEqual(records_by_stream[stream]['messages'][0]['action'], 'activate_version')
                self.assertEqual(records_by_stream[stream]['messages'][-1]['action'], 'activate_version')
                self.assertTrue(all(
                    [message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1]]
                ))
                self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]
                replication_column = column_names.index("replication_key_column")
                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, row_values))
                            if column not in non_selected_properties
                        }
                    } for row_values in sorted(stream_expected_data[self.VALUES],
                                               key=lambda row: (row[replication_column] is not None, row[replication_column]))
                ]
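                # (expected rows are ordered the way the tap emits them for an incremental
                #  sync: NULL replication keys first, then ascending key values)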

                # remove sequences from actual values for comparison
                [message.pop("sequence") for message
                 in records_by_stream[stream]['messages'][1:-1]]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            self.assertEqual(expected_value, actual_row["data"][column_name],
                                             msg="expected: {} != actual {}".format(
                                                 expected_row, actual_row))
                        print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(bookmark['replication_key_value'],
                                 max([row[replication_column] for row in stream_expected_data[self.VALUES]
                                      if row[replication_column] is not None]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_column')

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")
                self.assertEqual(bookmark['version'], new_table_version,
                                 msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))
Example #4
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        # TODO - change the replication key back to replication_key_column when rowversion is supported
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'temp_replication_key_column'
            }
        }]

        non_selected_properties = [
            "nvarchar_text", "varchar_text", "varbinary_data", "geospacial",
            "geospacial_map", "markup", "tree", "variant",
            "SpecialPurposeColumns", "started_at", "ended_at"
        ]
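        # these columns are deselected in the catalog below and are therefore filtered out
        # of the expected upsert messages later in the test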
        BaseTapTest.select_all_streams_and_fields(
            conn_id,
            found_catalogs,
            non_selected_properties=non_selected_properties,
            additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get an activate version
                # message before and after all the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [
                    {
                        "action": "upsert",
                        "data": {
                            column: value
                            for column, value in list(
                                zip(column_names, row_values))
                            if column not in non_selected_properties
                        }  # TODO - change to -1 for using rowversion for replication key
                    } for row_values in sorted(
                        stream_expected_data[self.VALUES],
                        key=lambda row: (row[1] is not None, row[1]))
                ]

                # Verify all data is correct for incremental
                for expected_row, actual_row in zip(
                        expected_messages,
                        records_by_stream[stream]['messages'][1:-1]):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key TODO - change to -1 for using rowversion for replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    re.sub(
                        r'\d{3}Z', "Z",
                        max([
                            row[1] for row in stream_expected_data[self.VALUES]
                        ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ")))
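                # (strftime produces six microsecond digits; the re.sub drops the trailing
                #  three so the expected bookmark matches the tap's millisecond precision)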
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "text_and_image_deprecated_soon"
        column_name = [
            "pk", "temp_replication_key_column", "nvarchar_text",
            "varchar_text", "varbinary_data", "replication_key_column"
        ]
        insert_value = [(3,
                         datetime(2018,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  993000,
                                  tzinfo=timezone.utc), "JKL", "MNO",
                         "PQR".encode('utf-8'))]
        update_value = [(0,
                         datetime(2018,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), "JKL", "MNO",
                         "PQR".encode('utf-8'))]
        query_list = (insert(database_name,
                             schema_name,
                             table_name,
                             insert_value,
                             column_names=column_name[:-1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)

        values = insert_value + [(
            1, datetime(2018, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc),
            "abc", "def", "ghi".encode('utf-8'))] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.text_and_image_deprecated_soon "
            "where pk in (0, 1,3) order by pk desc"
        ])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA[
            'data_types_database_dbo_text_and_image_deprecated_soon'][
                'values'] = row_with_version
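        # (replication_key_column is assumed to be a rowversion column: the driver returns it
        #  as bytes, so it is re-read from the database and rendered as an uppercase
        #  0x-prefixed hex string to match the tap output; the "order by pk desc" lines the
        #  fetched rows up with the order of `values`)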

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "weirdos"
        column_name = [
            "pk", "temp_replication_key_column", "geospacial",
            "geospacial_map", "markup", "guid", "tree", "variant",
            "SpecialPurposeColumns", "replication_key_column"
        ]
        insert_value = [(3,
                         datetime(9999,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  993000,
                                  tzinfo=timezone.utc), None, None, None,
                         str(uuid.uuid1()).upper(), None, None, None)]
        update_value = [(1,
                         datetime(9999,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), None, None, None,
                         str(uuid.uuid1()).upper(), None, None, None)]
        delete_value = [(0, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value, column_name[:-1]))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)

        values = insert_value + [
            (2, datetime(9999, 12, 31, 23, 59, 59, 990000,
                         tzinfo=timezone.utc), None, None, None,
             "B792681C-AEF4-11E9-8002-0800276BC1DF", None, None, None)
        ] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.weirdos "
            "where pk in (1, 2, 3) order by pk desc"
        ])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA['data_types_database_dbo_weirdos'][
            'values'] = row_with_version

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "computed_columns"
        column_name = [
            "pk", "temp_replication_key_column", "started_at", "ended_at",
            "replication_key_column"
        ]
        insert_value = [(2,
                         datetime(9998,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  990000,
                                  tzinfo=timezone.utc),
                         datetime(1980, 5, 30, 16), datetime.now())]
        update_value = [(0,
                         datetime(9998,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), datetime(1942, 11, 30),
                         datetime(2017, 2, 12))]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value, column_name[:-1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        values = insert_value + [(
            1, datetime(9998, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc),
            datetime(1970, 1, 1, 0), datetime.now())] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.computed_columns "
            "where pk in (0, 1, 2) order by pk desc"
        ])
        rows = [tuple(row) for row in rows]
        row_with_duration = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA['data_types_database_dbo_computed_columns'][
            'values'] = row_with_duration

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get an activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:-1]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                        print(
                            "records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    re.sub(
                        r'\d{3}Z', "Z",
                        max([
                            row[1] for row in stream_expected_data[self.VALUES]
                        ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ")))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
Example #5
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'replication_key_column'
            }
        }]

        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        non_selected_properties = []

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get an activate version
                # message before and after all the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[1] for row in stream_expected_data[self.VALUES]
                    if row[1] is not None
                ])
                self.assertEqual(bookmark['replication_key_value'],
                                 expected_bookmark.isoformat())
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "dates_and_times"
        column_name = [
            "pk", "replication_key_column", "date_and_time",
            "bigger_range_and_precision_datetime", "datetime_with_timezones",
            "datetime_no_seconds", "its_time"
        ]
        insert_value = [
            (5, date(9999, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (6, date(2018, 12, 29),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        update_value = [
            (3, date(9999, 12, 31),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=10))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (4, date(2018, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=6))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        delete_value = [(2, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)

        insert_value = [
            (5, date(9999, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (6, date(2018, 12, 29),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        update_value = [
            (3, date(9999, 12, 31),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=10))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (4, date(2018, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=6))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]

        # only rows whose replication key is >= the previous bookmark are picked up
        insert_value = insert_value[:-1]
        update_value = update_value[:-1]
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][
            "values"] = [(
                1, date(9999, 12, 29),
                datetime(9999, 12, 31, 23, 59, 59, 997000,
                         tzinfo=timezone.utc),
                datetime(9999, 12, 31, 23, 59, 59, 999000,
                         tzinfo=timezone.utc),
                datetime(
                    9999, 12, 31, 10, 14, tzinfo=timezone(
                        timedelta(hours=14))).astimezone(timezone.utc),
                datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
                time(23, 59, 59, tzinfo=timezone.utc))
                         ] + update_value + insert_value
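        # (timezone-aware values are converted to UTC above because that is how the tap is
        #  expected to emit them; the deleted row and the rows whose replication key falls
        #  below the previous bookmark are left out of the expected values)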

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get an activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:-1]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[1] for row in stream_expected_data[self.VALUES]
                    if row[1] is not None
                ])
                self.assertEqual(bookmark['replication_key_value'],
                                 expected_bookmark.isoformat())
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
Example #6
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'replication_key_column'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get an activate version
                # message before and after all the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(
                    len(stream_expected_data[self.VALUES]),
                    len(records_by_stream[stream]['messages'][1:-1]),
                    msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[-1] is not None, row[-1]))]
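                # (for these numeric streams the replication key is the last column, hence
                #  the row[-1] sort key here and in the bookmark check below)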

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, Decimal):
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "decimal value is not represented as a number"
                                )
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[-1] for row in stream_expected_data[self.VALUES]
                    if row[-1] is not None
                ])
                # currently decimal replication keys aren't supported in the front end.  If they are at a later point
                # this should be a decimal comparison. https://stitchdata.atlassian.net/browse/SRCE-1331
                self.assertEqual(bookmark['replication_key_value'],
                                 float(expected_bookmark))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "numeric_precisions"
        precision_scale = NUMERIC_PRECISION_SCALE
        column_type = [
            "numeric({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type[:-1]
        ] + ["replication_key_column"]
        insert_value = [
            (8, Decimal(100), Decimal(100), Decimal(100), Decimal(100)),
            (7, Decimal('99999.9995'), Decimal('9999999.999999999999'),
             Decimal('999999.9999999999999999999999'),
             Decimal('99999999999999999999999999999999999.995'))
        ]
        update_value = [
            (5, Decimal(100), Decimal(100), Decimal(100), Decimal(100)),
            (6, Decimal('99999.9999'), Decimal('9999999.999999999999'),
             Decimal('999999.9999999999999999999999'),
             Decimal('99999999999999999999999999999999999.999'))
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = insert_value[-1:]  # only repl_key >= gets included
        update_value = update_value[-1:]
        self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"][
            "values"] = [
                (3, Decimal('99999.9993'), Decimal('9999999.999999999999'),
                 Decimal('999999.9999999999999999999999'),
                 Decimal('99999999999999999999999999999999999.993')),
            ] + update_value + insert_value

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "decimal_precisions"
        precision_scale = DECIMAL_PRECISION_SCALE
        column_type = [
            "decimal({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type[:-1]
        ] + ["replication_key_column"]
        insert_value = [
            (8, Decimal(100), Decimal(100), Decimal(100), Decimal(100)),
            (7, Decimal('99999.9995'), Decimal('9999999999999.999999'),
             Decimal('9999999999999999999999.999999'),
             Decimal('9999999999999999999999999.9999999999995'))
        ]
        update_value = [
            (5, Decimal(100), Decimal(100), Decimal(100), Decimal(100)),
            (6, Decimal('99999.9999'), Decimal('9999999999999.999999'),
             Decimal('9999999999999999999999.999999'),
             Decimal('9999999999999999999999999.9999999999999'))
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = insert_value[-1:]  # only repl_key >= gets included
        update_value = update_value[-1:]
        self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"][
            "values"] = [
                (3, Decimal('99999.9993'), Decimal('9999999999999.999999'),
                 Decimal('9999999999999999999999.999999'),
                 Decimal('9999999999999999999999999.9999999999993'))
            ] + update_value + insert_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[-1] is not None, row[-1]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, Decimal):
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "decimal value is not represented as a number"
                                )
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[-1] for row in stream_expected_data[self.VALUES]
                    if row[-1] is not None
                ])
                # currently decimal replication keys aren't supported in the front end.  If they are at a later point
                # this should be a decimal comparison. https://stitchdata.atlassian.net/browse/SRCE-1331
                self.assertEqual(bookmark['replication_key_value'],
                                 float(expected_bookmark))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))
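
The expected upserts in the example above are ordered with the sort key (row[-1] is not None, row[-1]), which puts rows whose replication key is NULL first and never compares None against a real value. A minimal, standalone sketch of that ordering; the rows and values below are made up for illustration:

    from decimal import Decimal

    # Hypothetical rows of (pk, replication_key_column); the key may be None.
    rows = [
        (3, Decimal('99999.9993')),
        (8, None),
        (6, Decimal('100')),
    ]

    # Tuples compare element-wise: False (None key) sorts before True (non-None key),
    # and only the non-None keys are then compared against each other.
    ordered = sorted(rows, key=lambda row: (row[-1] is not None, row[-1]))
    assert [pk for pk, _ in ordered] == [8, 6, 3]
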
Example #7
    def test_run(self):
        """
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                if records_by_stream[stream]['messages'][-1].get("data"):
                    last_row_data = True
                else:
                    last_row_data = False

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([
                        m for m in records_by_stream[stream]['messages'][1:]
                        if m["action"] == "activate_version"
                    ]),
                    2,
                    msg=
                    "Expect 2 more activate version messages for end of full table and beginning of log based"
                )

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]
                            ['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, Decimal):
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "decimal value is not represented as a number"
                                )
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))

                # Verify all data is correct for the log replication part if sent
                if records_by_stream[stream]['messages'][-1].get("data"):
                    for column_name, expected_value in expected_messages[-1][
                            "data"].items():
                        self.assertEqual(
                            expected_value,
                            records_by_stream[stream]['messages'][-1]["data"]
                            [column_name],
                            msg="expected: {} != actual {}".format(
                                expected_messages[-1],
                                records_by_stream[stream]['messages'][-1]))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                initial_log_version = bookmark['current_log_version']

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "decimal_precisions"
        precision_scale = DECIMAL_PRECISION_SCALE
        column_type = [
            "decimal({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type
        ]
        insert_value = [
            (7, Decimal('-92473.8401'), Decimal('-4182159664734.645653'),
             Decimal('6101329656084900380190.268036'),
             Decimal('4778017533841887320066645.9761464001349')),
        ]
        update_value = [
            (3, Decimal('-92473.8401'), Decimal('-4182159664734.645653'),
             Decimal('6101329656084900380190.268036'),
             Decimal('4778017533841887320066645.9761464001349')),
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [
            delete_value[0] + (None, None, None, None, datetime.utcnow())
        ]
        self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"]["values"] = \
            [self.expected_metadata()["data_types_database_dbo_decimal_precisions"]["values"][-1]] + \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "numeric_precisions"
        precision_scale = NUMERIC_PRECISION_SCALE
        column_type = [
            "numeric({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type
        ]
        insert_value = [
            (7, Decimal('96701.9382'), Decimal('-4371716.186100650268'),
             Decimal('-367352.306093776232045517794'),
             Decimal('-81147872128956247517327931319278572.985')),
        ]
        update_value = [
            (3, Decimal('96701.9382'), Decimal('-4371716.186100650268'),
             Decimal('-367352.306093776232045517794'),
             Decimal('-81147872128956247517327931319278572.985')),
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [
            delete_value[0] + (None, None, None, None, datetime.utcnow())
        ]
        self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"]["values"] = \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if column_name != "_sdc_deleted_at":
                                if isinstance(expected_value, Decimal):
                                    self.assertEqual(
                                        type(actual_row["data"][column_name]),
                                        Decimal,
                                        msg=
                                        "decimal value is not represented as a number"
                                    )
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_row, actual_row))
                                else:
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_row, actual_row))
                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(
                                    actual_value,
                                    expected_value - timedelta(seconds=15))
                                self.assertLessEqual(
                                    actual_value,
                                    expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted, so the column may be absent or None
                                self.assertIsNone(
                                    actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version,
                                   initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))
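
Example #7 above accepts two renderings of the _sdc_deleted_at timestamp (with and without fractional seconds) and then allows roughly a 15-second window around the expected deletion time, since the row is deleted only moments before the assertion runs. A small sketch of that parse-with-fallback pattern; the timestamps below are invented for illustration:

    from datetime import datetime, timedelta

    def parse_sdc_deleted_at(value):
        # Targets may serialize the timestamp with or without fractional seconds.
        try:
            return datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            return datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")

    expected = datetime(2020, 1, 1, 12, 0, 0)  # hypothetical time the DELETE was issued
    actual = parse_sdc_deleted_at("2020-01-01T12:00:07.123456Z")

    # Accept any value within 15 seconds of the expected deletion time.
    assert expected - timedelta(seconds=15) <= actual <= expected + timedelta(seconds=15)
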
Example #8
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'replication_key_column'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get an activate version
                # message before all of the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')

                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(
                    len(stream_expected_data[self.VALUES]),
                    len(records_by_stream[stream]['messages'][1:-1]),
                    msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            column_index = [
                                list(key.keys())[0] for key in
                                self.expected_metadata()[stream][self.FIELDS]
                            ].index(column_name)
                            if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \
                                    in ("real", "float") \
                                    and actual_row["data"][column_name] is not None:
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "float value is not represented as a number"
                                )
                                self.assertEqual(
                                    float(str(float32(expected_value))),
                                    float(
                                        str(
                                            float32(actual_row["data"]
                                                    [column_name]))),
                                    msg=
                                    "single value of {} doesn't match actual {}"
                                    .format(
                                        float(str(float32(expected_value))),
                                        float(
                                            str(
                                                float32(actual_row["data"]
                                                        [column_name])))))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    max([
                        row[1] for row in stream_expected_data[self.VALUES]
                        if row[1] is not None
                    ]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "float_precisions"
        column_name = [
            "pk", "replication_key_column", "float_53", "real_24_bits"
        ]
        insert_value = [(15, 100, 100, 100),
                        (14, 3.4028235e+38, 1.7976931348623157e+308,
                         3.4028235e+38)]
        update_value = [(4, 101, 101, 101),
                        (6, 3.4028233e+38, 1.7976931348623157e+308,
                         3.4028235e+38)]
        delete_value = [(5, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = insert_value[-1:]  # only repl_key >= gets included
        update_value = update_value[-1:]
        self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"]["values"] = \
            [(1, 3.4028230e+38, 1.7976931348623157e+308, 3.4028235e+38)] + update_value + insert_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            column_index = [
                                list(key.keys())[0] for key in
                                self.expected_metadata()[stream][self.FIELDS]
                            ].index(column_name)
                            if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \
                                    in ("real", "float") \
                                    and actual_row["data"][column_name] is not None:
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "float value is not represented as a number"
                                )
                                self.assertEqual(
                                    float(str(float32(expected_value))),
                                    float(
                                        str(
                                            float32(actual_row["data"]
                                                    [column_name]))),
                                    msg=
                                    "single value of {} doesn't match actual {}"
                                    .format(
                                        float(str(float32(expected_value))),
                                        float(
                                            str(
                                                float32(actual_row["data"]
                                                        [column_name])))))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    max([
                        row[1] for row in stream_expected_data[self.VALUES]
                        if row[1] is not None
                    ]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
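
The real/float branch in Example #8 compares single-precision columns by round-tripping both sides through float32 (presumably numpy.float32, given how the tests use it) so that the expected and actual values lose precision the same way before being compared. A minimal sketch of that normalization, assuming numpy is available and using an illustrative value:

    from numpy import float32

    def normalize_real(value):
        # Round-trip through float32 so both sides carry identical single precision.
        return float(str(float32(value)))

    expected = 3.4028235e+38      # illustrative value near the float32 maximum
    actual = "3.4028235e+38"      # the same value as it might appear in the target output

    assert normalize_real(expected) == normalize_real(actual)
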