Example #1
    def fetch_server_id(self):
        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            cur.execute("SELECT @@server_id")
            server_id = cur.fetchone()[0]

            return server_id
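
For context: every example below relies on `db_utils.get_db_connection(properties, credentials)` from the tap-tester helpers and treats the returned object as a pymysql-style connection (cursors usable as context managers, multi-statement SQL strings). A minimal sketch of such a helper, assuming pymysql and illustrative property/credential keys that mirror the TAP_MYSQL_* environment variables, might look like this:

import pymysql
from pymysql.constants import CLIENT

def get_db_connection(properties, credentials):
    # Hypothetical sketch -- not the actual tap-tester implementation.
    # `properties` is assumed to carry host/port/database and
    # `credentials` to carry user/password.
    return pymysql.connect(
        host=properties['host'],
        port=int(properties['port']),
        user=credentials['user'],
        password=credentials['password'],
        database=properties.get('database'),
        autocommit=True,
        # several examples run multi-statement SQL in a single execute() call
        client_flag=CLIENT.MULTI_STATEMENTS,
    )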
Example #2
    def initialize_db(self, engine):
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

        with connection.cursor() as cur:
            create_databases_sql = """
                DROP DATABASE IF EXISTS {};
                CREATE DATABASE {};
            """.format(self.database_name(), self.database_name())

            cur.execute(create_databases_sql)

            cur.execute(
                """
            SELECT EXISTS (
            SELECT 1
            FROM  information_schema.tables
            WHERE  table_schema = %s
            AND  table_name =   %s);""",
                [self.database_name(), self.table_name()])

            existing_table = cur.fetchone()[0]

            if existing_table:
                cur.execute("DROP TABLE {}.{}".format(self.database_name(),
                                                      self.table_name()))

            create_table_sql = """
CREATE TABLE {}.{} (
            id                     BIGINT PRIMARY KEY,
            our_json               JSON
)
ENGINE = {}
""".format(self.database_name(), self.table_name(), engine)

            cur.execute(create_table_sql)

            # Ensure expected engine in use
            cur.execute(
                """
            SELECT TABLE_NAME, ENGINE
            FROM  information_schema.tables
            where  table_schema =   %s;""", [self.database_name()])
            engine_in_use = cur.fetchone()[1]
            self.assertEqual(
                engine,
                engine_in_use.upper(),
                msg="Unexpected engine in use: {}".format(engine_in_use))

            for record in [rec_1]:
                self.insert_record(cur, record)

        print("\n\nMySQL DB Instantiated." + \
              "\nNAME: {}\nENGINE: {}".format(self.database_name(), engine_in_use) + \
              "\nTABLE: {}\nEVENTS: 1 record inserted\n\n".format(self.table_name()))
Example #3
    def setUp(self):
        missing_envs = [
            x for x in [
                os.getenv('TAP_MYSQL_HOST'),
                os.getenv('TAP_MYSQL_USER'),
                os.getenv('TAP_MYSQL_PASSWORD'),
                os.getenv('TAP_MYSQL_PORT')
            ] if x is None
        ]
        if len(missing_envs) != 0:
            raise Exception(
                "set TAP_MYSQL_HOST, TAP_MYSQL_USER, TAP_MYSQL_PASSWORD, TAP_MYSQL_PORT"
            )

        print("setting up mysql databases and tables")
        props = self.get_properties()
        props.pop('database')  # Don't connect to specific database for setup
        connection = db_utils.get_db_connection(props, self.get_credentials())

        with connection.cursor() as cur:
            create_databases_sql = """
                DROP DATABASE IF EXISTS {};
                CREATE DATABASE {};
            """.format(self.database_name(), self.database_name())

            cur.execute(create_databases_sql)

            create_table_sql = """
            CREATE TABLE {}.full_table (
                    a_pk      INTEGER AUTO_INCREMENT PRIMARY KEY,
                    a_varchar VARCHAR(10));
            """.format(self.database_name())

            cur.execute(create_table_sql)

            create_composite_key_table_sql = """
                CREATE TABLE {}.full_table_composite_key (
                    a_pk      INTEGER AUTO_INCREMENT,
                    a_varchar VARCHAR(10),
                    PRIMARY KEY (a_pk, a_varchar)
                );
            """.format(self.database_name())

            cur.execute(create_composite_key_table_sql)

            for table_name in self.table_names():
                for record in self.dummy_data():
                    self.insert_record(cur, record, table_name)
Example #4
    def setUp(self):
        self.maxDiff = None
        missing_envs = [
            x for x in [
                os.getenv('TAP_MYSQL_HOST'),
                os.getenv('TAP_MYSQL_USER'),
                os.getenv('TAP_MYSQL_PASSWORD'),
                os.getenv('TAP_MYSQL_PORT')
            ] if x is None
        ]
        if len(missing_envs) != 0:
            raise Exception(
                "set TAP_MYSQL_HOST, TAP_MYSQL_USER, TAP_MYSQL_PASSWORD, TAP_MYSQL_PORT"
            )

        print("setting up mysql databases and tables")
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

        with connection.cursor() as cursor:
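            # Helper lambdas: `flatten` turns a list of row tuples into one flat
            # parameter list; `var_string_for_table` builds a "%s, %s, ..."
            # placeholder string with one %s per column of that table's dummy_data.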
            flatten = lambda l: [item for sublist in l for item in sublist]
            var_string_for_table = lambda t: ', '.join(['%s'] * len(dummy_data[
                t][0]))
            create_databases_sql = '''
                DROP DATABASE IF EXISTS tap_tester_mysql_0;
                CREATE DATABASE tap_tester_mysql_0;
                DROP DATABASE IF EXISTS tap_tester_mysql_1;
                CREATE DATABASE tap_tester_mysql_1;
            '''
            cursor.execute(create_databases_sql)

            var_string = var_string_for_table('simple_example')
            simple_example_table_sql = '''
                CREATE TABLE tap_tester_mysql_0.simple_example (
                    c_pk INTEGER PRIMARY KEY,
                    c_varchar VARCHAR(255),
                    c_dt DATETIME);
                INSERT INTO tap_tester_mysql_0.simple_example VALUES (%s), (%s), (%s), (%s), (%s);
            ''' % (var_string, var_string, var_string, var_string, var_string)

            cursor.execute(simple_example_table_sql,
                           flatten(dummy_data['simple_example']))
Example #5
    def binlog_json_test(self):
        print("RUNNING {}\n\n".format(self.name()))

        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        expected_check_streams = {self.tap_stream_id()}
        expected_sync_streams = {self.table_name()}
        expected_pks = {self.table_name(): {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual(self.table_name(), test_catalog['stream_name'])

        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog,
            menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {self.table_name(): 1})
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions,
                         ['activate_version', 'upsert', 'activate_version'])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab version, log_file and log_pos from state to check later
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        self.assertEqual([expected_rec_1], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {})

        # insert a new huge row
        data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)],
                    literal=True)
        rec = {'id': 2, 'our_json': json.dumps(data)}

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            self.insert_record(cur, rec)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased or the log_file
        # has rotated and the numeric suffix has increased
        if expected_log_file == bookmark['log_file']:
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                 expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]

            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        expected_rec_2 = copy.deepcopy(rec)

        # check for expected records
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions, ['upsert'])

        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]
        del upsert_records[0]['_sdc_deleted_at']

        expected_json = json.loads(expected_rec_2.get('our_json', {}))
        actual_json = json.loads(upsert_records[0].get('our_json', {}))

        self.assertTrue(len(actual_json.keys()) > 0)
        self.assertEqual(expected_json, actual_json)
Example #6
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        for c in found_catalogs:
            catalog_props_to_check = ['stream_name', 'tap_stream_id']
            stream = c['stream_name']

            for prop in catalog_props_to_check:
                self.assertEqual(
                    c[prop],
                    expected_catalogs[stream][prop],
                    msg=
                    "unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`"
                    .format(prop, stream, expected_catalogs[stream][prop],
                            c[prop]))

        print("discovered streams are correct")

        print('checking discovered metadata for tap_tester_mysql_0-incremental')
        incremental_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental'
        ][0]
        md = menagerie.get_annotated_schema(
            conn_id, incremental_catalog['stream_id'])['metadata']

        incremental_stream_metadata = {
            'database-name': 'tap_tester_mysql_0',
            'row-count': 3,
            'is-view': False,
            'selected-by-default': False,
            'table-key-properties': ['c_pk']
        }

        self.assertEqual(
            sorted(md, key=lambda x: x['breadcrumb']),
            [{
                'breadcrumb': [],
                'metadata': incremental_stream_metadata
            }, {
                'breadcrumb': ['properties', 'c_dt'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'datetime'
                }
            }, {
                'breadcrumb': ['properties', 'c_pk'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'int(11)'
                }
            }, {
                'breadcrumb': ['properties', 'c_varchar'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'varchar(255)'
                }
            }, {
                'breadcrumb': ['properties', 'c_varchar_to_deselect'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'varchar(255)'
                }
            }])

        print('checking discovered metadata for tap_tester_mysql_1-view')
        view_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_1-view'
        ][0]
        view_catalog_key_properties_md = [{
            'breadcrumb': [],
            'metadata': {
                'view-key-properties': ['c_pk']
            }
        }]

        connections.set_non_discoverable_metadata(
            conn_id, view_catalog,
            menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']),
            view_catalog_key_properties_md)
        md = menagerie.get_annotated_schema(
            conn_id, view_catalog['stream_id'])['metadata']

        view_stream_metadata = {
            'database-name': 'tap_tester_mysql_1',
            'is-view': True,
            'selected-by-default': False,
            'view-key-properties': ['c_pk']
        }

        self.assertEqual(sorted(md, key=lambda x: x['breadcrumb']),
                         [{
                             'breadcrumb': [],
                             'metadata': view_stream_metadata
                         }, {
                             'breadcrumb': ['properties', 'c_pk'],
                             'metadata': {
                                 'selected-by-default': True,
                                 'sql-datatype': 'int(11)'
                             }
                         }, {
                             'breadcrumb': ['properties', 'c_varchar'],
                             'metadata': {
                                 'selected-by-default': True,
                                 'sql-datatype': 'varchar(255)'
                             }
                         }])

        # No selected-by-default metadata for c_year because it is an unsupported type
        various_types_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types'
        ][0]
        md = menagerie.get_annotated_schema(
            conn_id, various_types_catalog['stream_id'])['metadata']
        c_year_md = [
            x for x in md if x['breadcrumb'] == ['properties', 'c_year']
        ]
        self.assertEqual(c_year_md, [{
            'breadcrumb': ['properties', 'c_year'],
            'metadata': {
                'selected-by-default': False,
                'sql-datatype': 'year(4)'
            }
        }])

        # select all catalogs except tap_tester_mysql_0-simple_example
        catalogs_to_select = [
            c for c in found_catalogs
            if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example'
        ]

        for a_catalog in catalogs_to_select:
            additional_md = []
            unselected_fields = []
            if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental':
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-key': 'c_dt',
                        'replication-method': 'INCREMENTAL'
                    }
                }]
                unselected_fields = ['c_varchar_to_deselect']

            elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view':
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'view-key-properties': ['c_pk'],
                        'replication-method': 'FULL_TABLE'
                    }
                }]
            else:
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-method': 'FULL_TABLE'
                    }
                }]

            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, a_catalog,
                menagerie.get_annotated_schema(conn_id,
                                               a_catalog['stream_id']),
                additional_md, unselected_fields)
        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        expected_row_count = 8  # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1}
        self.assertEqual(
            replicated_row_count,
            expected_row_count,
            msg="failed to replicate correct number of rows: {}".format(
                record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        records_by_stream = runner.get_records_from_target_output()

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify that activate version messages were sent in the proper position
            self.assertEqual(
                recs['messages'][0]['action'],
                'activate_version',
                msg=
                "Expected first message sent for stream `{}` to have action `activate_version`"
                .format(stream))

            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # verify that the target output the proper numeric and date representations
        expected_various_types_records = [{
            'c_time': '1970-01-01T12:34:56.000000Z',
            'c_mediumint': 8388607,
            'c_smallint': 32767,
            'c_tinyint': 127,
            'c_date': '2017-09-13T00:00:00.000000Z',
            'c_bigint': 9223372036854775807,
            'c_decimal': -1,
            'c_int': 2147483647,
            'c_bit': True,
            'c_decimal_2': Decimal('123456789.0'),
            'c_pk': 1,
            'c_double': Decimal("1.234"),
            'c_float': Decimal("1.234"),
            'c_decimal_2_unsigned': Decimal("1.23"),
            'c_tinyint_1': True
        }, {
            'c_time': '1970-01-01T12:34:57.000000Z',
            'c_mediumint': -8388608,
            'c_smallint': -32768,
            'c_tinyint': -128,
            'c_date': '2017-09-14T00:00:00.000000Z',
            'c_bigint': -9223372036854775808,
            'c_decimal': 0,
            'c_int': -2147483648,
            'c_bit': False,
            'c_decimal_2': Decimal("123456790.0"),
            'c_pk': 2,
            'c_double': Decimal("2.234"),
            'c_float': Decimal("2.234"),
            'c_decimal_2_unsigned': Decimal("0.23"),
            'c_tinyint_1': False
        }, {
            'c_time': '1970-01-01T12:34:57.000000Z',
            'c_mediumint': -8388608,
            'c_smallint': -32768,
            'c_tinyint': -128,
            'c_date': '2017-09-14T00:00:00.000000Z',
            'c_bigint': -9223372036854775808,
            'c_decimal': 0,
            'c_int': -2147483648,
            'c_bit': None,
            'c_decimal_2': Decimal("123456790.0"),
            'c_pk': 3,
            'c_double': Decimal("2.234"),
            'c_float': Decimal("2.234"),
            'c_decimal_2_unsigned': Decimal("0.23"),
            'c_tinyint_1': None
        }]

        actual_various_types_records = [
            r['data']
            for r in records_by_stream['various_types']['messages'][1:4]
        ]

        self.assertEqual(
            actual_various_types_records,
            expected_various_types_records,
            msg=
            "Expected `various_types` upsert record data to be {}, but target output {}"
            .format(expected_various_types_records,
                    actual_various_types_records))

        # verify that deselected property was not output
        expected_incremental_record = {
            'c_pk': 1,
            'c_dt': '2017-01-01T00:00:00.000000Z',
            'c_varchar': 'a'
        }

        actual_incremental_record = records_by_stream['incremental'][
            'messages'][1]['data']

        self.assertEqual(
            actual_incremental_record,
            expected_incremental_record,
            msg=
            "Expected first `incremental` upsert record data to be {}, but target output {}"
            .format(expected_incremental_record, actual_incremental_record))

        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)
        bookmarks = state['bookmarks']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        for k, v in bookmarks.items():
            if k == 'tap_tester_mysql_0-incremental':
                self.assertIsNotNone(
                    v['version'],
                    msg="expected bookmark for stream `{}` to have a version set"
                    .format(k))
                self.assertEqual(
                    v['replication_key_value'],
                    '2017-01-01T00:00:02.000000Z',
                    msg=
                    "incorrect replication_key_value in bookmark for stream `{}`"
                    .format(k))
                self.assertEqual(
                    v['replication_key'],
                    'c_dt',
                    msg=
                    "incorrect replication_key specified in bookmark for stream `{}`"
                    .format(k))
            else:
                self.assertFalse(
                    'version' in v,
                    msg=
                    "expected bookmark for stream `{}` to not have a version key"
                    .format(k))
                self.assertTrue(
                    'initial_full_table_complete' in v,
                    msg=
                    "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                    .format(k))
        print("state and bookmarks are correct")

        incremental_table_initial_table_version = bookmarks[
            'tap_tester_mysql_0-incremental']['version']

        #----------------------------------------------------------------------
        # invoke the sync job again after some modifications
        #----------------------------------------------------------------------

        print("adding a column to an existing table in the source db")
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

        with connection.cursor() as cursor:
            add_column_sql = '''
                ALTER TABLE tap_tester_mysql_0.incremental
                  ADD COLUMN favorite_number INTEGER;
                INSERT INTO tap_tester_mysql_0.incremental VALUES (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999);
            '''
            cursor.execute(add_column_sql)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        expected_row_count = 7  # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1}
        self.assertEqual(
            replicated_row_count,
            expected_row_count,
            msg="failed to replicate correct number of rows: {}".format(
                record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        records_by_stream = runner.get_records_from_target_output()

        expected_schema_of_new_column = {
            'maximum': 2147483647,
            'selected': True,
            'inclusion': 'available',
            'type': ['null', 'integer'],
            'minimum': -2147483648
        }

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify that activate_version messages were sent in the proper position
            if stream == 'incremental':
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version',
                    msg=
                    "Expected first message sent for stream `{}` not to have action `activate_version`"
                    .format(stream))
                expected_schema_of_new_column = {
                    'maximum': 2147483647,
                    'inclusion': 'available',
                    'type': ['null', 'integer'],
                    'minimum': -2147483648
                }
                self.assertEqual(
                    records_by_stream[stream]['schema']['properties']
                    ['favorite_number'],
                    expected_schema_of_new_column,
                    msg=
                    "Expected newly-added column to be present in schema for stream `{}`, but it was not."
                    .format(stream))
            else:
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'upsert',
                    msg=
                    "Expected first message sent for stream `{}` to have action `upsert`"
                    .format(stream))
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version',
                    msg=
                    "Expected last message sent for stream `{}` to have action `activate_version`"
                    .format(stream))

        state = menagerie.get_state(conn_id)
        bookmarks = state['bookmarks']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        for k, v in bookmarks.items():
            if k == 'tap_tester_mysql_0-incremental':
                self.assertIsNotNone(
                    v['version'],
                    msg="expected bookmark for stream `{}` to have a version set"
                    .format(k))
                self.assertEqual(
                    v['replication_key_value'],
                    '2017-01-01T00:00:03.000000Z',
                    msg=
                    "incorrect replication_key_value in bookmark for stream `{}`"
                    .format(k))
                self.assertEqual(
                    v['replication_key'],
                    'c_dt',
                    msg=
                    "incorrect replication_key specified in bookmark for stream `{}`"
                    .format(k))
            else:
                self.assertFalse(
                    'version' in v,
                    msg=
                    "expected bookmark for stream `{}` to not have a version key"
                    .format(k))
                self.assertTrue(
                    'initial_full_table_complete' in v,
                    msg=
                    "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                    .format(k))

        print("state and bookmarks are correct")

        # verify incremental table_version didn't change
        incremental_table_new_table_version = bookmarks[
            'tap_tester_mysql_0-incremental']['version']

        self.assertEqual(
            incremental_table_initial_table_version,
            incremental_table_new_table_version,
            msg=
            "Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations."
        )
Example #7
    def setUp(self):
        self.maxDiff = None
        missing_envs = [
            x for x in [
                os.getenv('TAP_MYSQL_HOST'),
                os.getenv('TAP_MYSQL_USER'),
                os.getenv('TAP_MYSQL_PASSWORD'),
                os.getenv('TAP_MYSQL_PORT')
            ] if x is None
        ]
        if len(missing_envs) != 0:
            raise Exception(
                "set TAP_MYSQL_HOST, TAP_MYSQL_USER, TAP_MYSQL_PASSWORD, TAP_MYSQL_PORT"
            )

        print("setting up mysql databases and tables")
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

        with connection.cursor() as cursor:
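            # Helper lambdas: `flatten` turns a list of row tuples into one flat
            # parameter list; `var_string_for_table` builds a "%s, %s, ..."
            # placeholder string with one %s per column of that table's dummy_data.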
            flatten = lambda l: [item for sublist in l for item in sublist]
            var_string_for_table = lambda t: ', '.join(['%s'] * len(dummy_data[
                t][0]))
            create_databases_sql = '''
                DROP DATABASE IF EXISTS tap_tester_mysql_0;
                CREATE DATABASE tap_tester_mysql_0;
                DROP DATABASE IF EXISTS tap_tester_mysql_1;
                CREATE DATABASE tap_tester_mysql_1;
            '''
            cursor.execute(create_databases_sql)

            simple_example_table_sql = '''
                CREATE TABLE tap_tester_mysql_0.simple_example (
                    c_pk INTEGER PRIMARY KEY,
                    c_varchar VARCHAR(255));
                INSERT INTO tap_tester_mysql_0.simple_example VALUES (%s);
            ''' % var_string_for_table('simple_example')

            cursor.execute(simple_example_table_sql,
                           flatten(dummy_data['simple_example']))

            var_string = var_string_for_table('various_types')
            various_types_table_sql = '''
                CREATE TABLE tap_tester_mysql_0.various_types (
                    c_pk INTEGER PRIMARY KEY,
                    c_decimal DECIMAL,
                    c_decimal_2_unsigned DECIMAL(5, 2) UNSIGNED,
                    c_decimal_2 DECIMAL(11, 2),
                    c_tinyint TINYINT,
                    c_smallint SMALLINT,
                    c_mediumint MEDIUMINT,
                    c_int INT,
                    c_bigint BIGINT,
                    c_float FLOAT,
                    c_double DOUBLE,
                    c_bit BIT(4),
                    c_date DATE,
                    c_time TIME,
                    c_year YEAR,
                    c_tinyint_1 TINYINT(1));
                INSERT INTO tap_tester_mysql_0.various_types
                    VALUES (%s), (%s), (%s);
            ''' % (var_string, var_string, var_string)

            cursor.execute(various_types_table_sql,
                           flatten(dummy_data['various_types']))

            var_string = var_string_for_table('incremental')
            incremental_table_sql = '''
                CREATE TABLE tap_tester_mysql_0.incremental (
                    c_pk INTEGER PRIMARY KEY,
                    c_varchar VARCHAR(255),
                    c_dt DATETIME,
                    c_varchar_to_deselect VARCHAR(255));
                INSERT INTO tap_tester_mysql_0.incremental
                    VALUES (%s), (%s), (%s);
            ''' % (var_string, var_string, var_string)

            cursor.execute(incremental_table_sql,
                           flatten(dummy_data['incremental']))

            var_string = var_string_for_table('simple_example')
            isam_table_sql = '''
                CREATE TABLE tap_tester_mysql_1.my_isam (
                    c_pk INTEGER PRIMARY KEY,
                    c_varchar VARCHAR(255))
                ENGINE = MYISAM;
                INSERT INTO tap_tester_mysql_1.my_isam VALUES (%s);
            ''' % var_string_for_table('simple_example')

            cursor.execute(isam_table_sql,
                           flatten(dummy_data['simple_example']))

            view_sql = '''
                CREATE VIEW tap_tester_mysql_1.view AS SELECT * FROM tap_tester_mysql_0.simple_example;
            '''

            cursor.execute(view_sql)
Example #8
    def binlog_edge_test(self, expected_records=None):
        """
        Test binlog replication edge cases
        • Verify an initial sync returns expected records of various datatypes
        • Verify we bookmark correctly when a transaction spans multiple files
        • Insert and delete a record prior to sync. Verify both events are replicated
        • Insert and update a record prior to sync. Verify both events are replicated
        • Verify a valid log_file and log_pos state are persisted after each sync
        """

        # avoid a shared mutable default argument
        if expected_records is None:
            expected_records = []

        conn_id = connections.ensure_connection(self)

        # prior to first sync update a record...
        updated_timestamp = datetime.datetime.now()
        updated_id = 1
        expected_records[1]['our_timestamp_2'] = datetime.datetime.strftime(
            updated_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # insert a record and...
        inserted_record = self.generate_record_n(len(expected_records))
        expected_records += [inserted_record]  # TODO need to format

        # delete a record
        deleted_id = 2

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            cur.execute(
                "UPDATE {}.{} SET our_timestamp_2 = '{}' WHERE id = {}".format(
                    self.database_name(), self.table_name_1(),
                    updated_timestamp, updated_id))

            self.insert_record(cur, inserted_record, self.table_name_1())

            delete_time = datetime.datetime.now()
            cur.execute("DELETE FROM {}.{} WHERE id = {}".format(
                self.database_name(), self.table_name_1(), deleted_id))

        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
            "\nEVENTS: {} records updated".format(1) + \
            "\n        {} records inserted".format(1) + \
            "\n        {} records deleted\n\n".format(1)
        )

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        t1 = self.table_name_1()
        t2 = self.table_name_2()
        expected_check_streams = {
            self.tap_stream_id(t1),
            self.tap_stream_id(t2)
        }
        expected_sync_streams = {t1, t2}
        expected_pks = {t1: {'id'}, t2: {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        self.assertEqual(self.table_name_1(), found_catalogs[0]['stream_name'])
        self.assertEqual(self.table_name_2(), found_catalogs[1]['stream_name'])
        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        for catalog in found_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            _ = connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        # BUG missing deleted record | https://stitchdata.atlassian.net/browse/SRCE-4258
        # self.assertEqual({self.table_name_1(): len(expected_records)}, record_count_by_stream)
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name_1()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        # verify activate version messages are present
        self.assertEqual('activate_version', message_actions[0])
        self.assertEqual('activate_version', message_actions[-1])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab version, log_file and log_pos from state to check later
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]
        # we need to compare record by record since there are so many.
        # a failure comparing expected_records to upsert_records would result in
        # an output message greater in length than a standard tmux buffer
        # BUG missing datetime precision | https://stitchdata.atlassian.net/browse/SRCE-4257
        # for expected_record in expected_records:
        #     upsert_record = [rec for rec in upsert_records
        #                      if rec['id'] == expected_record['id']]
        #     self.assertEqual(1, len(upsert_record),
        #                      msg="multiple upsert_recs with same pk: {}".format(upsert_record))
        #     self.assertEqual(expected_record, upsert_record.pop())

        # TODO add check for _sdc_deleted_at for deleted record once bug addressed

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)
        self.assertEqual(record_count_by_stream, {})

        # Create 1 more record prior to 2nd sync
        new_record = self.generate_record_n(len(expected_records))
        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            self.insert_record(cur, new_record, self.table_name_1())
        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
            "\nEVENTS: {} records inserted".format(1)
        )

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased or the log_file
        # has rotated and the numeric suffix has increased
        if expected_log_file == bookmark['log_file']:
            print("PATH A")
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                 expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]
            print("PATH B")
            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        # Execute delete across tables using join prior to 3rd sync
        deleted_id = 4

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:

            delete_time = datetime.datetime.now()
            # DELETE T1, T2
            # FROM T1
            # INNER JOIN T2 ON T1.key = T2.key
            # WHERE condition;
            db = self.database_name()
            db_t1 = db + "." + t1
            db_t2 = db + "." + t2
            t1_key = db_t1 + ".id"
            t2_key = db_t2 + ".id"
            statement = "DELETE {}, {} ".format(db_t1, db_t2) + \
                "FROM {} ".format(t1) + \
                "INNER JOIN {} ON {} = {} ".format(db_t2, t1_key, t2_key) + \
                "WHERE {} = {}".format(t1_key, deleted_id)
            cur.execute(statement)

        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_2()) + \
            "\nTABLE: {}".format(self.table_name_2()) + \
            "\nEVENTS:  {} records deleted\n\n".format(1)
        )

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        target_records = runner.get_records_from_target_output()
        records_stream_1 = target_records[self.table_name_1()]
        upsert_records_1 = [
            m['data'] for m in records_stream_1['messages']
            if m['action'] == 'upsert'
        ]
        records_stream_2 = target_records[self.table_name_2()]
        upsert_records_2 = [
            m['data'] for m in records_stream_2['messages']
            if m['action'] == 'upsert'
        ]

        # make sure the record is in the target for both tables with a delete time
        deleted_at_t1 = upsert_records_1[0].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_t1)
        deleted_at_t1_timestamp = utils.strptime_to_utc(
            deleted_at_t1).timestamp()

        deleted_at_t2 = upsert_records_2[0].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_t2)
        deleted_at_t2_timestamp = utils.strptime_to_utc(
            deleted_at_t2).timestamp()

        # the delete times should be equal since it was a single transaction
        self.assertEqual(deleted_at_t1_timestamp, deleted_at_t2_timestamp)
        time_delta = delete_time.timestamp() - deleted_at_t1_timestamp
        print("Delete time vs record: difference in seconds", time_delta)
        self.assertLess(time_delta,
                        3)  # time delta less than 3 seconds in magnitude
Example #9
    def initialize_db(self, engine, log_file_size):
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

        with connection.cursor() as cur:

            create_databases_sql = """
                DROP DATABASE IF EXISTS {};
                CREATE DATABASE {}
            """.format(self.database_name(), self.database_name())

            cur.execute(create_databases_sql)
            for table_name in [self.table_name_1(), self.table_name_2()]:
                cur.execute(
                    """
                SELECT EXISTS (
                SELECT 1
                FROM  information_schema.tables
                WHERE  table_schema = %s
                AND  table_name =   %s);""",
                    [self.database_name(), table_name])

                existing_table = cur.fetchone()[0]

                if existing_table:
                    cur.execute("DROP TABLE {}.{}".format(
                        self.database_name(), table_name))

                create_table_sql = """
CREATE TABLE {}.{} (
            id                     BIGINT PRIMARY KEY,
            our_timestamp_1        TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            our_timestamp_2        TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            our_varchar_1          VARCHAR(255),
            our_varchar_2          VARCHAR(255)
)
ENGINE = {}
""".format(self.database_name(), table_name, engine)
                cur.execute(create_table_sql)

            # Ensure expected engine in use
            cur.execute(
                """
            SELECT TABLE_NAME, ENGINE
            FROM  information_schema.tables
            where  table_schema =   %s;""", [self.database_name()])
            engine_in_use = cur.fetchone()[1]
            self.assertEqual(
                engine,
                engine_in_use.upper(),
                msg="Unexpected engine in use: {}".format(engine_in_use))

            # Ensure expected log file size in use
            if log_file_size is not None:
                cur.execute("""
                SHOW VARIABLES LIKE 'innodb_log_file_size';
                """)
                log_file_size_in_use = cur.fetchone()[1]
                self.assertEqual(
                    log_file_size,
                    log_file_size_in_use,
                    msg="Unexpected log file size in use: {}".format(
                        log_file_size_in_use))

            # Change innodb_log_file_size = 3 MB (3145728)
            # the default page size is 16384 (16KB) which has a min file size req of 3 MB
            #      Timestamp is 4 bytes
            #      2 Timestamps + 2 varchar(255) ~= 518 bytes
            #      50331648 / 518 = 97165

            n_1 = 98000
            print("Generating {} records.".format(n_1))
            expected_records, records = self.create_n_records(n_1)

            print("Inserting {} records in table {}.".format(
                n_1, self.table_name_1()))
            inc = 0
            for record in records:
                self.insert_record(cur, record, self.table_name_1())

                # we are inserting many records, show some output for tester's sanity
                inc += 1
                if inc % 1000 == 0 and inc < 90001:
                    s = "{}%".format(int(100 * inc /
                                         n_1)) if inc % 9000 == 0 else "."
                    print(s, sep=' ', end='', flush=True)

            print("")  # breakline
            n_2 = 5  # first five records from table 1
            print("Inserting {} records in table {}.".format(
                n_2, self.table_name_2()))
            for i in range(n_2):
                self.insert_record(cur, records[i], self.table_name_2())

        print("\n\nMySQL DB Instantiated." + \
              "\nNAME: {}\nENGINE: {}".format(self.database_name(), engine_in_use) + \
              "\nTABLE: {}\nEVENTS: {} records inserted".format(self.table_name_1(), n_1) + \
              "\nTABLE: {}\nEVENTS: {} records inserted\n\n".format(self.table_name_2(), n_2))

        return expected_records
Example #10
    def binlog_test(self):
        """
        Test binlog replication
        • Verify an initial sync returns expected records of various datatypes
        • Verify no changes and a subsequent sync results in no replicated records
        • Update, Delete, and Insert records then verify the next sync captures these changes
        • Verify some log_file and log_pos state was persisted after each sync
        """
        print("RUNNING {}\n\n".format(self.name()))

        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        expected_check_streams = {self.tap_stream_id()}
        expected_sync_streams = {self.table_name()}
        expected_pks = {self.table_name(): {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual(self.table_name(), test_catalog['stream_name'])

        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog,
            menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {self.table_name(): 2})
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(
            message_actions,
            ['activate_version', 'upsert', 'upsert', 'activate_version'])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]
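        # for reference, the persisted state has roughly this shape
        # (the values shown here are illustrative, not asserted anywhere):
        # {'bookmarks': {'<tap_stream_id>': {'log_file': 'mysql-bin.000003',
        #                                    'log_pos': 154,
        #                                    'version': 1234567890123}}}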

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab the table version from the target output to compare against state later
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        self.assertEqual([expected_rec_1, expected_rec_2], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version in state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the target (gate)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {})

        # run some inserts, updates, and deletes in source
        updated_rec_1_varchar = 'THIS HAS BEEN UPDATED'

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            cur.execute(
                "UPDATE {}.{} SET our_varchar = '{}' WHERE id = {}".format(
                    self.database_name(), self.table_name(),
                    updated_rec_1_varchar, rec_1['id']))

            delete_time = datetime.datetime.now()
            cur.execute("DELETE FROM {}.{} WHERE id = {}".format(
                self.database_name(), self.table_name(), rec_2['id']))

            self.insert_record(cur, rec_3)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased or the log_file
        # has rotated and the numeric suffix has increased
        if expected_log_file == bookmark['log_file']:
            print("PATH A")
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                 expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]
            print("PATH B")
            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        updated_expected_rec_1 = copy.deepcopy(expected_rec_1)
        updated_expected_rec_2 = copy.deepcopy(expected_rec_2)
        updated_expected_rec_3 = copy.deepcopy(expected_rec_3)

        updated_expected_rec_1['our_varchar'] = updated_rec_1_varchar

        # Floats that come back from binlog provide more precision
        # than from SELECT based queries
        updated_expected_rec_1['our_unsigned_float'] = Decimal(
            "1.2345000505447388")
        updated_expected_rec_1['our_signed_float'] = -Decimal(
            "1.2345000505447388")
        #        updated_expected_rec_1['_sdc_deleted_at'] = None
        updated_expected_rec_2['our_unsigned_float'] = Decimal(
            "2.4690001010894775")
        updated_expected_rec_2['our_signed_float'] = -Decimal(
            "2.4690001010894775")
        #        updated_expected_rec_2['_sdc_deleted_at'] = None
        #        updated_expected_rec_3['_sdc_deleted_at'] = None

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # check for expected records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {self.table_name(): 3})

        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions, ['upsert', 'upsert', 'upsert'])

        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        deleted_at_rec = upsert_records[1].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_rec)
        deleted_at_rec_timestamp = utils.strptime_to_utc(
            deleted_at_rec).timestamp()
        time_delta = delete_time.timestamp() - deleted_at_rec_timestamp
        print("Delete time vs record: difference in seconds", time_delta)
        # the recorded deletion time should land within a few seconds of the delete
        self.assertLess(time_delta, 3)

        # since we don't know exactly what the _sdc_deleted_at value will be
        # we will make the assertions we can make on that field here
        # and then remove it from all records prior to doing a full
        # record-level comparison
        self.assertIn('_sdc_deleted_at', upsert_records[0])
        self.assertIn('_sdc_deleted_at', upsert_records[1])
        self.assertIn('_sdc_deleted_at', upsert_records[2])
        self.assertIsNone(upsert_records[0].get('_sdc_deleted_at'))
        self.assertIsNotNone(upsert_records[1].get('_sdc_deleted_at'))
        self.assertIsNone(upsert_records[2].get('_sdc_deleted_at'))
        del upsert_records[0]['_sdc_deleted_at']
        del upsert_records[1]['_sdc_deleted_at']
        del upsert_records[2]['_sdc_deleted_at']

        self.assertEqual([
            updated_expected_rec_1, updated_expected_rec_2,
            updated_expected_rec_3
        ], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the target (gate)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {})
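The rotation check above (PATH A / PATH B) can be read as a small predicate over two binlog bookmarks. A minimal sketch; the function name is illustrative and not part of the test suite:

import re


def binlog_bookmark_advanced(old_file, old_pos, new_file, new_pos):
    """True if (new_file, new_pos) is strictly ahead of (old_file, old_pos).

    Binlog files are named like 'mysql-bin.000003'; if the file is unchanged
    the position must grow, otherwise the numeric file suffix must grow."""
    if old_file == new_file:
        return new_pos > old_pos
    old_suffix = int(re.search(r'^.*\.(\d+)$', old_file).group(1))
    new_suffix = int(re.search(r'^.*\.(\d+)$', new_file).group(1))
    return new_suffix > old_suffix


# binlog_bookmark_advanced('mysql-bin.000001', 154, 'mysql-bin.000002', 4) -> True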
Beispiel #11
0
    def initialize_db(self, engine):
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

        with connection.cursor() as cur:
            create_databases_sql = """
                DROP DATABASE IF EXISTS {};
                CREATE DATABASE {};
            """.format(self.database_name(), self.database_name())

            cur.execute(create_databases_sql)
            cur.execute(
                """
            SELECT EXISTS (
            SELECT 1
            FROM  information_schema.tables
            WHERE  table_schema = %s
            AND  table_name =   %s);""",
                [self.database_name(), self.table_name()])

            existing_table = cur.fetchone()[0]

            if existing_table:
                cur.execute("DROP TABLE {}.{}".format(self.database_name(),
                                                      self.table_name()))

            create_table_sql = """
CREATE TABLE {}.{} (
            id                     BIGINT PRIMARY KEY,
            our_char               CHAR,
            our_enum               ENUM('one', 'two', 'three'),
            our_longtext           LONGTEXT,
            our_mediumtext         MEDIUMTEXT,
            our_text               TEXT,
            our_varchar            VARCHAR(255),
            our_unsigned_tinyint   TINYINT UNSIGNED,
            our_signed_tinyint     TINYINT,
            our_unsigned_smallint  SMALLINT UNSIGNED,
            our_signed_smallint    SMALLINT,
            our_unsigned_mediumint MEDIUMINT UNSIGNED,
            our_signed_mediumint   MEDIUMINT,
            our_unsigned_int       INT UNSIGNED,
            our_signed_int         INT,
            our_unsigned_bigint    BIGINT UNSIGNED,
            our_signed_bigint      BIGINT,
            our_unsigned_decimal_1 DECIMAL(11,2) UNSIGNED,
            our_signed_decimal_1   DECIMAL(11,2),
            our_unsigned_decimal_2 DECIMAL UNSIGNED,
            our_signed_decimal_2   DECIMAL,
            our_unsigned_float     FLOAT UNSIGNED,
            our_signed_float       FLOAT,
            our_unsigned_double    DOUBLE UNSIGNED,
            our_signed_double      DOUBLE,
            our_bit_1              BIT(1),
            our_datetime           DATETIME,
            our_timestamp          TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            our_date               DATE,
            our_time               TIME,
            our_boolean            BOOLEAN
)
ENGINE = {}
""".format(self.database_name(), self.table_name(), engine)

            cur.execute(create_table_sql)

            # Ensure expected engine in use
            cur.execute(
                """
            SELECT TABLE_NAME, ENGINE
            FROM information_schema.tables
            WHERE table_schema = %s;""", [self.database_name()])
            # note: reads the raw rows via the cursor's private _result attribute
            engine_in_use = cur._result.rows[0][1]
            self.assertEqual(
                engine,
                engine_in_use.upper(),
                msg="Unexpected engine in use: {}".format(engine_in_use))

            for record in [rec_1, rec_2]:
                self.insert_record(cur, record)

        print("\n\nMySQL DB Instantiated." + \
              "\nNAME: {}\nENGINE: {}".format(self.database_name(), engine_in_use) + \
              "\nTABLE: {}\nEVENTS: 2 records inserted\n\n".format(self.table_name()))
Beispiel #12
0
def create_user_table() -> None:
    conn = get_db_connection()
    conn.execute('CREATE TABLE user (user_name TEXT PRIMARY KEY, user_alias TEXT)')
    conn.commit()
    conn.close()
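A minimal usage sketch for the table above, assuming get_db_connection() returns a sqlite3-style connection as the surrounding helpers suggest; the add_user name is illustrative:

def add_user(user_name: str, user_alias: str) -> None:
    conn = get_db_connection()
    conn.execute('INSERT INTO user (user_name, user_alias) VALUES (?, ?)',
                 (user_name, user_alias))
    conn.commit()
    conn.close()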
Beispiel #13
0
def create_meetings_table() -> None:
    conn = get_db_connection()
    conn.execute('CREATE TABLE meeting (meeting_id TEXT, meeting_pw TEXT, '
                 'user_name TEXT, meeting_name TEXT, meeting_company TEXT, FOREIGN KEY (user_name) REFERENCES user(user_name))')
    conn.commit()
    conn.close()
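And a minimal query sketch joining the two tables above on the user_name foreign key, under the same sqlite3-style connection assumption; the function name is illustrative:

def get_meetings_for_user(user_name: str) -> list:
    conn = get_db_connection()
    rows = conn.execute(
        'SELECT m.meeting_id, m.meeting_name, u.user_alias '
        'FROM meeting m JOIN user u ON m.user_name = u.user_name '
        'WHERE u.user_name = ?', (user_name,)).fetchall()
    conn.close()
    return rows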