def select_all_streams_and_fields(conn_id, catalogs, select_all_fields: bool = True):
    """Select every stream; with select_all_fields False, deselect every
    discovered property except the stream's replication key."""
    # HACK: This mapping can be removed if the tap unwraps envelope
    # objects and declares replication keys as automatic.
    replication_key_by_stream = {
        'issues': 'fields',      # This contains replication key for issues
        'worklogs': 'updated',   # Replication key for worklogs
    }
    for entry in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, entry['stream_id'])
        deselected = []
        if not select_all_fields:
            # get a list of all properties so that none are selected
            deselected = set(
                annotated.get('annotated-schema', {}).get('properties', {}).keys())
            # keep the replication key selected where one applies
            deselected.discard(replication_key_by_stream.get(entry["tap_stream_id"]))
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, annotated, [], deselected)
def select_all_streams_and_fields(self, conn_id, catalogs, select_all_fields: bool = True, select_default_fields: bool = False):
    """Select every stream; optionally deselect all fields, keeping the
    minimum valid field selection for custom-report streams when asked."""
    for entry in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, entry['stream_id'])
        if select_all_fields:
            to_deselect = []
        else:
            # deselect every discovered property ...
            to_deselect = set(
                annotated.get('annotated-schema', {}).get('properties', {}).keys())
            stream_name = entry['stream_name']
            if select_default_fields and self.is_custom_report(stream_name):
                # ... except the minimum valid selection for custom reports
                to_deselect -= set(
                    self.custom_report_minimum_valid_field_selection(stream_name))
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, annotated, [], to_deselect)
def select_found_catalogs(self, conn_id, found_catalogs, only_streams=None):
    """Select the discovered catalogs (optionally limited to only_streams)
    together with all of their fields."""
    wanted = [c for c in found_catalogs
              if not only_streams or c["tap_stream_id"] in only_streams]
    entries = []
    for catalog in wanted:
        schema = menagerie.select_catalog(conn_id, catalog)
        entries.append({
            "key_properties": catalog.get("key_properties"),
            "schema": schema,
            "tap_stream_id": catalog.get("tap_stream_id"),
            "replication_method": catalog.get("replication_method"),
            "replication_key": catalog.get("replication_key"),
        })
    for entry in entries:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, {"annotated-schema": entry['schema']})
def _select_streams_and_fields(self, conn_id, catalogs, select_default_fields, select_pagination_fields):
    """Select every stream, keeping only its default fields, only its
    pagination fields, or no fields at all."""
    for catalog in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        # field-level metadata entries carry breadcrumbs of the form
        # ['properties', <field-name>]; collect the discovered field names
        discovered_fields = {
            entry['breadcrumb'][-1]
            for entry in annotated['metadata']
            if entry['breadcrumb'] and entry['breadcrumb'][0] == 'properties'
        }
        stream_name = catalog['stream_name']
        if select_default_fields:
            keep = self.expected_default_fields()[stream_name]
        elif select_pagination_fields:
            keep = self.expected_pagination_fields()[stream_name]
        else:
            keep = set()
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, annotated, [], discovered_fields - set(keep))
def select_found_catalogs(self, conn_id, catalogs, only_streams=None, deselect_all_fields: bool = False, non_selected_props=None):
    """Select streams (optionally limited to only_streams) and their fields.

    Args:
        conn_id: tap-tester connection id.
        catalogs: discovered catalog entries.
        only_streams: optional collection of stream names to restrict selection to.
        deselect_all_fields: when True, deselect every discovered property.
        non_selected_props: explicit properties to deselect when
            deselect_all_fields is False (defaults to none).
    """
    # Fix: the default used to be a mutable [] shared across calls;
    # use None and create a fresh list per invocation.
    if non_selected_props is None:
        non_selected_props = []
    for catalog in catalogs:
        if only_streams and catalog["stream_name"] not in only_streams:
            continue
        schema = menagerie.get_annotated_schema(conn_id, catalog["stream_id"])
        if deselect_all_fields:
            # get a list of all properties so that none are selected
            non_selected_properties = schema.get(
                "annotated-schema", {}).get("properties", {}).keys()
        else:
            non_selected_properties = non_selected_props
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema,
            additional_md=[], non_selected_fields=non_selected_properties)
def do_test(self, conn_id):
    """Select the expected streams, run a sync, and verify exit codes,
    replicated row counts, per-stream volume, and primary-key presence.

    Args:
        conn_id: tap-tester connection id.
    """
    # Select our catalogs (only the streams this test expects to sync)
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        # Fix: removed unused local `c_metadata = metadata.to_map(...)`
        connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced (sum replaces the reduce(lambda ...) idiom)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for PK(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages', [])
        if stream in ['tickets', 'groups', 'users']:
            # these streams are expected to carry more than 100 records
            self.assertGreater(len(messages), 100,
                               msg="Stream {} has fewer than 100 records synced".format(stream))
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="Missing primary-key for message {}".format(m))
def select_all_streams_and_fields(self, conn_id, catalogs, select_all_fields: bool = True, exclude_streams=None):
    """Select streams (minus any excluded ones); select either all fields
    or only the automatic ones."""
    for catalog in catalogs:
        stream_name = catalog.get('stream_name')
        if exclude_streams and stream_name in exclude_streams:
            continue
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        if select_all_fields:
            to_deselect = []
        else:
            discovered = schema.get('annotated-schema', {}).get('properties', {})
            # remove properties that are automatic
            # NOTE: this intentionally mutates the nested schema dict in place,
            # matching the original behavior — the pruned schema is what gets
            # passed to the selection call below.
            for prop in self.expected_automatic_fields().get(stream_name, []):
                discovered.pop(prop, None)
            to_deselect = discovered.keys()
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema,
            additional_md=[], non_selected_fields=to_deselect)
def select_all_streams_and_fields(conn_id, catalogs):
    """Select every discovered stream along with all of its fields."""
    for entry in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, entry['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, entry, annotated)
def test_run(self):
    """Seed every stream's bookmark far into the future, sync, and verify
    that no bookmarked stream replicates data and state is left unchanged."""
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select all catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    future_time = "2050-01-01T00:00:00.000000Z"

    # seed state with bookmarks in the far future so nothing should replicate
    future_bookmarks = {"currently_syncing" : None,
                        "bookmarks": {"contacts" : {"offset" : {}, "versionTimestamp" : future_time},
                                      "subscription_changes" : {"startTimestamp" : future_time, "offset" : {}},
                                      "campaigns" : {"offset" : {}},
                                      "forms" : {"updatedAt" : future_time},
                                      "deals" : {"offset" : {}, "hs_lastmodifieddate" : future_time},
                                      "workflows" : {"updatedAt" : future_time},
                                      "owners" : {"updatedAt" : future_time},
                                      "contact_lists" : {"updatedAt" : future_time, "offset" : {}},
                                      "email_events" : {"startTimestamp" : future_time, "offset" : {}},
                                      "companies" : {"offset" : {}, "hs_lastmodifieddate" : future_time},
                                      "engagements" : {"lastUpdated" : future_time, "offset" : {}}}}

    menagerie.set_state(conn_id, future_bookmarks)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    # because the bookmarks were set into the future, we should NOT actually replicate any data.
    # minus campaigns, and deal_pipelines because those endpoints do NOT suppport bookmarks
    streams_with_bookmarks = self.expected_sync_streams()
    streams_with_bookmarks.remove('campaigns')
    streams_with_bookmarks.remove('deal_pipelines')
    bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
    self.assertEqual(len(bad_streams), 0, msg="still pulled down records from {} despite future bookmarks".format(bad_streams))

    state = menagerie.get_state(conn_id)

    # NB: Companies and engagements won't set a bookmark in the future.
    state["bookmarks"].pop("companies")
    state["bookmarks"].pop("engagements")
    future_bookmarks["bookmarks"].pop("companies")
    future_bookmarks["bookmarks"].pop("engagements")

    self.assertEqual(state, future_bookmarks, msg="state should not have been modified because we didn't replicate any data")

    bookmarks = state.get('bookmarks')
    # NOTE(review): these two locals are not used afterwards — possibly leftover
    bookmark_streams = set(state.get('bookmarks').keys())
def test_catalog_without_properties(self):
    """Sync a CSV containing empty lines and verify the empty lines are
    skipped while every real row is emitted with the expected data."""
    self.setUpTestEnvironment()

    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(len(found_catalogs), 1,
                     msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset, msg="Expected check streams are not subset of discovered catalog")

    our_catalogs = [c for c in found_catalogs if c.get(
        'tap_stream_id') in self.expected_streams()]

    # Select our catalogs
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(
            self.conn_id, c['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    synced_records = runner.get_records_from_target_output()
    upsert_messages = [m for m in synced_records.get(
        'csv_with_empty_lines').get('messages') if m['action'] == 'upsert']

    records = [message.get('data') for message in upsert_messages]

    # Empty line should be ignored in emitted records. `_sdc_extra` holds
    # spill-over values beyond the header row (see rows 2 and 5 below).
    expected_records = [
        {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 2},
        {'id': 2, 'name': 'Bob', '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 3},
        {'id': 3, '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 4},
        {'id': 4, 'name': 'Alice', '_sdc_extra': [{'no_headers': ['Ben', '5']}, {
            'name': 'Barak'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 5}
    ]

    self.assertListEqual(expected_records, records)
def test_run(self):
    """Verify discovery matches expectations, primary keys are marked
    automatic in metadata, and every synced record carries its automatic
    fields."""
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

    diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])

        if c['stream_name'] in self.expected_sync_streams().keys():
            stream = c['stream_name']
            pks = self.expected_sync_streams()[stream]

            for pk in pks:
                # each primary key must be marked inclusion == 'automatic'
                # (field-level metadata breadcrumbs are ['properties', <field>])
                mdata = next((m for m in catalog_entry['metadata']
                              if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None)
                print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

        connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams())
    replicated_row_count = reduce(lambda accum,c : accum + c, first_record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Verify that automatic fields are all emitted with records
    synced_records = runner.get_records_from_target_output()
    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name))

        for record_keys in record_messages:
            # every expected field for the stream must be present in each record
            self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
def test_run(self):
    """Deselect a handful of fields per stream (skipping 'ads' entirely),
    sync, and verify excluded fields never appear in emitted records."""
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

    diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    all_excluded_fields = {}

    # select all catalogs
    for c in found_catalogs:
        if c['stream_name'] == 'ads':
            continue

        discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema']
        # exclude up to five non-automatic fields per stream
        # NOTE(review): this takes keys of the annotated-schema dict itself rather
        # than of its 'properties' — confirm these keys are actual field names
        all_excluded_fields[c['stream_name']] = list(set(discovered_schema.keys()) - self.expected_automatic_fields().get(c['stream_name'], set()))[:5]

        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, discovered_schema, non_selected_fields=all_excluded_fields[c['stream_name']])

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating the the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()

    # 'ads' was never selected, so it must not appear in the output
    self.assertTrue('ads' not in synced_records.keys())

    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        for record_keys in record_messages:
            # The intersection should be empty
            self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
def test_run(self): conn_id = connections.ensure_connection(self, payload_hook=None) # Run the tap in check mode check_job_name = runner.run_check_mode(self, conn_id) # Verify the check's exit status exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Verify that there are catalogs found found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_check_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") # # # Select some catalogs our_catalogs = [ c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for catalog in our_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) connections.select_catalog_and_fields_via_metadata( conn_id, catalog, schema, [], []) # # Verify that all streams sync at least one row for initial sync # # This test is also verifying access token expiration handling. If test fails with # # authentication error, refresh token was not replaced after expiring. menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) zero_count_streams = { k for k, v in record_count_by_stream.items() if v == 0 } self.assertFalse( zero_count_streams, msg="The following streams did not sync any rows {}".format( zero_count_streams))
def test_run(self): conn_id = connections.ensure_connection(self) # Run the tap in check mode check_job_name = runner.run_check_mode(self, conn_id) # Verify the check's exit status exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Verify that there are catalogs found found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_check_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") # Select some catalogs our_catalogs = [ c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for catalog in our_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) connections.select_catalog_and_fields_via_metadata( conn_id, catalog, schema) # Clear State and run sync menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify rows were synced record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count))
def select_all_streams_and_fields(conn_id, catalogs, select_all_fields: bool = True):
    """Select every stream; deselect all of its discovered properties when
    select_all_fields is False."""
    for entry in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, entry['stream_id'])
        if select_all_fields:
            deselected = []
        else:
            # deselect every discovered property
            deselected = annotated.get('annotated-schema', {}).get(
                'properties', {}).keys()
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, annotated, [], deselected)
def test_run(self): conn_id = self.create_connection() # Select our catalogs our_catalogs = [ c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for c in our_catalogs: c_annotated = menagerie.get_annotated_schema( conn_id, c['stream_id']) c_metadata = metadata.to_map(c_annotated['metadata']) connections.select_catalog_and_fields_via_metadata( conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify actual rows were synced record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Ensure all records have a value for PK(s) records = runner.get_records_from_target_output() for stream in self.expected_sync_streams(): messages = records.get(stream).get('messages') for m in messages: pk_set = self.expected_pks()[stream] for pk in pk_set: self.assertIsNotNone(m.get('data', {}).get(pk), msg="oh no! {}".format(m)) bookmarks = menagerie.get_state(conn_id)['bookmarks'] self.assertTrue('orders' in bookmarks)
def test_run(self):
    """Sync all streams and verify that each insights stream's bookmark
    (date_start) lands on the configured end_date."""
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

    diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    #selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
    #menagerie.post_annotated_catalogs(conn_id, selected_catalogs)
    for c in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(conn_id, c, menagerie.get_annotated_schema(conn_id, c['stream_id']))

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # bookmarks for the insights streams should equal the configured end_date
    states = menagerie.get_state(conn_id)["bookmarks"]
    end_date = self.get_properties()["end_date"].split()[0]
    for k, v in states.items():
        if "insights" in k:
            bm_date = v.get("date_start")
            self.assertEqual(end_date, bm_date)
    print("bookmarks match end_date of {}".format(end_date))
def select_found_catalogs(self, found_catalogs):
    """Select each discovered catalog with every field kept selected."""
    for entry in found_catalogs:
        annotated = menagerie.get_annotated_schema(self.conn_id, entry['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, entry, annotated,
            additional_md=[], non_selected_fields=[])
def select_specific_fields(conn_id, catalogs, select_all_fields: bool = True, specific_fields=None):
    """Select every stream; with select_all_fields False keep only the
    per-stream fields named in specific_fields selected.

    Args:
        conn_id: tap-tester connection id.
        catalogs: discovered catalog entries.
        select_all_fields: when True, no fields are deselected.
        specific_fields: mapping of stream_name -> fields to keep selected
            (defaults to no per-stream overrides).
    """
    # Fix: the default used to be a mutable {} shared across calls.
    if specific_fields is None:
        specific_fields = {}
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        non_selected_properties = []
        if not select_all_fields:
            # get a list of all properties and remove the requested fields
            non_selected_properties = set(schema.get('annotated-schema', {}).get(
                'properties', {}).keys())
            spec_fields = specific_fields.get(catalog['stream_name'], set())
            # Fix: compute the adjusted set only on this branch and fall back to
            # [] otherwise, so the all-fields path never references stale data.
            non_selected_properties = non_selected_properties.difference(spec_fields)

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], non_selected_properties)
def test_run(self):
    """Sync the expected streams and verify each stream's bookmark matches
    the max value seen in the records, offsets are cleared, and no
    unexpected bookmarks or currently_syncing remain in state."""
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # Select all Catalogs
    for catalog in found_catalogs:
        if catalog['tap_stream_id'] in self.expected_sync_streams():
            connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    #clear state
    menagerie.set_state(conn_id, {})

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    max_bookmarks_from_records = runner.get_most_recent_records_from_target(self, self.expected_bookmarks(), self.get_properties()['start_date'])

    # subscription_changes and email_events are expected to bookmark at the
    # start of the current UTC day rather than a record value
    start_of_today = utils.strftime(datetime.datetime(datetime.datetime.utcnow().year, datetime.datetime.utcnow().month, datetime.datetime.utcnow().day, 0, 0, 0, 0, datetime.timezone.utc))
    max_bookmarks_from_records['subscription_changes'] = start_of_today
    max_bookmarks_from_records['email_events'] = start_of_today

    #if we didn't replicate data, the bookmark should be the start_date
    for k in self.expected_bookmarks().keys():
        if max_bookmarks_from_records.get(k) is None:
            max_bookmarks_from_records[k] = utils.strftime(datetime.datetime(2017, 5, 1, 0, 0, 0, 0, datetime.timezone.utc))

    state = menagerie.get_state(conn_id)
    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())

    #verify bookmarks and offsets
    for k,v in sorted(list(self.expected_bookmarks().items())):
        for w in v:
            bk_value = bookmarks.get(k,{}).get(w)
            self.assertEqual(utils.strptime_with_tz(bk_value),
                             utils.strptime_with_tz(max_bookmarks_from_records[k]),
                             "Bookmark {} ({}) for stream {} should have been updated to {}".format(bk_value, w, k, max_bookmarks_from_records[k]))
            print("bookmark {}({}) updated to {} from max record value {}".format(k, w, bk_value, max_bookmarks_from_records[k]))

    for k,v in self.expected_offsets().items():
        self.assertEqual(bookmarks.get(k,{}).get('offset', {}), v, msg="unexpected offset found for stream {} {}. state: {}".format(k, v, state))
        print("offsets {} cleared".format(k))

    # only the acceptable set of streams may have bookmarks in state
    diff = bookmark_streams.difference(self.acceptable_bookmarks())
    self.assertEqual(len(diff), 0, msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(diff, self.acceptable_bookmarks(), bookmarks))

    self.assertEqual(state.get('currently_syncing'), None,"Unexpected `currently_syncing` bookmark value: {} Expected: None".format(state.get('currently_syncing')))
def run_test(self):
    """Discover, verify table-key-properties metadata, select everything,
    sync, and verify per-stream row counts match expectations."""
    conn_id = connections.ensure_connection(self)

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalogs(conn_id)
    found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(), found_catalog_names)

    for tap_stream_id in self.expected_check_streams():
        found_stream = [
            c for c in catalog if c['tap_stream_id'] == tap_stream_id
        ][0]
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, found_stream['stream_id'])
        main_metadata = schema_and_metadata["metadata"]
        # table-level metadata entries have an empty breadcrumb
        stream_metadata = [
            mdata for mdata in main_metadata if mdata["breadcrumb"] == []
        ]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[tap_stream_id],
            set(stream_metadata[0]['metadata']['table-key-properties']))

    for stream_catalog in catalog:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema['annotated-schema'], [])

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_first_sync_streams(),
        self.expected_pks())

    # Verify that the full table was synced
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(
            self.expected_first_sync_row_counts()[tap_stream_id],
            record_count_by_stream[tap_stream_id])
def select_specific_catalog(self, found_catalogs, catalog_to_select):
    """Select the first catalog whose tap_stream_id matches
    catalog_to_select, keeping all of its fields; do nothing on no match."""
    match = next((c for c in found_catalogs
                  if c['tap_stream_id'] == catalog_to_select), None)
    if match is None:
        return
    annotated = menagerie.get_annotated_schema(self.conn_id, match['stream_id'])
    connections.select_catalog_and_fields_via_metadata(
        self.conn_id, match, annotated,
        additional_md=[], non_selected_fields=[])
def test_run(self):
    """Sync all testable streams and compare the fields emitted in records
    against the schema, tolerating the KNOWN_MISSING_FIELDS per stream."""
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # Select only the expected streams tables
    expected_streams = self.testable_streams()
    catalog_entries = [
        ce for ce in found_catalogs
        if ce['tap_stream_id'] in expected_streams
    ]
    for catalog_entry in catalog_entries:
        stream_schema = menagerie.get_annotated_schema(
            conn_id, catalog_entry['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog_entry, stream_schema)

    # Run sync
    first_record_count_by_stream = self.run_and_verify_sync(conn_id)
    replicated_row_count = sum(first_record_count_by_stream.values())
    synced_records = runner.get_records_from_target_output()

    # Test by Stream
    for stream in self.testable_streams():
        with self.subTest(stream=stream):
            expected_fields = set(
                synced_records.get(stream)['schema']['properties'].keys())
            print('Number of expected keys ', len(expected_fields))
            actual_fields = set(
                runner.examine_target_output_for_fields()[stream])
            print('Number of actual keys ', len(actual_fields))
            print('Number of known missing keys ',
                  len(KNOWN_MISSING_FIELDS[stream]))

            # NOTE(review): "new fields" here are known-missing fields that
            # actually showed up in the output — confirm the warning wording
            unexpected_fields = actual_fields & KNOWN_MISSING_FIELDS[stream]
            if unexpected_fields:
                print('WARNING: Found new fields: {}'.format(
                    unexpected_fields))
            # schema fields must equal emitted fields plus the known-missing set
            self.assertSetEqual(
                expected_fields, actual_fields | KNOWN_MISSING_FIELDS[stream])
def select_streams_and_fields(self, conn_id, catalog, select_all_fields: bool = False):
    """Select the stream with either every field or none, attaching
    table-level replication metadata for the configured default method."""
    annotated = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

    # Build the table-level (empty breadcrumb) replication metadata.
    method = self.default_replication_method
    if method is self.FULL_TABLE:
        replication_md = {"replication-method": self.FULL_TABLE}
    elif method is self.INCREMENTAL:
        replication_md = {
            "replication-method": self.INCREMENTAL,
            "replication-key": "our_integer"
        }
    else:
        replication_md = {"replication-method": self.LOG_BASED}
    additional_md = [{"breadcrumb": [], "metadata": replication_md}]

    if select_all_fields:
        deselected = []
    else:
        # deselect every discovered property
        deselected = annotated.get('annotated-schema', {}).get(
            'properties', {}).keys()

    connections.select_catalog_and_fields_via_metadata(
        conn_id, catalog, annotated, additional_md, deselected)
def test_run(self):
    """End-to-end FULL_TABLE replication test against Postgres.

    Performs three syncs on the same table:
      1. initial sync of the 3 seeded records (activate_version fencing both ends),
      2. a second sync with no db changes — same 3 records, new table version,
      3. a third sync after inserts/updates/deletes, verifying that rows deleted
         before the sync are NOT replicated under full-table replication.
    After every sync the bookmark is asserted to carry only a version (no lsn
    or replication-key state).
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify discovery produced (at least) 1 expected catalog
    found_catalogs = [
        found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
        if found_catalog['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(len(found_catalogs), 1)

    # verify the tap discovered the expected streams
    found_catalog_names = {
        catalog['tap_stream_id']
        for catalog in found_catalogs
    }
    self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(test_table_name, test_catalog['stream_name'])
    print("discovered streams are correct")

    # perform table selection
    print('selecting {} and all fields within the table'.format(
        test_table_name))
    schema_and_metadata = menagerie.get_annotated_schema(
        conn_id, test_catalog['stream_id'])
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'FULL_TABLE'
        }
    }]
    _ = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog, schema_and_metadata, additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync job 1 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_1 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    # (activate_version bracketing the upserts is the full-table pattern)
    self.assertEqual(5, len(messages))
    self.assertEqual('activate_version', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the persisted schema matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records match expectations
    self.assertDictEqual(self.expected_records[0], messages[1]['data'])
    self.assertDictEqual(self.expected_records[1], messages[2]['data'])
    self.assertDictEqual(self.expected_records[2], messages[3]['data'])
    print("records are correct")

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations:
    # full-table bookmarks carry only a table version — no lsn and no
    # replication-key state
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_1, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN and get the same 3 records
    #----------------------------------------------------------------------

    # run sync job 2 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_2 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    # (only a trailing activate_version on resyncs)
    self.assertEqual(4, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('activate_version', messages[3]['action'])

    # verify the new table version increased on the second sync
    self.assertGreater(table_version_2, table_version_1)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[0], messages[0]['data'])
    self.assertDictEqual(self.expected_records[1], messages[1]['data'])
    self.assertDictEqual(self.expected_records[2], messages[2]['data'])

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_2, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN following various manipulations to the data
    #----------------------------------------------------------------------

    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

            # NB | We will perform the following actions prior to the next sync:
            #      [Action (EXPECTED RESULT)]
            #      Insert a record
            #      Insert a record to be updated prior to sync
            #      Insert a record to be deleted prior to sync (NOT REPLICATED)
            #      Update an existing record
            #      Update a newly inserted record
            #      Delete an existing record
            #      Delete a newly inserted record

            # inserting...
            # a new record
            nyc_tz = pytz.timezone('America/New_York')
            our_time_offset = "-04:00"
            our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(6, 6, 6)
            our_time_tz = our_time.isoformat() + our_time_offset
            our_date = datetime.date(1970, 7, 1)
            my_uuid = str(uuid.uuid1())
            # quote_ident is used for column names containing spaces
            self.inserted_records.append({
                'our_varchar': "our_varchar 2",
                'our_varchar_10': "varchar_10",
                'our_text': "some text 2",
                'our_integer': 44101,
                'our_smallint': 2,
                'our_bigint': 1000001,
                'our_decimal': decimal.Decimal('9876543210.02'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '1',
                'our_json': json.dumps({'nymn': 77}),
                'our_jsonb': json.dumps({'burgers': 'good++'}),
                'our_uuid': my_uuid,
                'our_citext': 'cyclops 2',
                'our_store': 'dances=>"floor",name=>"betty"',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': '$0.98789'
            })
            self.expected_records.append({
                'id': 4,
                'our_varchar': "our_varchar 2",
                'our_varchar_10': "varchar_10",
                'our_text': "some text 2",
                'our_integer': 44101,
                'our_smallint': 2,
                'our_bigint': 1000001,
                'our_decimal': decimal.Decimal('9876543210.02'),
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'OUR DATE': '1970-07-01T00:00:00+00:00',
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': True,
                'our_json': '{"nymn": 77}',
                'our_jsonb': '{"burgers": "good++"}',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_citext': self.inserted_records[-1]['our_citext'],
                'our_store': {
                    "name": "betty",
                    "dances": "floor"
                },
                'our_cidr': self.inserted_records[-1]['our_cidr'],
                'our_inet': self.inserted_records[-1]['our_inet'],
                'our_mac': self.inserted_records[-1]['our_mac'],
                'our_money': '$0.99',
                'our_alignment_enum': None,
            })

            # a new record which we will then update prior to sync
            our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(12, 11, 10)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1999, 9, 9)
            my_uuid = str(uuid.uuid1())
            self.inserted_records.append({
                'our_varchar': "our_varchar 4",
                'our_varchar_10': "varchar_3",
                'our_text': "some text 4",
                'our_integer': 55200,
                'our_smallint': 1,
                'our_bigint': 100000,
                'our_decimal': decimal.Decimal('1234567899.99'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '0',
                'our_json': json.dumps('some string'),
                'our_jsonb': json.dumps(['burgers are good']),
                'our_uuid': my_uuid,
                'our_store': 'size=>"small",name=>"betty"',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
            })
            self.expected_records.append({
                'our_decimal': decimal.Decimal('1234567899.99'),
                'our_text': 'some text 4',
                'our_bit': False,
                'our_integer': 55200,
                'our_double': decimal.Decimal('1.1'),
                'id': 5,
                'our_json': self.inserted_records[-1]['our_json'],
                'our_boolean': True,
                'our_jsonb': self.inserted_records[-1]['our_jsonb'],
                'our_bigint': 100000,
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'our_store': {
                    "name": "betty",
                    "size": "small"
                },
                'our_smallint': 1,
                'OUR DATE': '1999-09-09T00:00:00+00:00',
                'our_varchar': 'our_varchar 4',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_real': decimal.Decimal('1.2'),
                'our_varchar_10': 'varchar_3',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
                'our_alignment_enum': None,
            })

            # a new record to be deleted prior to sync
            our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(12, 11, 10)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1999, 9, 9)
            my_uuid = str(uuid.uuid1())
            self.inserted_records.append({
                'our_varchar': "our_varchar 4",
                'our_varchar_10': "varchar_3",
                'our_text': "some text 4",
                'our_integer': 55200,
                'our_smallint': 1,
                'our_bigint': 100000,
                'our_decimal': decimal.Decimal('1234567899.99'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '0',
                'our_json': json.dumps('some string'),
                'our_jsonb': json.dumps(['burgers are good']),
                'our_uuid': my_uuid,
                'our_store': 'size=>"small",name=>"betty"',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
            })
            self.expected_records.append({
                'our_decimal': decimal.Decimal('1234567899.99'),
                'our_text': 'some text 4',
                'our_bit': False,
                'our_integer': 55200,
                'our_double': decimal.Decimal('1.1'),
                'id': 6,
                'our_json': self.inserted_records[-1]['our_json'],
                'our_boolean': True,
                'our_jsonb': self.inserted_records[-1]['our_jsonb'],
                'our_bigint': 100000,
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'our_store': {
                    "name": "betty",
                    "size": "small"
                },
                'our_smallint': 1,
                'OUR DATE': '1999-09-09T00:00:00+00:00',
                'our_varchar': 'our_varchar 4',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_real': decimal.Decimal('1.2'),
                'our_varchar_10': 'varchar_3',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
                'our_alignment_enum': None,
            })

            db_utils.insert_record(cur, test_table_name,
                                   self.inserted_records[3])
            db_utils.insert_record(cur, test_table_name,
                                   self.inserted_records[4])
            db_utils.insert_record(cur, test_table_name,
                                   self.inserted_records[5])

            # updating ...
            # an existing record
            canon_table_name = db_utils.canonicalized_table_name(
                cur, test_schema_name, test_table_name)
            record_pk = 1
            our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            updated_data = {
                "OUR TS TZ": our_ts_tz,
                "our_double": decimal.Decimal("6.6"),
                "our_money": "$0.00"
            }
            # keep the expected record in step with the db update
            self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(
                our_ts_tz)
            self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
            self.expected_records[0]["our_money"] = "$0.00"
            db_utils.update_record(cur, canon_table_name, record_pk,
                                   updated_data)

            # a newly inserted record
            canon_table_name = db_utils.canonicalized_table_name(
                cur, test_schema_name, test_table_name)
            record_pk = 5
            our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            updated_data = {
                "OUR TS TZ": our_ts_tz,
                "our_double": decimal.Decimal("6.6"),
                "our_money": "$0.00"
            }
            self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz(
                our_ts_tz)
            self.expected_records[4]["our_double"] = decimal.Decimal("6.6")
            self.expected_records[4]["our_money"] = "$0.00"
            db_utils.update_record(cur, canon_table_name, record_pk,
                                   updated_data)

            # deleting
            # an existing record
            record_pk = 2
            db_utils.delete_record(cur, canon_table_name, record_pk)

            # a newly inserted record
            record_pk = 6
            db_utils.delete_record(cur, canon_table_name, record_pk)

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN after various manipulations
    #----------------------------------------------------------------------

    # run sync job 3 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_3 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    # (deleted rows are simply absent under full-table replication)
    self.assertEqual(4, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(5, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the new table version increased on the second sync
    self.assertGreater(table_version_3, table_version_2)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # NB | This is a little tough to track mentally so here's a breakdown of
    #      the order of operations by expected records indexes:
    #      Prior to Sync 1
    #        insert 0, 1, 2
    #      Prior to Sync 2
    #        No db changes
    #      Prior to Sync 3
    #        insert 3, 4, 5
    #        update 0, 4
    #        delete 1, 5
    #      Resulting Synced Records: 2, 3, 0, 4

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[2],
                         messages[0]['data'])  # existing insert
    self.assertDictEqual(self.expected_records[3],
                         messages[1]['data'])  # new insert
    self.assertDictEqual(self.expected_records[0],
                         messages[2]['data'])  # existing update
    self.assertDictEqual(self.expected_records[4],
                         messages[3]['data'])  # new insert / update

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_3, bookmark['version'])
def binlog_json_test(self):
    """LOG_BASED (binlog) replication test for a MySQL table with a JSON column.

    Flow: discovery -> select stream as LOG_BASED -> initial full-table sync
    (1 seeded record) -> empty binlog sync -> insert one very large JSON row
    -> binlog sync again, verifying the upsert arrives, the table version is
    stable, and the log_file/log_pos bookmark advances (allowing for binlog
    file rotation).
    """
    print("RUNNING {}\n\n".format(self.name()))

    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    expected_check_streams = {self.tap_stream_id()}
    expected_sync_streams = {self.table_name()}
    expected_pks = {self.table_name(): {'id'}}

    # verify the tap discovered the right streams
    found_catalogs = [
        catalog for catalog in menagerie.get_catalogs(conn_id)
        if catalog['tap_stream_id'] in expected_check_streams
    ]
    self.assertGreaterEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = expected_check_streams.symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(self.table_name(), test_catalog['stream_name'])
    print("discovered streams are correct")

    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'LOG_BASED'
        }
    }]
    selected_metadata = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog,
        menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
        additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run initial full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    self.maxDiff = None
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)
    self.assertEqual(record_count_by_stream, {self.table_name(): 1})

    records_for_stream = runner.get_records_from_target_output()[
        self.table_name()]
    messages_for_stream = records_for_stream['messages']
    message_actions = [rec['action'] for rec in messages_for_stream]
    self.assertEqual(message_actions,
                     ['activate_version', 'upsert', 'activate_version'])

    # ensure some log_file and log_pos state was persisted
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]
    self.assertIsNotNone(bookmark['log_file'])
    self.assertIsNotNone(bookmark['log_pos'])
    expected_log_file = bookmark['log_file']
    expected_log_pos = bookmark['log_pos']

    # grab version, log_file and log_pos from state to check later
    expected_table_version = records_for_stream['table_version']
    self.assertEqual(expected_table_version, bookmark['version'])

    # check for expected records
    upsert_records = [
        m['data'] for m in messages_for_stream if m['action'] == 'upsert'
    ]
    self.assertEqual([expected_rec_1], upsert_records)

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version is unchanged after the binlog sync
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]
    self.assertEqual(expected_table_version, bookmark['version'])

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    # record count should be empty as we did not persist anything to the gate
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)
    self.assertEqual(record_count_by_stream, {})

    # insert a new huge row (2560 keys so the JSON value exceeds typical
    # packet/column thresholds)
    data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)],
                literal=True)
    rec = {'id': 2, 'our_json': json.dumps(data)}

    with db_utils.get_db_connection(
            self.get_properties(), self.get_credentials()).cursor() as cur:
        self.insert_record(cur, rec)

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version from state is unchanged
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]
    self.assertEqual(expected_table_version, bookmark['version'])

    # Either the log_file is the same but the log_pos has increased or the log_file
    # has rotated and the numeric suffix has increased
    if expected_log_file == bookmark['log_file']:
        self.assertGreater(bookmark['log_pos'], expected_log_pos)
    else:
        expected_log_file_suffix = re.search('^.*\.(\d+)$',
                                             expected_log_file).groups()[0]
        updated_log_file_suffix = re.search(
            '^.*\.(\d+)$', bookmark['log_file']).groups()[0]
        self.assertGreater(int(updated_log_file_suffix),
                           int(expected_log_file_suffix))

    expected_log_file = bookmark['log_file']
    expected_log_pos = bookmark['log_pos']

    expected_rec_2 = copy.deepcopy(rec)

    # check for expected records
    records_for_stream = runner.get_records_from_target_output()[
        self.table_name()]
    messages_for_stream = records_for_stream['messages']
    message_actions = [rec['action'] for rec in messages_for_stream]
    self.assertEqual(message_actions, ['upsert'])

    upsert_records = [
        m['data'] for m in messages_for_stream if m['action'] == 'upsert'
    ]
    # _sdc_deleted_at is added by the tap for binlog records; drop it before
    # comparing against the raw inserted row
    del upsert_records[0]['_sdc_deleted_at']

    expected_json = json.loads(expected_rec_2.get('our_json', {}))
    actual_json = json.loads(upsert_records[0].get('our_json', {}))
    self.assertTrue(len(actual_json.keys()) > 0)
    self.assertEqual(expected_json, actual_json)
def test_run(self):
    """FULL_TABLE replication test for a Postgres view (`chicken_view`).

    Verifies discovery metadata for the view (is-view, no table key
    properties, per-column sql datatypes), selects the view with an explicit
    `view-key-properties` of ['id'], syncs, and checks the single replicated
    record, message actions, schema, and the bookmark version.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    chicken_catalog = found_catalogs[0]
    self.assertEqual('chicken_view', chicken_catalog['stream_name'])
    print("discovered streams are correct")

    print('checking discoverd metadata for ROOT-CHICKEN_VIEW')
    md = menagerie.get_annotated_schema(
        conn_id, chicken_catalog['stream_id'])['metadata']

    # views are discovered with no table-key-properties; every column is
    # 'available' (not automatic) and selected-by-default
    self.assertEqual(
        {
            (): {
                'database-name': 'postgres',
                'is-view': True,
                'row-count': 0,
                'schema-name': 'public',
                'table-key-properties': []
            },
            ('properties', 'fk_id'): {
                'inclusion': 'available',
                'sql-datatype': 'bigint',
                'selected-by-default': True
            },
            ('properties', 'name'): {
                'inclusion': 'available',
                'sql-datatype': 'character varying',
                'selected-by-default': True
            },
            ('properties', 'age'): {
                'inclusion': 'available',
                'sql-datatype': 'integer',
                'selected-by-default': True
            },
            ('properties', 'size'): {
                'inclusion': 'available',
                'sql-datatype': 'character varying',
                'selected-by-default': True
            },
            ('properties', 'id'): {
                'inclusion': 'available',
                'sql-datatype': 'integer',
                'selected-by-default': True
            }
        }, metadata.to_map(md))

    # 'ID' selected as view-key-properties
    replication_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-key': None,
            "replication-method": "FULL_TABLE",
            'view-key-properties': ["id"]
        }
    }]
    connections.select_catalog_and_fields_via_metadata(
        conn_id, chicken_catalog,
        menagerie.get_annotated_schema(conn_id,
                                       chicken_catalog['stream_id']),
        replication_md)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {'chicken_view': 1})

    records_by_stream = runner.get_records_from_target_output()
    table_version = records_by_stream['chicken_view']['table_version']

    # full-table pattern: activate_version fences the upsert on both sides
    self.assertEqual(
        records_by_stream['chicken_view']['messages'][0]['action'],
        'activate_version')
    self.assertEqual(
        records_by_stream['chicken_view']['messages'][1]['action'],
        'upsert')
    self.assertEqual(
        records_by_stream['chicken_view']['messages'][2]['action'],
        'activate_version')

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    actual_chicken_record = records_by_stream['chicken_view']['messages'][
        1]['data']
    expected_chicken_record = {
        'id': 1,
        'fk_id': 1,
        'name': 'fred',
        'age': 99,
        'size': 'big'
    }
    self.assertEqual(
        actual_chicken_record,
        expected_chicken_record,
        msg=
        "Expected `various_types` upsert record data to be {}, but target output {}"
        .format(expected_chicken_record, actual_chicken_record))

    print("records are correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)
    chicken_bookmark = state['bookmarks']['postgres-public-chicken_view']
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertEqual(
        chicken_bookmark['version'],
        table_version,
        msg="expected bookmark for stream ROOT-CHICKEN to match version")
def test_run(self):
    """End-to-end check + sync test for the MySQL tap.

    Verifies discovery output (catalogs and per-field metadata), selects
    streams with per-stream replication metadata, runs an initial sync and
    checks record counts/contents/state, then alters the source table
    (adds a column, inserts a row) and verifies a second sync picks up the
    schema change while the incremental table's version is preserved.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    for c in found_catalogs:
        catalog_props_to_check = ['stream_name', 'tap_stream_id']
        stream = c['stream_name']
        for prop in catalog_props_to_check:
            self.assertEqual(
                c[prop],
                expected_catalogs[stream][prop],
                msg=
                "unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`"
                .format(prop, stream, expected_catalogs[stream][prop],
                        c[prop]))

    print("discovered streams are correct")

    print('checking discoverd metadata for tap_tester_mysql_0-incremental')
    incremental_catalog = [
        c for c in found_catalogs
        if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental'
    ][0]

    # full metadata comparison: one stream-level breadcrumb plus one
    # breadcrumb per column, sorted so ordering is deterministic
    md = menagerie.get_annotated_schema(
        conn_id, incremental_catalog['stream_id'])['metadata']
    incremental_stream_metadata = {
        'database-name': 'tap_tester_mysql_0',
        'row-count': 3,
        'is-view': False,
        'selected-by-default': False,
        'table-key-properties': ['c_pk']
    }
    self.assertEqual(
        sorted(md, key=lambda x: x['breadcrumb']),
        [{
            'breadcrumb': [],
            'metadata': incremental_stream_metadata
        }, {
            'breadcrumb': ['properties', 'c_dt'],
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'datetime'
            }
        }, {
            'breadcrumb': ['properties', 'c_pk'],
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'int(11)'
            }
        }, {
            'breadcrumb': ['properties', 'c_varchar'],
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'varchar(255)'
            }
        }, {
            'breadcrumb': ['properties', 'c_varchar_to_deselect'],
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'varchar(255)'
            }
        }])

    print('checking discovered metadata for tap_tester_mysql_1-view')
    view_catalog = [
        c for c in found_catalogs
        if c['tap_stream_id'] == 'tap_tester_mysql_1-view'
    ][0]
    view_catalog_key_properties_md = [{
        'breadcrumb': [],
        'metadata': {
            'view-key-properties': ['c_pk']
        }
    }]

    # views have no discoverable key properties, so set them manually
    connections.set_non_discoverable_metadata(
        conn_id, view_catalog,
        menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']),
        view_catalog_key_properties_md)
    md = menagerie.get_annotated_schema(
        conn_id, view_catalog['stream_id'])['metadata']
    view_stream_metadata = {
        'database-name': 'tap_tester_mysql_1',
        'is-view': True,
        'selected-by-default': False,
        'view-key-properties': ['c_pk']
    }
    self.assertEqual(sorted(md, key=lambda x: x['breadcrumb']), [{
        'breadcrumb': [],
        'metadata': view_stream_metadata
    }, {
        'breadcrumb': ['properties', 'c_pk'],
        'metadata': {
            'selected-by-default': True,
            'sql-datatype': 'int(11)'
        }
    }, {
        'breadcrumb': ['properties', 'c_varchar'],
        'metadata': {
            'selected-by-default': True,
            'sql-datatype': 'varchar(255)'
        }
    }])

    #No selected-by-default MD for c_year because it is an unsupported type
    various_types_catalog = [
        c for c in found_catalogs
        if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types'
    ][0]
    md = menagerie.get_annotated_schema(
        conn_id, various_types_catalog['stream_id'])['metadata']
    c_year_md = [
        x for x in md if x['breadcrumb'] == ['properties', 'c_year']
    ]
    self.assertEqual(c_year_md, [{
        'breadcrumb': ['properties', 'c_year'],
        'metadata': {
            'selected-by-default': False,
            'sql-datatype': 'year(4)'
        }
    }])

    ##select_simple_example
    catalogs_to_select = [
        c for c in found_catalogs
        if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example'
    ]

    # select each stream with replication metadata appropriate to its type;
    # one varchar field is deselected on the incremental stream so we can
    # later assert it is absent from the output
    for a_catalog in catalogs_to_select:
        additional_md = []
        unselected_fields = []
        if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental':
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-key': 'c_dt',
                    'replication-method': 'INCREMENTAL'
                }
            }]
            unselected_fields = ['c_varchar_to_deselect']
        elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view':
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'view-key-properties': ['c_pk'],
                    'replication-method': 'FULL_TABLE'
                }
            }]
        else:
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'FULL_TABLE'
                }
            }]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, a_catalog,
            menagerie.get_annotated_schema(conn_id, a_catalog['stream_id']),
            additional_md, unselected_fields)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    expected_row_count = 8  # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1}
    self.assertEqual(
        replicated_row_count,
        expected_row_count,
        msg="failed to replicate correct number of rows: {}".format(
            record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    records_by_stream = runner.get_records_from_target_output()

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify that activate version messages were sent in the proper position
        self.assertEqual(
            recs['messages'][0]['action'],
            'activate_version',
            msg=
            "Expected first message sent for stream `{}` to have action `activate_version`"
            .format(stream))

        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    # verify that the target output the proper numeric and date representations
    expected_various_types_records = [{
        'c_time': '1970-01-01T12:34:56.000000Z',
        'c_mediumint': 8388607,
        'c_smallint': 32767,
        'c_tinyint': 127,
        'c_date': '2017-09-13T00:00:00.000000Z',
        'c_bigint': 9223372036854775807,
        'c_decimal': -1,
        'c_int': 2147483647,
        'c_bit': True,
        'c_decimal_2': Decimal('123456789.0'),
        'c_pk': 1,
        'c_double': Decimal("1.234"),
        'c_float': Decimal("1.234"),
        'c_decimal_2_unsigned': Decimal("1.23"),
        'c_tinyint_1': True
    }, {
        'c_time': '1970-01-01T12:34:57.000000Z',
        'c_mediumint': -8388608,
        'c_smallint': -32768,
        'c_tinyint': -128,
        'c_date': '2017-09-14T00:00:00.000000Z',
        'c_bigint': -9223372036854775808,
        'c_decimal': 0,
        'c_int': -2147483648,
        'c_bit': False,
        'c_decimal_2': Decimal("123456790.0"),
        'c_pk': 2,
        'c_double': Decimal("2.234"),
        'c_float': Decimal("2.234"),
        'c_decimal_2_unsigned': Decimal("0.23"),
        'c_tinyint_1': False
    }, {
        'c_time': '1970-01-01T12:34:57.000000Z',
        'c_mediumint': -8388608,
        'c_smallint': -32768,
        'c_tinyint': -128,
        'c_date': '2017-09-14T00:00:00.000000Z',
        'c_bigint': -9223372036854775808,
        'c_decimal': 0,
        'c_int': -2147483648,
        'c_bit': None,
        'c_decimal_2': Decimal("123456790.0"),
        'c_pk': 3,
        'c_double': Decimal("2.234"),
        'c_float': Decimal("2.234"),
        'c_decimal_2_unsigned': Decimal("0.23"),
        'c_tinyint_1': None
    }]

    # messages[0] is activate_version, so the upserts are messages 1-3
    actual_various_types_records = [
        r['data']
        for r in records_by_stream['various_types']['messages'][1:4]
    ]

    self.assertEqual(
        actual_various_types_records,
        expected_various_types_records,
        msg=
        "Expected `various_types` upsert record data to be {}, but target output {}"
        .format(expected_various_types_records,
                actual_various_types_records))

    # verify that deselected property was not output
    expected_incremental_record = {
        'c_pk': 1,
        'c_dt': '2017-01-01T00:00:00.000000Z',
        'c_varchar': 'a'
    }

    actual_incremental_record = records_by_stream['incremental'][
        'messages'][1]['data']

    self.assertEqual(
        actual_incremental_record,
        expected_incremental_record,
        msg=
        "Expected first `incremental` upsert record data to be {}, but target output {}"
        .format(expected_incremental_record, actual_incremental_record))

    print("records are correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)
    bookmarks = state['bookmarks']

    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")

    for k, v in bookmarks.items():
        if k == 'tap_tester_mysql_0-incremental':
            # incremental streams carry a version plus replication-key info
            self.assertIsNotNone(
                v['version'],
                msg="expected bookmark for stream `{}` to have a version set"
                .format(k))
            self.assertEqual(
                v['replication_key_value'],
                '2017-01-01T00:00:02.000000Z',
                msg=
                "incorrect replication_key_value in bookmark for stream `{}`"
                .format(k))
            self.assertEqual(
                v['replication_key'],
                'c_dt',
                msg=
                "incorrect replication_key specified in bookmark for stream `{}`"
                .format(k))
        else:
            # full-table streams only track full-table completion
            self.assertFalse(
                'version' in v,
                msg=
                "expected bookmark for stream `{}` to not have a version key"
                .format(k))
            self.assertTrue(
                'initial_full_table_complete' in v,
                msg=
                "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                .format(k))

    print("state and bookmarks are correct")

    # saved so we can verify the version survives the second sync below
    incremental_table_initial_table_version = bookmarks[
        'tap_tester_mysql_0-incremental']['version']

    #----------------------------------------------------------------------
    # invoke the sync job again after some modifications
    #----------------------------------------------------------------------

    print("adding a column to an existing table in the source db")
    connection = db_utils.get_db_connection(self.get_properties(),
                                            self.get_credentials())
    with connection.cursor() as cursor:
        add_column_sql = '''
            ALTER TABLE tap_tester_mysql_0.incremental
              ADD COLUMN favorite_number INTEGER;
            INSERT INTO tap_tester_mysql_0.incremental VALUES (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999);
        '''
        cursor.execute(add_column_sql)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    expected_row_count = 7  # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1}
    self.assertEqual(
        replicated_row_count,
        expected_row_count,
        msg="failed to replicate correct number of rows: {}".format(
            record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    records_by_stream = runner.get_records_from_target_output()

    # NOTE(review): this value is shadowed by the re-assignment inside the
    # loop below (which omits the 'selected' key) before first use
    expected_schema_of_new_column = {
        'maximum': 2147483647,
        'selected': True,
        'inclusion': 'available',
        'type': ['null', 'integer'],
        'minimum': -2147483648
    }

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify that a activate version messages were sent in the proper position
        if stream == 'incremental':
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version',
                msg=
                "Expected first message sent for stream `{}` not to have action `activate_version`"
                .format(stream))
            expected_schema_of_new_column = {
                'maximum': 2147483647,
                'inclusion': 'available',
                'type': ['null', 'integer'],
                'minimum': -2147483648
            }
            self.assertEqual(
                records_by_stream[stream]['schema']['properties']
                ['favorite_number'],
                expected_schema_of_new_column,
                msg=
                "Expected newly-added column to be present in schema for stream `{}`, but it was not."
                .format(stream))
        else:
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'upsert',
                msg=
                "Expected first message sent for stream `{}` to have action `upsert`"
                .format(stream))
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version',
                msg=
                "Expected last message sent for stream `{}` to have action `activate_version`"
                .format(stream))

    state = menagerie.get_state(conn_id)
    bookmarks = state['bookmarks']
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")

    for k, v in bookmarks.items():
        if k == 'tap_tester_mysql_0-incremental':
            self.assertIsNotNone(
                v['version'],
                msg="expected bookmark for stream `{}` to have a version set"
                .format(k))
            # bookmark advanced to the timestamp of the row inserted above
            self.assertEqual(
                v['replication_key_value'],
                '2017-01-01T00:00:03.000000Z',
                msg=
                "incorrect replication_key_value in bookmark for stream `{}`"
                .format(k))
            self.assertEqual(
                v['replication_key'],
                'c_dt',
                msg=
                "incorrect replication_key specified in bookmark for stream `{}`"
                .format(k))
        else:
            self.assertFalse(
                'version' in v,
                msg=
                "expected bookmark for stream `{}` to not have a version key"
                .format(k))
            self.assertTrue(
                'initial_full_table_complete' in v,
                msg=
                "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                .format(k))

    print("state and bookmarks are correct")

    # verify incremental table_version didn't change
    incremental_table_new_table_version = bookmarks[
        'tap_tester_mysql_0-incremental']['version']

    self.assertEqual(
        incremental_table_initial_table_version,
        incremental_table_new_table_version,
        msg=
        "Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations."
    )
def test_run(self):
    """Logical (LOG_BASED) replication test for the Postgres tap.

    Discovers the cows and chickens test tables, selects them with
    LOG_BASED replication, runs an initial sync and validates records,
    state, lsn, and table versions; then inserts two more rows into each
    table and verifies the follow-up sync replicates exactly those rows,
    that the lsn bookmarks advance, and that table versions are unchanged.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(
        len(found_catalogs),
        2,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog_cows = list(
        filter(
            lambda c: c['stream_name'
                        ] == 'postgres_logical_replication_test_cows',
            found_catalogs))[0]
    self.assertEqual('postgres_logical_replication_test_cows',
                     test_catalog_cows['stream_name'])
    test_catalog_chickens = list(
        filter(
            lambda c: c['stream_name'
                        ] == 'postgres_logical_replication_test_chickens',
            found_catalogs))[0]
    self.assertEqual('postgres_logical_replication_test_chickens',
                     test_catalog_chickens['stream_name'])
    print("discovered streams are correct")

    # select both streams with log-based replication
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'LOG_BASED'
        }
    }]
    connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog_cows,
        menagerie.get_annotated_schema(conn_id,
                                       test_catalog_cows['stream_id']),
        additional_md)
    connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog_chickens,
        menagerie.get_annotated_schema(conn_id,
                                       test_catalog_chickens['stream_id']),
        additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(
        record_count_by_stream, {
            'postgres_logical_replication_test_cows': 1,
            'postgres_logical_replication_test_chickens': 1
        })
    records_by_stream = runner.get_records_from_target_output()

    # initial sync wraps the single upsert in activate_version messages
    table_version_cows = records_by_stream[
        'postgres_logical_replication_test_cows']['table_version']
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_cows']
        ['messages'][0]['action'], 'activate_version')
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_cows']
        ['messages'][1]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_cows']
        ['messages'][2]['action'], 'activate_version')

    table_version_chickens = records_by_stream[
        'postgres_logical_replication_test_chickens']['table_version']
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_chickens']
        ['messages'][0]['action'], 'activate_version')
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_chickens']
        ['messages'][1]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_chickens']
        ['messages'][2]['action'], 'activate_version')

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)

    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    bookmark_cows = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_cows']
    self.assertIsNotNone(bookmark_cows['lsn'],
                         msg="expected bookmark for stream to have an lsn")
    lsn_cows_1 = bookmark_cows['lsn']
    self.assertEqual(bookmark_cows['version'],
                     table_version_cows,
                     msg="expected bookmark for stream to match version")

    bookmark_chickens = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_chickens']
    self.assertIsNotNone(bookmark_chickens['lsn'],
                         msg="expected bookmark for stream to have an lsn")
    lsn_chickens_1 = bookmark_chickens['lsn']
    self.assertEqual(bookmark_chickens['version'],
                     table_version_chickens,
                     msg="expected bookmark for stream to match version")

    #----------------------------------------------------------------------
    # invoke the sync job again after adding records
    #----------------------------------------------------------------------
    print("inserting 2 more cows and 2 more chickens")

    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            # insert another cow
            self.cows_rec_2 = {'cow_name': "betty cow", 'cow_age': 21}
            insert_record(cur, test_table_name_cows, self.cows_rec_2)
            # update that cow's expected values
            self.cows_rec_2['id'] = 2
            self.cows_rec_2['_sdc_deleted_at'] = None

            # insert another chicken
            self.chicken_rec_2 = {
                'chicken_name': "burt chicken",
                'chicken_age': 14
            }
            insert_record(cur, test_table_name_chickens,
                          self.chicken_rec_2)
            # update that cow's expected values
            self.chicken_rec_2['id'] = 2
            self.chicken_rec_2['_sdc_deleted_at'] = None

            # and repeat...

            self.cows_rec_3 = {'cow_name': "cindy cow", 'cow_age': 10}
            insert_record(cur, test_table_name_cows, self.cows_rec_3)
            self.cows_rec_3['id'] = 3
            self.cows_rec_3['_sdc_deleted_at'] = None

            self.chicken_rec_3 = {
                'chicken_name': "carl chicken",
                'chicken_age': 4
            }
            insert_record(cur, test_table_name_chickens,
                          self.chicken_rec_3)
            self.chicken_rec_3['id'] = 3
            self.chicken_rec_3['_sdc_deleted_at'] = None

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(
        record_count_by_stream, {
            'postgres_logical_replication_test_cows': 2,
            'postgres_logical_replication_test_chickens': 2
        })
    records_by_stream = runner.get_records_from_target_output()
    chicken_messages = records_by_stream[
        "postgres_logical_replication_test_chickens"]['messages']
    cow_messages = records_by_stream[
        "postgres_logical_replication_test_cows"]['messages']

    # only the newly inserted rows should appear, in insertion order
    self.assertDictEqual(self.cows_rec_2, cow_messages[0]['data'])
    self.assertDictEqual(self.chicken_rec_2, chicken_messages[0]['data'])
    self.assertDictEqual(self.cows_rec_3, cow_messages[1]['data'])
    self.assertDictEqual(self.chicken_rec_3, chicken_messages[1]['data'])

    print("inserted record is correct")

    state = menagerie.get_state(conn_id)
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    cows_bookmark = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_cows']
    self.assertIsNotNone(
        cows_bookmark['lsn'],
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
    )
    lsn_cows_2 = cows_bookmark['lsn']
    # lsn must move forward (or stay) after consuming new WAL entries
    self.assertTrue(lsn_cows_2 >= lsn_cows_1)

    chickens_bookmark = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_chickens']
    self.assertIsNotNone(
        chickens_bookmark['lsn'],
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
    )
    lsn_chickens_2 = chickens_bookmark['lsn']
    self.assertTrue(lsn_chickens_2 >= lsn_chickens_1)

    #table_version does NOT change
    self.assertEqual(
        chickens_bookmark['version'],
        table_version_chickens,
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to match version"
    )
    #table_version does NOT change
    self.assertEqual(
        cows_bookmark['version'],
        table_version_cows,
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to match version"
    )
def binlog_edge_test(self, expected_records=None):
    """
    Test binlog replication edge cases
    • Verify an initial sync returns expected records of various datatypes
    • Verify we bookmark correctly when a transaction spans multiple files
    • Insert and delete a record prior to sync. Verify both events are replicated
    • Insert and update a record prior to sync. Verify both events are replicated
    • Verify a valid log_file and log_pos state are persisted after each sync
    """
    # FIX: the signature previously used a mutable default (`expected_records=[]`).
    # The list is mutated below (`+=` and item assignment), so the default would
    # accumulate records across calls. A None sentinel preserves the interface.
    if expected_records is None:
        expected_records = []

    conn_id = connections.ensure_connection(self)

    # prior to first sync update a record...
    updated_timestamp = datetime.datetime.now()
    updated_id = 1
    expected_records[1]['our_timestamp_2'] = datetime.datetime.strftime(
        updated_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

    # insert a record and...
    inserted_record = self.generate_record_n(len(expected_records))
    expected_records += [inserted_record]  # TODO need to format

    # delete a record
    deleted_id = 2

    with db_utils.get_db_connection(
            self.get_properties(), self.get_credentials()).cursor() as cur:
        cur.execute(
            "UPDATE {}.{} SET our_timestamp_2 = '{}' WHERE id = {}".format(
                self.database_name(), self.table_name_1(),
                updated_timestamp, updated_id))
        self.insert_record(cur, inserted_record, self.table_name_1())
        delete_time = datetime.datetime.now()
        cur.execute("DELETE FROM {}.{} WHERE id = {}".format(
            self.database_name(), self.table_name_1(), deleted_id))

    print(
        "\n\nMySQL DB Actions." + \
        "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
        "\nEVENTS: {} records updated".format(1) + \
        "\n {} records deleted\n\n".format(1)
    )

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    t1 = self.table_name_1()
    t2 = self.table_name_2()
    expected_check_streams = {
        self.tap_stream_id(t1),
        self.tap_stream_id(t2)
    }
    expected_sync_streams = {t1, t2}
    expected_pks = {t1: {'id'}, t2: {'id'}}

    # verify the tap discovered the right streams
    found_catalogs = [
        catalog for catalog in menagerie.get_catalogs(conn_id)
        if catalog['tap_stream_id'] in expected_check_streams
    ]

    self.assertGreaterEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = expected_check_streams.symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    self.assertEqual(self.table_name_1(), found_catalogs[0]['stream_name'])
    self.assertEqual(self.table_name_2(), found_catalogs[1]['stream_name'])
    print("discovered streams are correct")

    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'LOG_BASED'
        }
    }]
    for catalog in found_catalogs:
        schema = menagerie.get_annotated_schema(conn_id,
                                                catalog['stream_id'])
        # FIX: previously `catalog` was passed twice and the annotated
        # `schema` fetched above was never used; the third argument of
        # select_catalog_and_fields_via_metadata is the annotated schema
        # (matching every other call site in this file).
        _ = connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run initial full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    self.maxDiff = None
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)

    # BUG missing deleted record | https://stitchdata.atlassian.net/browse/SRCE-4258
    # self.assertEqual({self.table_name_1(): len(expected_records)}, record_count_by_stream)

    records_for_stream = runner.get_records_from_target_output()[
        self.table_name_1()]
    messages_for_stream = records_for_stream['messages']
    message_actions = [rec['action'] for rec in messages_for_stream]

    # verify activate version messages are present
    self.assertEqual('activate_version', message_actions[0])
    self.assertEqual('activate_version', message_actions[-1])

    # ensure some log_file and log_pos state was persisted
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id(t1)]
    self.assertIsNotNone(bookmark['log_file'])
    self.assertIsNotNone(bookmark['log_pos'])

    expected_log_file = bookmark['log_file']
    expected_log_pos = bookmark['log_pos']

    # grab version, log_file and log_pos from state to check later
    expected_table_version = records_for_stream['table_version']
    self.assertEqual(expected_table_version, bookmark['version'])

    # check for expected records
    upsert_records = [
        m['data'] for m in messages_for_stream if m['action'] == 'upsert'
    ]

    # we need to compare record by record since there are so many.
    # a failure comparing expected_records to upsert_records would result in
    # an output message greater in length than a standard tmux buffer
    # BUG missing datetime precision | https://stitchdata.atlassian.net/browse/SRCE-4257
    # for expected_record in expected_records:
    #     upsert_record = [rec for rec in upsert_records
    #                      if rec['id'] == expected_record['id']]
    #     self.assertEqual(1, len(upsert_record),
    #                      msg="multiple upsert_recs with same pk: {}".format(upsert_record))
    #     self.assertEqual(expected_record, upsert_record.pop())

    # TODO add check for _sdc_delete_at for deleted record once bug addressed

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id(t1)]
    self.assertEqual(expected_table_version, bookmark['version'])

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    # record count should be empty as we did not persist anything to the gate
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)

    self.assertEqual(record_count_by_stream, {})

    # Create 1 more record prior to 2nd sync
    new_record = self.generate_record_n(len(expected_records))
    with db_utils.get_db_connection(
            self.get_properties(), self.get_credentials()).cursor() as cur:
        self.insert_record(cur, new_record, self.table_name_1())

    print(
        "\n\nMySQL DB Actions." + \
        "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
        "\nEVENTS: {} records inserted".format(1)
    )

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version from state is unchanged
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id(t1)]
    self.assertEqual(expected_table_version, bookmark['version'])

    # Either the log_file is the same but the log_pos has increased or the log_file
    # has rotated and the numeric suffix has increased
    if expected_log_file == bookmark['log_file']:
        print("PATH A")
        self.assertGreater(bookmark['log_pos'], expected_log_pos)
    else:
        # raw strings used for the regex patterns (identical patterns,
        # avoids invalid-escape-sequence warnings)
        expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                             expected_log_file).groups()[0]
        updated_log_file_suffix = re.search(
            r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]
        print("PATH B")
        self.assertGreater(int(updated_log_file_suffix),
                           int(expected_log_file_suffix))

    # Execute delete across tables using join prior to 3rd sync
    deleted_id = 4
    with db_utils.get_db_connection(
            self.get_properties(), self.get_credentials()).cursor() as cur:
        delete_time = datetime.datetime.now()
        # DELETE T1, T2
        # FROM T1
        # INNER JOIN T2 ON T1.key = T2.key
        # WHERE condition;
        db = self.database_name()
        db_t1 = db + "." + t1
        db_t2 = db + "." + t2
        t1_key = db_t1 + ".id"
        t2_key = db_t2 + ".id"
        statement = "DELETE {}, {} ".format(db_t1, db_t2) + \
                    "FROM {} ".format(t1) + \
                    "INNER JOIN {} ON {} = {} ".format(db_t2, t1_key, t2_key) + \
                    "WHERE {} = {}".format(t1_key, deleted_id)
        cur.execute(statement)

    print(
        "\n\nMySQL DB Actions." + \
        "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_2()) + \
        "\nTABLE: {}".format(self.table_name_2()) + \
        "\nEVENTS: {} records deleted\n\n".format(1)
    )

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version from state is unchanged
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id(t1)]
    self.assertEqual(expected_table_version, bookmark['version'])

    target_records = runner.get_records_from_target_output()
    records_stream_1 = target_records[self.table_name_1()]
    upsert_records_1 = [
        m['data'] for m in records_stream_1['messages']
        if m['action'] == 'upsert'
    ]
    records_stream_2 = target_records[self.table_name_2()]
    upsert_records_2 = [
        m['data'] for m in records_stream_2['messages']
        if m['action'] == 'upsert'
    ]

    # make sure the record is in the target for both tables with a delete time
    deleted_at_t1 = upsert_records_1[0].get('_sdc_deleted_at')
    deleted_at_t1_timestamp = utils.strptime_to_utc(
        deleted_at_t1).timestamp()
    self.assertIsNotNone(deleted_at_t1)

    deleted_at_t2 = upsert_records_2[0].get('_sdc_deleted_at')
    deleted_at_t2_timestamp = utils.strptime_to_utc(
        deleted_at_t2).timestamp()
    self.assertIsNotNone(deleted_at_t2)

    # the delete times should be equal since it was a single transaction
    self.assertEqual(deleted_at_t1_timestamp, deleted_at_t2_timestamp)

    time_delta = delete_time.timestamp() - deleted_at_t1_timestamp
    print("Delete time vs record: difference in seconds", time_delta)
    self.assertLess(time_delta, 3)  # time delta less than 3 seconds in magnitude