Example #1
    def file_not_modified_test(self):

        # sync 1
        conn_id_1 = connections.ensure_connection(self)

        found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1)

        self.perform_and_verify_table_and_field_selection(conn_id_1, found_catalogs_1)

        record_count_by_stream1 = self.run_and_verify_sync(conn_id_1)

        self.assertGreater(sum(record_count_by_stream1.values()), 0)

        # changing start date to "utcnow"
        self.START_DATE = dt.strftime(dt.utcnow(), "%Y-%m-%dT00:00:00Z")

        # sync 2
        conn_id_2 = connections.ensure_connection(self, original_properties=False)

        found_catalogs_2 = self.run_and_verify_check_mode(conn_id_2)

        self.perform_and_verify_table_and_field_selection(conn_id_2, found_catalogs_2)

        # since no data was added, the file is not modified, so sync 2 should not
        # replicate any data and should raise the error: failed to replicate any data
        try:
            self.run_and_verify_sync(conn_id_2)
        except AssertionError as e:
            self.assertRegex(str(e), r'failed to replicate any data')
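
The examples on this page lean on base-class helpers such as run_and_verify_check_mode, perform_and_verify_table_and_field_selection and run_and_verify_sync, which live in each tap's own base.py and are not shown here. As a rough, hypothetical sketch (not the actual base.py of any of these taps), run_and_verify_sync typically wraps the runner/menagerie calls that later examples use directly, which is also why Example #1 can catch an AssertionError containing "failed to replicate any data" when the second sync returns nothing:

    def run_and_verify_sync(self, conn_id):
        # Hypothetical sketch -- the real helper in each tap's base.py may differ.
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # count the records written to the target, keyed by stream
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_pks())
        self.assertGreater(
            sum(record_count_by_stream.values()), 0,
            msg="failed to replicate any data: {}".format(record_count_by_stream))
        return record_count_by_stream
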
Example #2
    def test_run(self):
        """Parametrized automatic fields test running against each replication method."""

        # Test running a sync with no fields selected using full-table replication
        self.default_replication_method = self.FULL_TABLE
        full_table_conn_id = connections.ensure_connection(self)
        self.automatic_fields_test(full_table_conn_id)

        # NB | We expect primary keys and replication keys to have inclusion automatic for
        #      key-based incremental replication. But that is only true for primary keys.
        #      As a result we cannot run a sync with no fields selected. This BUG should not
        #      be carried over into hp-postgres, but will not be fixed for this tap.

        # Test running a sync with no fields selected using key-based incremental replication
        # self.default_replication_method = self.INCREMENTAL
        # incremental_conn_id = connections.ensure_connection(self, original_properties=False)
        # self.automatic_fields_test(incremental_conn_id)

        # Test running a sync with no fields selected using logical replication
        self.default_replication_method = self.LOG_BASED
        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                db_utils.ensure_replication_slot(cur, test_db)
        log_based_conn_id = connections.ensure_connection(
            self, original_properties=False)
        self.automatic_fields_test(log_based_conn_id)
Example #3
    def file_modified_test(self):

        # sync 1
        conn_id_1 = connections.ensure_connection(self)

        found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1)

        self.perform_and_verify_table_and_field_selection(conn_id_1, found_catalogs_1)

        record_count_by_stream_1 = self.run_and_verify_sync(conn_id_1)
        synced_records_1 = runner.get_records_from_target_output()

        # checking if we got any records
        self.assertGreater(sum(record_count_by_stream_1.values()), 0)

        # changing start date to "utcnow"
        self.START_DATE = dt.strftime(dt.utcnow(), "%Y-%m-%dT00:00:00Z")

        # adding some data to the file
        self.append_to_files()

        # sync 2
        conn_id_2 = connections.ensure_connection(self, original_properties=False)

        found_catalogs_2 = self.run_and_verify_check_mode(conn_id_2)

        self.perform_and_verify_table_and_field_selection(conn_id_2, found_catalogs_2)

        record_count_by_stream_2 = self.run_and_verify_sync(conn_id_2)
        synced_records_2 = runner.get_records_from_target_output()

        # checking if we got any data
        self.assertGreater(sum(record_count_by_stream_2.values()), 0)

        # verifying if we got more data in sync 2 than sync 1
        self.assertGreater(sum(record_count_by_stream_2.values()), sum(record_count_by_stream_1.values()))

        for stream in self.expected_check_streams():
            expected_primary_keys = self.expected_pks()[stream]

            record_count_sync_1 = record_count_by_stream_1.get(stream, 0)
            record_count_sync_2 = record_count_by_stream_2.get(stream, 0)

            primary_keys_list_1 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys)
                                    for message in synced_records_1.get(stream).get('messages')
                                    if message.get('action') == 'upsert']
            primary_keys_list_2 = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys)
                                    for message in synced_records_2.get(stream).get('messages')
                                    if message.get('action') == 'upsert']

            primary_keys_sync_1 = set(primary_keys_list_1)
            primary_keys_sync_2 = set(primary_keys_list_2)

            # Verify the number of records replicated in sync 2 is greater than the number
            # of records replicated in sync 1 for stream
            self.assertGreater(record_count_sync_2, record_count_sync_1)

            # Verify the records replicated in sync 1 were also replicated in sync 2
            self.assertTrue(primary_keys_sync_1.issubset(primary_keys_sync_2))
Example #4
    def test_run(self):

        # sync 1
        conn_id_1 = connections.ensure_connection(self)

        found_catalogs_1 = self.run_and_verify_check_mode(conn_id_1)

        self.perform_and_verify_table_and_field_selection(conn_id_1, found_catalogs_1)

        record_count_by_stream_1 = self.run_and_verify_sync(conn_id_1)

        # checking if we got any data from sync 1
        self.assertGreater(sum(record_count_by_stream_1.values()), 0)

        # verify the record counts from sync 1 match the expected values
        for tap_stream_id in self.expected_first_sync_streams():
            self.assertEqual(self.expected_first_sync_row_counts()[tap_stream_id],
                             record_count_by_stream_1[tap_stream_id])

        # creating file "table_1_fileB"
        with self.get_test_connection() as client:
            root_dir = os.getenv('TAP_SFTP_ROOT_DIR')
            client.chdir(root_dir + '/tap_tester/folderA')

            file_group = self.get_files()[0]
            with client.open('table_1_fileB.csv', 'w') as f:
                writer = csv.writer(f)
                lines = [file_group['headers']] + file_group['generator'](file_group['num_rows'])
                writer.writerows(lines)

        # adding some data to file "table_1_fileA" and "table_3_fileA"
        self.append_to_files()

        # sync 2
        conn_id_2 = connections.ensure_connection(self)

        found_catalogs_2 = self.run_and_verify_check_mode(conn_id_2)

        self.perform_and_verify_table_and_field_selection(conn_id_2, found_catalogs_2)

        record_count_by_stream_2 = self.run_and_verify_sync(conn_id_2, second_sync=True)

        # checking if we got any data from sync 2
        self.assertGreater(sum(record_count_by_stream_2.values()), 0)

        # verify the record counts from sync 2 match the expected values
        # since the start date was not modified, we should receive all the data,
        # i.e. the rows present before appending plus the appended rows
        for tap_stream_id in self.expected_second_sync_streams():
            self.assertEqual(self.expected_second_sync_row_counts()[tap_stream_id],
                             record_count_by_stream_2[tap_stream_id])
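
Examples #3 and #4 both call self.append_to_files() without showing it. A hypothetical sketch, reusing the SFTP/csv pattern from the file-creation step above and the file names mentioned in the comments ("table_1_fileA", "table_3_fileA"); the real helper in this suite may differ:

    def append_to_files(self):
        # Hypothetical sketch -- assumes the same get_test_connection()/get_files()
        # helpers used in the example above.
        with self.get_test_connection() as client:
            root_dir = os.getenv('TAP_SFTP_ROOT_DIR')
            client.chdir(root_dir + '/tap_tester/folderA')

            file_group = self.get_files()[0]
            for file_name in ('table_1_fileA.csv', 'table_3_fileA.csv'):
                # append mode keeps the existing header and rows and adds new rows
                with client.open(file_name, 'a') as f:
                    writer = csv.writer(f)
                    writer.writerows(file_group['generator'](file_group['num_rows']))
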
Example #5
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")
Example #6
    def test_run(self):
        conn_id = connections.ensure_connection(self)
        runner.run_check_mode(self, conn_id)

        found_catalog = menagerie.get_catalog(conn_id)
        for catalog_entry in found_catalog['streams']:
            field_names_in_schema = set(
                [k for k in catalog_entry['schema']['properties'].keys()])
            field_names_in_breadcrumbs = set([
                x['breadcrumb'][1] for x in catalog_entry['metadata']
                if len(x['breadcrumb']) == 2
            ])
            self.assertEqual(field_names_in_schema, field_names_in_breadcrumbs)

            inclusions_set = set([
                (x['breadcrumb'][1], x['metadata']['inclusion'])
                for x in catalog_entry['metadata'] if len(x['breadcrumb']) == 2
            ])
            # Validate that all fields are in metadata
            self.assertEqual(len(inclusions_set), len(field_names_in_schema))
            self.assertEqual(set([i[0] for i in inclusions_set]),
                             field_names_in_schema)
            # Validate that all metadata['inclusion'] are 'available'
            unique_inclusions = set([i[1] for i in inclusions_set])
            self.assertTrue(
                len(unique_inclusions) == 1
                and 'available' in unique_inclusions)
Example #7
    def test_run(self):
        """
        Testing that sync creates the appropriate catalog with valid metadata.
        Verify that all fields and all streams have selected set to True in the metadata
        """
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # removed "ad_analytics_by_campaign" and "ad_analytics_by_creative" as
        # it makes lots of api calls so sync canary test for these streams is covered in the start date test
        expected_streams = self.expected_streams() - set(
            {"ad_analytics_by_campaign", "ad_analytics_by_creative"})
        test_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('stream_name') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id, test_catalogs)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        # check if all streams have collected records
        for stream in expected_streams:
            self.assertGreater(record_count_by_stream.get(stream, 0), 0)
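
perform_and_verify_table_and_field_selection is another base.py helper used throughout these examples. A minimal sketch of the selection half, assuming the menagerie/connections calls shown in Examples #17 and #25; real implementations usually also re-read the catalog afterwards to verify the 'selected' metadata:

    def perform_and_verify_table_and_field_selection(self, conn_id, catalogs,
                                                     select_all_fields=True):
        # Hypothetical sketch -- the real helper differs per tap.
        for catalog in catalogs:
            schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

            non_selected_fields = []
            if not select_all_fields:
                # deselect every field whose inclusion is not automatic, leaving
                # only primary keys / replication keys selected
                non_selected_fields = [
                    md['breadcrumb'][1] for md in schema['metadata']
                    if len(md['breadcrumb']) == 2
                    and md['metadata'].get('inclusion') != 'automatic'
                ]

            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema['annotated-schema'], non_selected_fields)
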
Example #8
    def run_standard_sync(self,
                          environment,
                          data_type,
                          select_all_fields=True):
        """
        Run the tap in check mode.
        Perform table selection based on testable streams.
        Select all fields or no fields based on the select_all_fields param.
        Run a sync.
        """
        conn_id = connections.ensure_connection(self,
                                                original_properties=False)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        streams_to_select = self.testable_streams(environment, data_type)

        print("\n\nRUNNING {}".format(self.name()))
        print("WITH STREAMS: {}".format(streams_to_select))
        print("WITH START DATE: {}\n\n".format(self.START_DATE))

        self.perform_and_verify_table_and_field_selection(
            conn_id,
            found_catalogs,
            streams_to_select,
            select_all_fields=select_all_fields)

        return self.run_and_verify_sync(conn_id)
Example #9
    def test_run(self):
        """
        Verify that we can get multiple pages of unique records for each
        stream
        """

        conn_id = connections.ensure_connection(self)
        self.run_and_verify_check_mode(conn_id)

        self.select_and_verify_fields(conn_id)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        all_records_by_stream = runner.get_records_from_target_output()
        page_size = int(self.get_properties()['page_size'])

        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                # Assert all expected streams synced at least a full page of records
                self.assertGreater(
                    record_count_by_stream.get(stream, 0),
                    page_size,
                    msg="{} did not sync more than a page of records".format(stream)
                )

                records = [ x['data'] for x in all_records_by_stream[stream]['messages']]

                unique_records = self.get_unique_records(stream, records)

                self.assertGreater(len(unique_records),
                                   page_size)
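
Example #9 relies on self.get_unique_records(stream, records) to de-duplicate the synced rows. A plausible sketch, assuming the class exposes expected_primary_keys() (or the equivalent expected_pks()) as the other examples on this page do:

    def get_unique_records(self, stream, records):
        # Hypothetical sketch -- collapse records to a set of primary-key tuples.
        primary_keys = self.expected_primary_keys()[stream]
        return {tuple(record.get(pk) for pk in primary_keys) for record in records}
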
Example #10
    def test_run(self):
        """
        Verify that we can get multiple pages of automatic fields for each
        stream
        """

        conn_id = connections.ensure_connection(self)
        self.run_and_verify_check_mode(conn_id)

        self.select_and_verify_fields(conn_id, select_all_fields=False)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        actual_fields_by_stream = runner.examine_target_output_for_fields()

        # Assert all expected streams synced at least a full page of records
        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                self.assertGreater(
                    record_count_by_stream.get(stream, 0),
                    int(self.get_properties()['page_size']),
                    msg="{} did not sync more than a page of records".format(
                        stream))

        for stream_name, actual_fields in actual_fields_by_stream.items():
            with self.subTest(stream=stream_name):
                self.assertSetEqual(
                    self.expected_automatic_fields()[stream_name],
                    actual_fields)
Example #11
    def test_run(self):
        """
        Testing that all the automatic fields are replicated despite de-selecting them
        - Verify that only the automatic fields are sent to the target.
        - Verify that all replicated records have unique primary key values.
        """
        conn_id = connections.ensure_connection(self)

        # we get duplicate records on the 'id' field for this stream; support was asked
        # about it and this is known behavior on the API side
        # refer to https://jira.talendforge.org/browse/TDL-18686 for more details
        known_failing_streams = {"targeting_android_versions"}
        expected_streams = (self.expected_streams()
                            - known_failing_streams - self.stats_streams)

        # run check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # de-select all the fields
        self.select_found_catalogs(conn_id,
                                   found_catalogs,
                                   only_streams=expected_streams,
                                   deselect_all_fields=True)

        # run sync
        record_count_by_stream = self.run_and_verify_sync(conn_id)
        synced_records = runner.get_records_from_target_output()

        for stream in expected_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_primary_keys = self.expected_primary_keys()[stream]
                expected_keys = (expected_primary_keys
                                 | self.expected_replication_keys()[stream])

                # collect actual values
                messages = synced_records.get(stream)
                record_messages_keys = [
                    set(row['data'].keys()) for row in messages['messages']
                ]

                # check if the stream has collected some records
                self.assertGreater(record_count_by_stream.get(stream, 0), 0)

                # Verify that only the automatic fields are sent to the target
                for actual_keys in record_messages_keys:
                    self.assertSetEqual(expected_keys, actual_keys)

                # Verify we did not duplicate any records across pages
                records_pks_list = [
                    tuple([
                        message.get('data').get(primary_key)
                        for primary_key in expected_primary_keys
                    ]) for message in messages.get('messages')
                ]
                self.assertCountEqual(
                    records_pks_list,
                    set(records_pks_list),
                    msg="We have duplicate records for {}".format(stream))
Example #12
    def test_run(self):
        """
        Run tap in check mode, then select all streams and all fields within streams. Run a sync and
        verify exit codes do not throw errors. This is meant to be a smoke test for the tap. If this
        is failing do not expect any other tests to pass.
        """
        expected_streams = self.expected_streams()

        conn_id = connections.ensure_connection(self)
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        test_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id, test_catalogs, select_all_fields=True)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        # Assert all expected streams synced at least one record
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                self.assertGreater(
                    record_count_by_stream.get(stream, 0),
                    0,
                    msg="{} did not sync any records".format(stream))
Example #13
    def setUp(self):
        required_creds = {
            "client_id": 'TAP_XERO_CLIENT_ID',
            "client_secret": 'TAP_XERO_CLIENT_SECRET',
            "refresh_token": 'TAP_XERO_REFRESH_TOKEN',
        }
        required_props = {
            "tenant_id": 'TAP_XERO_TENANT_ID',
            "xero_user_id": 'TAP_XERO_USER_ID'
        }
        missing_creds = [
            v for v in required_creds.values() if not os.getenv(v)
        ]
        missing_props = [
            v for v in required_props.values() if not os.getenv(v)
        ]
        if missing_creds or missing_props:
            missing_envs = missing_creds + missing_props
            raise Exception("set " + ", ".join(missing_envs))
        self._credentials = {
            k: os.getenv(v)
            for k, v in required_creds.items()
        }
        self.conn_id = connections.ensure_connection(
            self, payload_hook=preserve_refresh_token)
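
The payload_hook referenced here, preserve_refresh_token, is not shown in this example; Example #27 below defines an equivalent hook inline. A module-level sketch following that pattern (hypothetical for this Xero suite):

def preserve_refresh_token(existing_conns, payload):
    # Hypothetical sketch modelled on Example #27: reuse the refresh token from the
    # most recent existing connection so the re-created connection stays authenticated.
    if not existing_conns:
        return payload
    conn_with_creds = connections.fetch_existing_connection_with_creds(existing_conns[0]['id'])
    payload['properties']['refresh_token'] = conn_with_creds['credentials']['refresh_token']
    return payload
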
Example #14
    def test_organizations_dynamic_fields(self):
        """
        Run tap in check mode and verify more than one page is returned for dynamic fields.
        """
        conn_id = connections.ensure_connection(self)

        # run and verify the tap in discovery mode
        found_catalog = self.run_and_verify_check_mode(conn_id)

        # Verify number of dynamic fields in organizations stream metadata
        # (Need enough dynamic fields for organizations)
        for catalog in found_catalog:
            if catalog['stream_name'] == "organizations":
                organization_fields_page_limit = 100

                schema_and_metadata = menagerie.get_annotated_schema(
                    conn_id, catalog['stream_id'])
                schema_fields = schema_and_metadata.get(
                    'annotated-schema').get('properties').keys()
                organizations_dynamic_fields = [
                    field for field in schema_fields
                    if field not in self.organizations_static_fields()
                ]

                # Verify the count of dynamic fields is more than the page limit for organization fields (pagination)
                self.assertGreater(len(organizations_dynamic_fields),
                                   organization_fields_page_limit)
Example #15
    def test_run(self):
        """
        Verify that for each stream you can get multiple pages of data
        when no fields are selected and only the automatic fields are replicated.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """

        expected_streams = self.expected_streams()

        # instantiate connection
        conn_id = connections.ensure_connection(self)

        # run check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        test_catalogs_automatic_fields = [
            catalog for catalog in found_catalogs
            if catalog.get('stream_name') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id,
            test_catalogs_automatic_fields,
            select_all_fields=False,
        )

        # run initial sync
        record_count_by_stream = self.run_and_verify_sync(conn_id)
        synced_records = runner.get_records_from_target_output()

        for stream in expected_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_keys = self.expected_automatic_fields().get(stream)

                # collect actual values
                data = synced_records.get(stream, {})
                record_messages_keys = [
                    set(row.get('data').keys())
                    for row in data.get('messages', [])
                ]

                # Verify that you get some records for each stream
                self.assertGreater(
                    record_count_by_stream.get(stream, -1), 0,
                    msg="No records were replicated for the {} stream".format(stream))

                # Verify that only the automatic fields are sent to the target
                for actual_keys in record_messages_keys:
                    self.assertSetEqual(expected_keys, actual_keys)
Example #16
    def run_test(self, only_automatic_fields=False):
        expected_streams = self.streams_to_select()
        conn_id = connections.ensure_connection(self)
        runner.run_check_mode(self, conn_id)

        expected_stream_fields = dict()

        found_catalogs = menagerie.get_catalogs(conn_id)
        for catalog in found_catalogs:
            stream_name = catalog['stream_name']
            catalog_entry = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
            if stream_name not in expected_streams:
                continue
            # select catalog fields
            self.select_found_catalogs(conn_id,
                                       [catalog],
                                       only_streams=[stream_name],
                                       deselect_all_fields=only_automatic_fields,
                                       non_selected_props=[] if only_automatic_fields else self.non_selected_fields[stream_name])
            # add expected fields for assertion
            fields_from_field_level_md = [md_entry['breadcrumb'][1] for md_entry in catalog_entry['metadata']
                                          if md_entry['breadcrumb'] != []]
            if only_automatic_fields:
                expected_stream_fields[stream_name] = self.expected_primary_keys()[stream_name] | self.expected_replication_keys()[stream_name]
            else:
                expected_stream_fields[stream_name] = set(fields_from_field_level_md) - set(self.non_selected_fields[stream_name])

        self.run_and_verify_sync(conn_id)
        synced_records = runner.get_records_from_target_output()

        for stream in expected_streams:
            with self.subTest(stream=stream):

                expected_primary_keys = self.expected_primary_keys()[stream]

                # get expected keys
                expected_keys = expected_stream_fields[stream]

                # collect all actual values
                messages = synced_records.get(stream)

                # collect actual synced fields
                actual_keys = [set(message['data'].keys()) for message in messages['messages']
                                   if message['action'] == 'upsert'][0]

                fields = self.fields_to_remove.get(stream) or []
                expected_keys = expected_keys - set(fields)

                # verify expected and actual fields
                self.assertEqual(expected_keys, actual_keys,
                                 msg='Selected keys in catalog are not as expected')

                # Verify we did not duplicate any records across pages
                records_pks_set = {tuple([message.get('data').get(primary_key) for primary_key in expected_primary_keys])
                                   for message in messages.get('messages')}
                records_pks_list = [tuple([message.get('data').get(primary_key) for primary_key in expected_primary_keys])
                                    for message in messages.get('messages')]
                self.assertCountEqual(records_pks_set, records_pks_list,
                                      msg="We have duplicate records for {}".format(stream))
Example #17
    def run_test(self):
        conn_id = connections.ensure_connection(self)

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        catalog = menagerie.get_catalogs(conn_id)
        found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(), found_catalog_names)

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in catalog if c['tap_stream_id'] == tap_stream_id
            ][0]
            schema_and_metadata = menagerie.get_annotated_schema(
                conn_id, found_stream['stream_id'])
            main_metadata = schema_and_metadata["metadata"]
            stream_metadata = [
                mdata for mdata in main_metadata if mdata["breadcrumb"] == []
            ]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[tap_stream_id],
                set(stream_metadata[0]['metadata']['table-key-properties']))

        for stream_catalog in catalog:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema['annotated-schema'],
                [])

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_first_sync_streams(),
            self.expected_pks())

        # Verify that the full table was synced
        for tap_stream_id in self.expected_first_sync_streams():
            self.assertEqual(
                self.expected_first_sync_row_counts()[tap_stream_id],
                record_count_by_stream[tap_stream_id])
Example #18
    def test_run(self):
        page_size = 250
        conn_id = connections.ensure_connection(self)

        # Checking pagination for streams with enough data
        expected_streams = [
            "addresses",
            "customers",
            "discounts",
            "metafields_subscription",
            "onetimes",
        ]
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        test_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('stream_name') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id, test_catalogs)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        synced_records = runner.get_records_from_target_output()

        for stream in expected_streams:
            with self.subTest(stream=stream):
                # expected values
                expected_primary_keys = self.expected_primary_keys()

                # collect information for assertions from the sync based on expected values
                record_count_sync = record_count_by_stream.get(stream, 0)
                primary_keys_list = [
                    tuple(
                        message.get('data').get(expected_pk)
                        for expected_pk in expected_primary_keys[stream])
                    for message in synced_records.get(stream).get('messages')
                    if message.get('action') == 'upsert'
                ]

                # verify the record count exceeds the page size, i.e. pagination occurred
                self.assertGreater(record_count_sync, page_size)

                primary_keys_list_1 = primary_keys_list[:page_size]
                primary_keys_list_2 = primary_keys_list[page_size:2 * page_size]

                primary_keys_page_1 = set(primary_keys_list_1)
                primary_keys_page_2 = set(primary_keys_list_2)

                # Verify by primary keys that data is unique for each page
                self.assertEqual(
                    len(primary_keys_page_1),
                    page_size)  # verify there are no dupes on a page
                self.assertTrue(
                    primary_keys_page_1.isdisjoint(primary_keys_page_2)
                )  # verify there are no dupes between pages
Example #19
    def setUp(self):
        missing_envs = [x for x in [os.getenv('TAP_TOGGL_API_TOKEN'),
                                    os.getenv('TAP_TOGGL_DETAILED_REPORT_TRAILING_DAYS')] if x is None]
        if len(missing_envs) != 0:
            #pylint: disable=line-too-long
            raise Exception("set TAP_TOGGL_API_TOKEN, TAP_TOGGL_DETAILED_REPORT_TRAILING_DAYS")

        self.conn_id = connections.ensure_connection(self)
Example #20
    def pre_sync_test(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # tap discovered the right streams
        catalog = menagerie.get_catalog(conn_id)

        table_configs = self.expected_table_config()

        for stream in catalog['streams']:
            # schema is open {} for each stream
            self.assertEqual({'type': 'object'}, stream['schema'])

        expected_streams = {x['TableName'] for x in table_configs}
        # assert we find the correct streams
        self.assertEqual(expected_streams,
                         {c['tap_stream_id'] for c in catalog['streams']})
        # Verify that the table_name is in the format <collection_name> for each stream
        self.assertEqual(expected_streams, {c['table_name'] for c in catalog['streams']})

        for tap_stream_id in expected_streams:
            found_stream = [c for c in catalog['streams'] if c['tap_stream_id'] == tap_stream_id][0]
            stream_metadata = [x['metadata'] for x in found_stream['metadata'] if x['breadcrumb'] == []][0]
            expected_config = [x for x in table_configs if x['TableName'] == tap_stream_id][0]

            # table-key-properties metadata
            keys = [expected_config['HashKey']]
            if expected_config.get('SortKey'):
                keys.append(expected_config.get('SortKey'))

            self.assertEqual(set(keys),
                             set(stream_metadata.get('table-key-properties')))

            # Assert the hash key is the first key in the list
            self.assertEqual(expected_config['HashKey'],
                             stream_metadata.get('table-key-properties')[0])

            # row-count metadata
            self.assertEqual(expected_config['num_rows'],
                             stream_metadata.get('row-count'))

            # selected metadata is None for all streams
            self.assertNotIn('selected', stream_metadata.keys())

            # is-view metadata is False
            self.assertFalse(stream_metadata.get('is-view'))

            # no forced-replication-method metadata
            self.assertNotIn('forced-replication-method', stream_metadata.keys())

        return (table_configs, conn_id, expected_streams)
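
pre_sync_test returns a tuple that a sync test would typically unpack before selecting streams and running the sync. A hypothetical follow-on usage, built from the runner/menagerie calls shown in Example #17:

    def test_run(self):
        # Hypothetical usage sketch of the helper above.
        (table_configs, conn_id, expected_streams) = self.pre_sync_test()

        # select every discovered stream before running the sync
        found_catalogs = menagerie.get_catalogs(conn_id)
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema['annotated-schema'], [])

        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
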
Example #21
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        #select all catalogs

        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
            if c['stream_name'] in self.expected_sync_streams().keys():
                stream = c['stream_name']
                pks = self.expected_sync_streams()[stream]

                for pk in pks:
                    mdata = next((m for m in catalog_entry['metadata']
                                  if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None)
                    print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                    self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

                connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams())
        replicated_row_count = reduce(lambda accum, c: accum + c, first_record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Verify that automatic fields are all emitted with records
        synced_records = runner.get_records_from_target_output()
        for stream_name, data in synced_records.items():
            record_messages = [set(row['data'].keys()) for row in data['messages']]
            self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name))
            for record_keys in record_messages:
                self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
Example #22
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        #select all catalogs
        for catalog in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        future_time = "2050-01-01T00:00:00.000000Z"

        #clear state
        future_bookmarks = {"currently_syncing": None,
                            "bookmarks": {"contacts": {"offset": {},
                                                       "versionTimestamp": future_time},
                                          "subscription_changes": {"startTimestamp": future_time,
                                                                   "offset": {}},
                                          "campaigns": {"offset": {}},
                                          "forms": {"updatedAt": future_time},
                                          "deals": {"offset": {},
                                                    "hs_lastmodifieddate": future_time},
                                          "workflows": {"updatedAt": future_time},
                                          "owners": {"updatedAt": future_time},
                                          "contact_lists": {"updatedAt": future_time,
                                                            "offset": {}},
                                          "email_events": {"startTimestamp": future_time,
                                                           "offset": {}},
                                          "companies": {"offset": {},
                                                        "hs_lastmodifieddate": future_time},
                                          "engagements": {"lastUpdated": future_time,
                                                          "offset": {}}}}

        menagerie.set_state(conn_id, future_bookmarks)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        # because the bookmarks were set in the future, we should NOT actually replicate any data,
        # except campaigns and deal_pipelines because those endpoints do NOT support bookmarks
        streams_with_bookmarks = self.expected_sync_streams()
        streams_with_bookmarks.remove('campaigns')
        streams_with_bookmarks.remove('deal_pipelines')
        bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
        self.assertEqual(len(bad_streams), 0, msg="still pulled down records from {} despite future bookmarks".format(bad_streams))


        state = menagerie.get_state(conn_id)

        # NB: Companies and engagements won't set a bookmark in the future.
        state["bookmarks"].pop("companies")
        state["bookmarks"].pop("engagements")
        future_bookmarks["bookmarks"].pop("companies")
        future_bookmarks["bookmarks"].pop("engagements")

        self.assertEqual(state, future_bookmarks, msg="state should not have been modified because we didn't replicate any data")
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())
Example #23
    def test_run(self):
        """
        • Verify we can deselect all fields except when inclusion=automatic, which is handled by base.py methods
        • Verify that only the automatic fields are sent to the target.
        • Verify that all replicated records have unique primary key values.
        """
        # We are not able to generate test data, so two streams are skipped (mark_as_spam, dropped_email)
        expected_streams = self.expected_streams() - {"mark_as_spam", "dropped_email"}
        
        conn_id = connections.ensure_connection(self)

        # Run in check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)
        
        # table and field selection
        test_catalogs = [catalog for catalog in found_catalogs
                                      if catalog.get('stream_name') in expected_streams]

        # Select all streams and no fields within streams
        self.perform_and_verify_table_and_field_selection(
            conn_id, test_catalogs, select_all_fields=False)

        record_count_by_stream = self.run_and_verify_sync(conn_id)
        synced_records = runner.get_records_from_target_output()
        
        for stream in expected_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_keys = self.expected_automatic_fields().get(stream)
                expected_primary_keys = self.expected_primary_keys()[stream]
                
                # collect actual values
                data = synced_records.get(stream, {})
                record_messages_keys = [set(row['data'].keys())
                                        for row in data.get('messages', [])]
                primary_keys_list = [tuple(message.get('data').get(expected_pk) for expected_pk in expected_primary_keys)
                                       for message in data.get('messages')
                                       if message.get('action') == 'upsert']
                
                unique_primary_keys_list = set(primary_keys_list)
                # Verify that you get some records for each stream
                self.assertGreater(
                    record_count_by_stream.get(stream, -1), 0,
                    msg="The number of records is not over the stream min limit")

                # Verify that only the automatic fields are sent to the target
                for actual_keys in record_messages_keys:
                    self.assertSetEqual(expected_keys, 
                                        actual_keys, 
                                        msg="The fields sent to the target are not the automatic fields")
                    
                #Verify that all replicated records have unique primary key values.
                self.assertEqual(len(primary_keys_list), 
                                    len(unique_primary_keys_list), 
                                    msg="Replicated record does not have unique primary key values.")
Example #24
    def test_run(self):

        conn_id = connections.ensure_connection(self, payload_hook=None)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")
        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], [])

        # Verify that all streams sync at least one row for the initial sync.
        # This test also verifies access token expiration handling: if the test fails with an
        # authentication error, the refresh token was not replaced after expiring.
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        zero_count_streams = {
            k
            for k, v in record_count_by_stream.items() if v == 0
        }
        self.assertFalse(
            zero_count_streams,
            msg="The following streams did not sync any rows {}".format(
                zero_count_streams))
Example #25
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        all_excluded_fields = {}
        # select all catalogs
        for c in found_catalogs:
            if c['stream_name'] == 'ads':
                continue

            discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema']
            all_excluded_fields[c['stream_name']] = list(set(discovered_schema.keys()) - self.expected_automatic_fields().get(c['stream_name'], set()))[:5]
            connections.select_catalog_and_fields_via_metadata(
                conn_id,
                c,
                discovered_schema,
                non_selected_fields=all_excluded_fields[c['stream_name']])

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        synced_records = runner.get_records_from_target_output()
        self.assertTrue('ads' not in synced_records.keys())
        for stream_name, data in synced_records.items():
            record_messages = [set(row['data'].keys()) for row in data['messages']]
            for record_keys in record_messages:
                # The intersection should be empty
                self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
Example #26
    def test_run(self):
        """
        Ensure running the tap with all streams selected and all fields deselected results in the
        replication of just the primary keys and replication keys (automatic fields).
         - Verify we can deselect all fields except when inclusion=automatic (SaaS Taps).
         - Verify that only the automatic fields are sent to the target.
        """

        expected_streams = self.expected_sync_streams()

        # instantiate connection
        conn_id = connections.ensure_connection(self)

        # run check mode
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        test_catalogs_automatic_fields = [
            catalog for catalog in found_catalogs
            if catalog.get('stream_name') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id,
            test_catalogs_automatic_fields,
            select_all_fields=False,
        )

        # run initial sync
        record_count_by_stream = self.run_and_verify_sync(conn_id)
        synced_records = runner.get_records_from_target_output()

        for stream in expected_streams:
            with self.subTest(stream=stream):

                # expected values
                expected_keys = self.expected_automatic_fields().get(stream)

                # collect actual values
                messages = synced_records.get(stream)
                record_messages_keys = [
                    set(message['data'].keys())
                    for message in messages['messages']
                    if message['action'] == 'upsert'
                ]

                # Verify that you get some records for each stream
                self.assertGreater(record_count_by_stream.get(stream, -1), 0)

                # Verify that only the automatic fields are sent to the target
                # BUG TDL-14241 | Replication keys are not automatic
                if stream == "file_metadata":
                    expected_keys.remove('modifiedTime')
                for actual_keys in record_messages_keys:
                    self.assertSetEqual(expected_keys, actual_keys)
Example #27
    def ensure_connection(self, original=True):
        def preserve_refresh_token(existing_conns, payload):
            if not existing_conns:
                return payload
            conn_with_creds = connections.fetch_existing_connection_with_creds(existing_conns[0]['id'])
            # Even though it is a credential, this API posts the entire payload using properties
            payload['properties']['refresh_token'] = conn_with_creds['credentials']['refresh_token']
            return payload

        conn_id = connections.ensure_connection(self, payload_hook=preserve_refresh_token, original_properties=original)
        return conn_id
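
Start-date style tests typically call the wrapper above twice: once with the original properties and once with modified ones, while preserve_refresh_token keeps the re-created connection authenticated. A hypothetical usage sketch (run_and_verify_check_mode is the base.py helper used throughout this page):

    def test_start_date(self):
        # Hypothetical usage sketch of the wrapper above.
        # sync 1 with the original start date
        conn_id_1 = self.ensure_connection()
        self.run_and_verify_check_mode(conn_id_1)

        # sync 2 with modified properties; the payload hook carries the
        # stored refresh token over to the new connection
        conn_id_2 = self.ensure_connection(original=False)
        self.run_and_verify_check_mode(conn_id_2)
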
Example #28
    def pagination_test_run(self):
        """
        Testing that sync creates the appropriate catalog with valid metadata.
        • Verify that all fields and all streams have selected set to True in the metadata
        """
        page_size = 100  # Page size for events
        conn_id = connections.ensure_connection(self)

        # Expected stream is only events
        expected_streams = ["events"]
        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        test_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('stream_name') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id, test_catalogs)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        synced_records = runner.get_records_from_target_output()

        for stream in expected_streams:
            with self.subTest(stream=stream):
                # expected values
                expected_primary_keys = self.expected_primary_keys()[stream]

                # collect information for assertions from the sync based on expected values
                record_count_sync = record_count_by_stream.get(stream, 0)
                primary_keys_list = [
                    tuple(
                        message.get('data').get(expected_pk)
                        for expected_pk in expected_primary_keys)
                    for message in synced_records.get(stream).get('messages')
                    if message.get('action') == 'upsert'
                ]

                # verify the record count exceeds the page size, i.e. pagination occurred
                self.assertGreater(record_count_sync, page_size)

                if record_count_sync > page_size:
                    primary_keys_list_1 = primary_keys_list[:page_size]
                    primary_keys_list_2 = primary_keys_list[page_size:2 * page_size]

                    primary_keys_page_1 = set(primary_keys_list_1)
                    primary_keys_page_2 = set(primary_keys_list_2)

                    # Verify by primary keys that data is unique for each page
                    self.assertTrue(
                        primary_keys_page_1.isdisjoint(primary_keys_page_2))
Example #29
    def create_connection(self, original_properties: bool = True):
        """Create a new connection with the test name"""
        # Create the connection
        conn_id = connections.ensure_connection(self, original_properties=original_properties)

        # Run a check job using orchestrator (discovery)
        check_job_name = runner.run_check_mode(self, conn_id)

        # Assert that the check job succeeded
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)
        return conn_id
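
A hypothetical usage of this helper from a test method: create the connection (discovery already runs and is verified inside create_connection), then fetch and inspect the catalogs with menagerie as the other examples do:

    def test_discovery(self):
        # Hypothetical usage sketch of the helper above.
        conn_id = self.create_connection()

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs), 0,
            msg="unable to locate schemas for connection {}".format(conn_id))
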
Example #30
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema)

        # Clear State and run sync
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))