Code Example #1
    def test_run(self):

        self.setUpTestEnvironment(COMPRESSION_FOLDER_PATH)

        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(
                self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        self.select_specific_catalog(found_catalogs,
                                     "gz_file_having_empty_csv")

        runner.run_sync_job_and_check_status(self)

        # The gz file contains only an empty CSV, so no records should be synced
        expected_records = 0
        records = runner.get_upserts_from_target_output()

        self.assertEqual(expected_records, len(records))
Code Example #2
    def test_catalog_without_properties(self):

        self.setUpTestEnvironment()

        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(len(found_catalogs), 1,
                         msg="unable to locate schemas for connection {}".format(self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset, msg="Expected check streams are not subset of discovered catalog")

        our_catalogs = [c for c in found_catalogs if c.get(
            'tap_stream_id') in self.expected_streams()]

        # Select our catalogs
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                self.conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        synced_records = runner.get_records_from_target_output()
        upsert_messages = [m for m in synced_records.get(
            'csv_with_empty_lines').get('messages') if m['action'] == 'upsert']

        records = [message.get('data') for message in upsert_messages]

        # Empty lines should be ignored in the emitted records.

        expected_records = [
            {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}],
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
             '_sdc_source_lineno': 2},
            {'id': 2, 'name': 'Bob',
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
             '_sdc_source_lineno': 3},
            {'id': 3,
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
             '_sdc_source_lineno': 4},
            {'id': 4, 'name': 'Alice',
             '_sdc_extra': [{'no_headers': ['Ben', '5']}, {'name': 'Barak'}],
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
             '_sdc_source_lineno': 5}
        ]

        self.assertListEqual(expected_records, records)
Code Example #3
    def test_run(self):
        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.check_all_streams_in_catalogs(found_catalogs)
        self.select_found_catalogs(found_catalogs)

        # clear state and run the actual sync
        menagerie.set_state(self.conn_id, {})
        runner.run_sync_job_and_check_status(self)
        self.check_output_record_counts()

        max_bookmarks_from_records = runner.get_max_bookmarks_from_target(self)
        state = menagerie.get_state(self.conn_id)
        bookmarks = state.get("bookmarks", {})
        self.check_bookmarks(bookmarks, max_bookmarks_from_records)
        self.check_offsets(bookmarks)
        self.look_for_unexpected_bookmarks(bookmarks)
        self.assertIsNone(state.get("currently_syncing"))
Code Example #4
    def test_000_run(self):
        """
        run discovery as the first test and ensure that it completed as expected.
        """
        runner.run_check_job_and_check_status(self)
Code Example #5
    def test_run(self):
        # Connect to stitch service.
        runner.run_check_job_and_check_status(self)

        # Get and check streams.
        self.found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(len(self.found_catalogs), 5, msg="unable to locate schemas for connection {}".format(self.conn_id))

        # Match streams.
        our_catalogs = [c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(self.conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(self.conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator, verify tap and target exit codes
        # and verify actual rows were synced.
        first_sync_record_count = self.run_sync(self.conn_id)

        replicated_row_count = reduce(lambda accum, c: accum + c, first_sync_record_count.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_sync_record_count))
        print("total replicated row count: {}".format(replicated_row_count))

        # Get incremental vs. non-incremental streams.
        non_incremental_streams = {key for key, value in self.expected_replication_method().items() if value != 'INCREMENTAL'}
        incremental_streams = {key for key, value in self.expected_replication_method().items() if value == 'INCREMENTAL'}

        # Get bookmark and state data for first sync, excluding full table streams.
        first_sync_state = menagerie.get_state(self.conn_id)
        first_sync_records = runner.get_records_from_target_output()

        for v in non_incremental_streams:
            first_sync_records.pop(v, None)

        first_max_bookmarks = self.max_bookmarks_by_stream(first_sync_records)
        first_min_bookmarks = self.min_bookmarks_by_stream(first_sync_records)

        # Run a second sync job using orchestrator.
        second_sync_record_count = self.run_sync(self.conn_id)

        # Get data about rows synced, excluding full table streams.
        second_sync_records = runner.get_records_from_target_output()

        for v in non_incremental_streams:
            second_sync_records.pop(v, None)

        second_min_bookmarks = self.min_bookmarks_by_stream(second_sync_records)

        for stream in incremental_streams:
            # get bookmark values from state and target data
            stream_bookmark_key = self.expected_rks().get(stream, set())
            assert len(stream_bookmark_key) == 1  # There shouldn't be a compound replication key
            stream_bookmark_key = stream_bookmark_key.pop()

            state_value = first_sync_state.get("bookmarks", {}).get(
                stream, {None: None}).get(stream_bookmark_key)
            target_value = first_max_bookmarks.get(
                stream, {None: None}).get(stream_bookmark_key)
            target_min_value = first_min_bookmarks.get(
                stream, {None: None}).get(stream_bookmark_key)

            if target_value:
                # Convert everything to datetime.
                state_value = utils.strptime_with_tz(state_value)
                target_value = utils.strptime_with_tz(target_value)
                target_min_value = utils.strptime_with_tz(target_min_value)

                # verify that there is data with different bookmark values - setup necessary
                self.assertTrue(target_value >= target_min_value, msg="Data isn't set up to be able to test bookmarks")

                # verify state agrees with target data after 1st sync
                self.assertEqual(state_value, target_value, msg="The bookmark value isn't correct based on target data")

                # verify that you get less data the 2nd time around
                self.assertGreater(
                    first_sync_record_count.get(stream, 0),
                    second_sync_record_count.get(stream, 0),
                    msg="second sync for stream {} didn't have less records, bookmark usage not verified".format(stream))

                if len(second_sync_records) > 0 and len(second_min_bookmarks) > 0:
                    # verify all data from 2nd sync >= 1st bookmark
                    target_value = second_min_bookmarks.get(stream, {None: None}).get(stream_bookmark_key)
                    target_value = utils.strptime_with_tz(target_value)
                    # verify that the minimum bookmark sent to the target for the second sync
                    # is greater than or equal to the bookmark from the first sync
                    self.assertTrue(target_value >= state_value)
Code Example #6
    def test_catalog_without_properties(self):

        self.setUpTestEnvironment()

        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(len(found_catalogs), 1,
                         msg="unable to locate schemas for connection {}".format(self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset, msg="Expected check streams are not subset of discovered catalog")

        our_catalogs = [c for c in found_catalogs if c.get(
            'tap_stream_id') in self.expected_streams()]

        # Select our catalogs
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                self.conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated, [], [])

        # Verify that the schema contains empty properties
        expected_schema = {
            'type': 'object',
            'properties': {}
        }
        self.assertEqual(expected_schema, c_annotated.get('annotated-schema', {}))

        # Stream properties should be zero, as all 5 files considered in sampling contain headers only,
        # so no fields with a breadcrumb will be present in the schema.
        metadata = c_annotated["metadata"]
        stream_properties = [item for item in metadata if item.get("breadcrumb") != []]
        self.assertEqual(len(stream_properties), 0)

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        synced_records = runner.get_records_from_target_output()
        upsert_messages = [m for m in synced_records.get(
            'catalog_without_properties').get('messages') if m['action'] == 'upsert']

        records = [message.get('data') for message in upsert_messages]

        # All fields from the file test_empty_catalog_7.csv should be emitted, with duplicate-header
        # and no-header handling, because the catalog has no fields.

        expected_records = [
            {'id': '1', 'name': 'John', '_sdc_extra': [{'name': 'carl'}],
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_empty_catalog_7.csv',
             '_sdc_source_lineno': 2},
            {'id': '2', 'name': 'Bob',
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_empty_catalog_7.csv',
             '_sdc_source_lineno': 3},
            {'id': '3',
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_empty_catalog_7.csv',
             '_sdc_source_lineno': 4},
            {'id': '4', 'name': 'Alice',
             '_sdc_extra': [{'no_headers': ['Ben', '5']}, {'name': 'Barak'}],
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_empty_catalog_7.csv',
             '_sdc_source_lineno': 5}
        ]

        self.assertListEqual(expected_records, records)
Code Example #7
    def test_duplicate_headers_in_csv(self):
        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(
                self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Select our catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_streams()
        ]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                self.conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, self.conn_id, self.expected_streams(), self.expected_pks())
        self.assertGreater(sum(record_count_by_stream.values()),
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(
            sum(record_count_by_stream.values())))

        synced_records = runner.get_records_from_target_output()
        upsert_messages = [
            m for m in synced_records.get('duplicate_headers').get('messages')
            if m['action'] == 'upsert'
        ]

        records = [message.get('data') for message in upsert_messages]

        expected_records = [
            {
                "a0": "a1", "b0": "b1", "c0": "c1", "d0": "d1", "e0": "e1", "f0": "f1",
                "_sdc_extra": [{"a0": "a11"}, {"b0": ["b11", "b12", "b13"]}, {"c0": "c11"}],
                "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
                "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
                "_sdc_source_lineno": 2
            },
            {
                "a0": "a2", "b0": "b2", "c0": "c2", "d0": "d2", "e0": "e2", "f0": "f2",
                "_sdc_extra": [{"a0": "a21"}, {"b0": "b21"}],
                "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
                "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
                "_sdc_source_lineno": 3
            },
            {
                "a0": "a3", "b0": "b3", "c0": "c3",
                "_sdc_extra": [{"a0": "a31"}],
                "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
                "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
                "_sdc_source_lineno": 4
            },
            {
                "a0": "a4",
                "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
                "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
                "_sdc_source_lineno": 5
            },
            {
                "a0": "a5", "b0": "", "c0": "c5", "d0": "d5",
                "_sdc_extra": [{"a0": ""}],
                "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
                "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
                "_sdc_source_lineno": 6
            },
            {
                "a0": "a6", "b0": "b6", "c0": "c6", "d0": "d6", "e0": "e6", "f0": "f6",
                "_sdc_extra": [{"no_headers": ["g0", "h0", "i0"]}, {"a0": "a61"},
                               {"b0": ["b61", "b62", "b63"]}, {"c0": "c61"}],
                "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
                "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
                "_sdc_source_lineno": 7
            }
        ]

        self.assertListEqual(expected_records, records)
Code Example #8
File: test_bookmarks.py, Project: symon-ai/tap-s3-csv
    def test_run(self):
        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(
                self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Select our catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                self.conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, self.conn_id, self.expected_sync_streams(),
            self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Put a new file to S3
        delete_and_push_csv(self.get_properties(), "bookmarks2.csv")

        # Run another Sync
        sync_job_name = runner.run_sync_mode(self, self.conn_id)
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Check that we synced new records.
        records = runner.get_records_from_target_output()
        messages = records.get('chickens').get('messages')
        self.assertEqual(len(messages),
                         2,
                         msg="Sync'd incorrect count of messages: {}".format(
                             len(messages)))

        # Run a final sync
        sync_job_name = runner.run_sync_mode(self, self.conn_id)
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Check that we synced new records.
        records = runner.get_records_from_target_output()
        messages = records.get('chickens', {}).get('messages', [])
        self.assertEqual(len(messages),
                         0,
                         msg="Sync'd incorrect count of messages: {}".format(
                             len(messages)))
Code Example #9
    def test_run(self):

        self.setUpTestEnvironment()

        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(
            len(found_catalogs),
            len(self.expected_check_streams()),
            msg="unable to locate schemas for connection {}".format(
                self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        self.select_found_catalogs(found_catalogs)

        runner.run_sync_job_and_check_status(self)

        no_csv_records = 998
        no_jsonl_records = 10
        no_gz_has_csv_records = 998
        no_gz_has_jsonl_records = 2
        no_zip_records = 40

        expected_records = no_csv_records + no_jsonl_records + no_gz_has_csv_records + no_gz_has_jsonl_records + no_zip_records

        with open(utils.get_resources_path("output_csv_records.json",
                                           ALL_SUPPORTED_FOLDER_PATH)) as json_file:
            expected_csv_records = simplejson.load(json_file, use_decimal=True).get("records", [])
        with open(utils.get_resources_path("output_jsonl_records.json",
                                           ALL_SUPPORTED_FOLDER_PATH)) as json_file:
            expected_jsonl_records = simplejson.load(json_file, use_decimal=True).get("records", [])
        with open(utils.get_resources_path("output_gz_csv_records.json",
                                           ALL_SUPPORTED_FOLDER_PATH)) as json_file:
            expected_gz_has_csv_records = simplejson.load(json_file, use_decimal=True).get("records", [])
        with open(utils.get_resources_path("output_gz_jsonl_records.json",
                                           ALL_SUPPORTED_FOLDER_PATH)) as json_file:
            expected_gz_has_jsonl_records = simplejson.load(json_file, use_decimal=True).get("records", [])
        with open(utils.get_resources_path("output_zip_records.json",
                                           ALL_SUPPORTED_FOLDER_PATH)) as json_file:
            expected_zip_records = simplejson.load(json_file, use_decimal=True).get("records", [])

        synced_records = runner.get_records_from_target_output()

        csv_upsert_messages = [
            m for m in synced_records.get('all_support_csv').get('messages')
            if m['action'] == 'upsert'
        ]
        jsonl_upsert_messages = [
            m for m in synced_records.get('all_support_jsonl').get('messages')
            if m['action'] == 'upsert'
        ]
        gz_with_csv_upsert_messages = [
            m for m in synced_records.get('all_support_gz_has_csv').get(
                'messages') if m['action'] == 'upsert'
        ]
        gz_with_jsonl_upsert_messages = [
            m for m in synced_records.get('all_support_gz_has_jsonl').get(
                'messages') if m['action'] == 'upsert'
        ]
        zip_upsert_messages = [
            m for m in synced_records.get('all_support_zip').get('messages')
            if m['action'] == 'upsert'
        ]

        csv_records = [message.get('data') for message in csv_upsert_messages]
        jsonl_records = [
            message.get('data') for message in jsonl_upsert_messages
        ]
        gz_has_csv_records = [
            message.get('data') for message in gz_with_csv_upsert_messages
        ]
        gz_has_jsonl_records = [
            message.get('data') for message in gz_with_jsonl_upsert_messages
        ]
        zip_records = [message.get('data') for message in zip_upsert_messages]

        no_records = len(csv_records) + len(jsonl_records) + len(
            gz_has_csv_records) + len(gz_has_jsonl_records) + len(zip_records)
        self.assertEqual(expected_records, no_records)

        self.assertEqual(expected_csv_records, csv_records)
        self.assertEqual(expected_jsonl_records, jsonl_records)
        self.assertEqual(expected_gz_has_csv_records, gz_has_csv_records)
        self.assertEqual(expected_gz_has_jsonl_records, gz_has_jsonl_records)
        self.assertEqual(expected_zip_records, zip_records)
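
Note: all of the examples above follow the same tap-tester skeleton: run the check (discovery) job, verify the discovered catalogs against the expected streams, select the catalogs via metadata, clear state, run a sync, and inspect the records emitted to the target. The sketch below only distills that shared pattern; it is not taken from any of the projects quoted above, and the method name, the expected_streams() helper, and the final assertion are illustrative placeholders.

    def test_basic_sync_flow(self):
        # Run the check (discovery) job and verify it completed successfully
        runner.run_check_job_and_check_status(self)

        # Verify that every expected stream was discovered
        found_catalogs = menagerie.get_catalogs(self.conn_id)
        found_names = {c['tap_stream_id'] for c in found_catalogs}
        self.assertTrue(self.expected_streams().issubset(found_names))

        # Select the expected catalogs with all of their fields
        for c in found_catalogs:
            if c['tap_stream_id'] not in self.expected_streams():
                continue
            c_annotated = menagerie.get_annotated_schema(self.conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated, [], [])

        # Clear state, run the sync job, and verify the tap and target exit codes
        menagerie.set_state(self.conn_id, {})
        sync_job_name = runner.run_sync_mode(self, self.conn_id)
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Inspect the records the tap emitted to the target
        synced_records = runner.get_records_from_target_output()
        self.assertGreater(len(synced_records), 0)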