def test_run(self):
    self.setUpTestEnvironment(COMPRESSION_FOLDER_PATH)

    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not subset of discovered catalog")

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    self.select_specific_catalog(found_catalogs, "gz_file_having_empty_csv")

    runner.run_sync_job_and_check_status(self)

    expected_records = 0

    # Verify actual rows were synced
    records = runner.get_upserts_from_target_output()

    self.assertEqual(expected_records, len(records))
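
# Illustrative sketch only: `select_specific_catalog` is called above but is not
# defined in this section. Assuming it mirrors the catalog-selection loops used by
# the other tests here (menagerie + connections), a minimal implementation might
# look like this.
def select_specific_catalog(self, found_catalogs, catalog_to_select):
    for catalog in found_catalogs:
        if catalog['tap_stream_id'] != catalog_to_select:
            continue
        # Select the single matching catalog with all of its discovered fields.
        schema = menagerie.get_annotated_schema(self.conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, catalog, schema, [], [])
        break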
def test_catalog_without_properties(self):
    self.setUpTestEnvironment()

    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not subset of discovered catalog")

    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_streams()]

    # Select our catalogs
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(
            self.conn_id, c['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    synced_records = runner.get_records_from_target_output()
    upsert_messages = [m for m in synced_records.get(
        'csv_with_empty_lines').get('messages') if m['action'] == 'upsert']

    records = [message.get('data') for message in upsert_messages]

    # Empty lines should be ignored in the emitted records.
    expected_records = [
        {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}],
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
         '_sdc_source_lineno': 2},
        {'id': 2, 'name': 'Bob',
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
         '_sdc_source_lineno': 3},
        {'id': 3,
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
         '_sdc_source_lineno': 4},
        {'id': 4, 'name': 'Alice',
         '_sdc_extra': [{'no_headers': ['Ben', '5']}, {'name': 'Barak'}],
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
         '_sdc_source_lineno': 5}
    ]

    self.assertListEqual(expected_records, records)
def test_run(self):
    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.check_all_streams_in_catalogs(found_catalogs)
    self.select_found_catalogs(found_catalogs)

    # Clear state and run the actual sync
    menagerie.set_state(self.conn_id, {})
    runner.run_sync_job_and_check_status(self)

    self.check_output_record_counts()

    max_bookmarks_from_records = runner.get_max_bookmarks_from_target(self)

    state = menagerie.get_state(self.conn_id)
    bookmarks = state.get("bookmarks", {})

    self.check_bookmarks(bookmarks, max_bookmarks_from_records)
    self.check_offsets(bookmarks)
    self.look_for_unexpected_bookmarks(bookmarks)
    self.assertIsNone(state.get("currently_syncing"))
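
# Illustrative sketch only: `select_found_catalogs` is used by the tests above and
# below but defined elsewhere in the suite. Assuming it simply selects every
# discovered catalog the test expects (with all fields), it could look roughly like
# this; the use of `expected_sync_streams()` as the filter is an assumption.
def select_found_catalogs(self, found_catalogs):
    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for catalog in our_catalogs:
        # Select the catalog and all of its fields via metadata.
        schema = menagerie.get_annotated_schema(self.conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, catalog, schema, [], [])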
def test_000_run(self):
    """
    Run discovery as the first test and ensure that it completes as expected.
    """
    runner.run_check_job_and_check_status(self)
def test_run(self):
    # Connect to the Stitch service.
    runner.run_check_job_and_check_status(self)

    # Get and check streams.
    self.found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(
        len(self.found_catalogs),
        5,
        msg="unable to locate schemas for connection {}".format(self.conn_id))

    # Match streams.
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(self.conn_id, c['stream_id'])
        c_metadata = metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using the orchestrator, verify tap and target exit codes,
    # and verify actual rows were synced.
    first_sync_record_count = self.run_sync(self.conn_id)
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_sync_record_count.values())
    self.assertGreater(
        replicated_row_count, 0,
        msg="failed to replicate any data: {}".format(first_sync_record_count))
    print("total replicated row count: {}".format(replicated_row_count))

    # Split incremental vs. non-incremental streams.
    non_incremental_streams = {key for key, value in self.expected_replication_method().items()
                               if value != 'INCREMENTAL'}
    incremental_streams = {key for key, value in self.expected_replication_method().items()
                           if value == 'INCREMENTAL'}

    # Get bookmark and state data for the first sync, excluding full table streams.
    first_sync_state = menagerie.get_state(self.conn_id)
    first_sync_records = runner.get_records_from_target_output()
    for v in non_incremental_streams:
        first_sync_records.pop(v, None)

    first_max_bookmarks = self.max_bookmarks_by_stream(first_sync_records)
    first_min_bookmarks = self.min_bookmarks_by_stream(first_sync_records)

    # Run a second sync job using the orchestrator.
    second_sync_record_count = self.run_sync(self.conn_id)

    # Get data about rows synced, excluding full table streams.
    second_sync_records = runner.get_records_from_target_output()
    for v in non_incremental_streams:
        second_sync_records.pop(v, None)

    second_min_bookmarks = self.min_bookmarks_by_stream(second_sync_records)

    for stream in incremental_streams:
        # Get bookmark values from state and target data.
        stream_bookmark_key = self.expected_rks().get(stream, set())
        # There shouldn't be a compound replication key.
        assert len(stream_bookmark_key) == 1
        stream_bookmark_key = stream_bookmark_key.pop()

        state_value = first_sync_state.get("bookmarks", {}).get(
            stream, {None: None}).get(stream_bookmark_key)
        target_value = first_max_bookmarks.get(
            stream, {None: None}).get(stream_bookmark_key)
        target_min_value = first_min_bookmarks.get(
            stream, {None: None}).get(stream_bookmark_key)

        if target_value:
            # Convert everything to datetime.
            state_value = utils.strptime_with_tz(state_value)
            target_value = utils.strptime_with_tz(target_value)
            target_min_value = utils.strptime_with_tz(target_min_value)

            # Verify that there is data with different bookmark values - setup necessary.
            self.assertTrue(target_value >= target_min_value,
                            msg="Data isn't set up to be able to test bookmarks")

            # Verify that state agrees with target data after the first sync.
            self.assertEqual(state_value, target_value,
                             msg="The bookmark value isn't correct based on target data")

            # Verify that the second sync replicates less data.
            self.assertGreater(
                first_sync_record_count.get(stream, 0),
                second_sync_record_count.get(stream, 0),
                msg="second sync for stream {} didn't have less records, bookmark usage not verified".format(stream))

            if len(second_sync_records) > 0 and len(second_min_bookmarks) > 0:
                # Verify all data from the second sync is >= the first bookmark.
                target_value = second_min_bookmarks.get(
                    stream, {None: None}).get(stream_bookmark_key)
                target_value = utils.strptime_with_tz(target_value)
                # Verify that the minimum bookmark sent to the target for the second sync
                # is greater than or equal to the bookmark from the first sync.
                self.assertTrue(target_value >= state_value)
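
# Illustrative sketch only: `run_sync` is called twice in the bookmark test above
# but is defined elsewhere in the suite. Based on the sync pattern used by the
# other tests in this section, it presumably runs a sync job, verifies the exit
# status, and returns the per-stream record counts from the target output.
def run_sync(self, conn_id):
    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Return the number of records replicated per stream
    return runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())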
def test_catalog_without_properties(self):
    self.setUpTestEnvironment()

    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not subset of discovered catalog")

    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_streams()]

    # Select our catalogs
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(
            self.conn_id, c['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, c, c_annotated, [], [])

        # Verify that the annotated schema contains empty properties
        expected_schema = {
            'type': 'object',
            'properties': {}
        }
        self.assertEqual(expected_schema,
                         c_annotated.get('annotated-schema', {}))

        # Stream properties should be zero, as all 5 files considered for
        # sampling contain headers only. No fields with a breadcrumb will be
        # present in the schema metadata.
        metadata = c_annotated["metadata"]
        stream_properties = [item for item in metadata
                             if item.get("breadcrumb") != []]
        self.assertEqual(len(stream_properties), 0)

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    synced_records = runner.get_records_from_target_output()
    upsert_messages = [m for m in synced_records.get(
        'catalog_without_properties').get('messages') if m['action'] == 'upsert']

    records = [message.get('data') for message in upsert_messages]

    # All fields from test_empty_catalog_7.csv should be emitted with duplicate
    # and no-header handling, as the catalog is without any fields.
    expected_records = [
        {'id': '1', 'name': 'John', '_sdc_extra': [{'name': 'carl'}],
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_empty_catalog_7.csv',
         '_sdc_source_lineno': 2},
        {'id': '2', 'name': 'Bob',
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_empty_catalog_7.csv',
         '_sdc_source_lineno': 3},
        {'id': '3',
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_empty_catalog_7.csv',
         '_sdc_source_lineno': 4},
        {'id': '4', 'name': 'Alice',
         '_sdc_extra': [{'no_headers': ['Ben', '5']}, {'name': 'Barak'}],
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_empty_catalog_7.csv',
         '_sdc_source_lineno': 5}
    ]

    self.assertListEqual(expected_records, records)
def test_duplicate_headers_in_csv(self):
    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not subset of discovered catalog")

    # Select our catalogs
    our_catalogs = [
        c for c in found_catalogs
        if c.get('tap_stream_id') in self.expected_streams()
    ]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(
            self.conn_id, c['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, self.conn_id, self.expected_streams(), self.expected_pks())
    self.assertGreater(
        sum(record_count_by_stream.values()), 0,
        msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(
        sum(record_count_by_stream.values())))

    synced_records = runner.get_records_from_target_output()
    upsert_messages = [
        m for m in synced_records.get('duplicate_headers').get('messages')
        if m['action'] == 'upsert'
    ]
    records = [message.get('data') for message in upsert_messages]

    expected_records = [
        {"a0": "a1", "b0": "b1", "c0": "c1", "d0": "d1", "e0": "e1", "f0": "f1",
         "_sdc_extra": [{"a0": "a11"}, {"b0": ["b11", "b12", "b13"]}, {"c0": "c11"}],
         "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
         "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
         "_sdc_source_lineno": 2},
        {"a0": "a2", "b0": "b2", "c0": "c2", "d0": "d2", "e0": "e2", "f0": "f2",
         "_sdc_extra": [{"a0": "a21"}, {"b0": "b21"}],
         "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
         "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
         "_sdc_source_lineno": 3},
        {"a0": "a3", "b0": "b3", "c0": "c3",
         "_sdc_extra": [{"a0": "a31"}],
         "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
         "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
         "_sdc_source_lineno": 4},
        {"a0": "a4",
         "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
         "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
         "_sdc_source_lineno": 5},
        {"a0": "a5", "b0": "", "c0": "c5", "d0": "d5",
         "_sdc_extra": [{"a0": ""}],
         "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
         "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
         "_sdc_source_lineno": 6},
        {"a0": "a6", "b0": "b6", "c0": "c6", "d0": "d6", "e0": "e6", "f0": "f6",
         "_sdc_extra": [{"no_headers": ["g0", "h0", "i0"]}, {"a0": "a61"},
                        {"b0": ["b61", "b62", "b63"]}, {"c0": "c61"}],
         "_sdc_source_bucket": "com-stitchdata-prod-circleci-assets",
         "_sdc_source_file": "tap_tester/tap-s3-csv/duplicate_headers.csv",
         "_sdc_source_lineno": 7}
    ]

    self.assertListEqual(expected_records, records)
def test_run(self):
    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not subset of discovered catalog")

    # Select our catalogs
    our_catalogs = [
        c for c in found_catalogs
        if c.get('tap_stream_id') in self.expected_sync_streams()
    ]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(
            self.conn_id, c['stream_id'])
        c_metadata = metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, self.conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(
        replicated_row_count, 0,
        msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Put a new file to S3
    delete_and_push_csv(self.get_properties(), "bookmarks2.csv")

    # Run another sync
    sync_job_name = runner.run_sync_mode(self, self.conn_id)
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Check that we synced the new records.
    records = runner.get_records_from_target_output()
    messages = records.get('chickens').get('messages')
    self.assertEqual(
        len(messages), 2,
        msg="Sync'd incorrect count of messages: {}".format(len(messages)))

    # Run a final sync
    sync_job_name = runner.run_sync_mode(self, self.conn_id)
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Check that we did not sync any new records.
    records = runner.get_records_from_target_output()
    messages = records.get('chickens', {}).get('messages', [])
    self.assertEqual(
        len(messages), 0,
        msg="Sync'd incorrect count of messages: {}".format(len(messages)))
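
# Illustrative sketch only: `delete_and_push_csv` is imported from a helper module,
# not defined in this section. Assuming it clears the previously uploaded object
# from the test bucket and pushes a fresh local fixture via boto3, it might look
# roughly like this; the 'bucket'/'prefix' property keys and the local fixture
# directory are assumptions, not confirmed by the source.
import boto3

def delete_and_push_csv(properties, file_name):
    s3_client = boto3.client('s3')
    bucket = properties['bucket']             # assumed connection property
    key = properties['prefix'] + file_name    # assumed key layout

    # Remove any object left over from a previous run, then upload the fixture.
    s3_client.delete_object(Bucket=bucket, Key=key)
    s3_client.upload_file('resources/' + file_name, bucket, key)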
def test_run(self):
    self.setUpTestEnvironment()

    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(
        len(found_catalogs),
        len(self.expected_check_streams()),
        msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset,
        msg="Expected check streams are not subset of discovered catalog")

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    self.select_found_catalogs(found_catalogs)

    runner.run_sync_job_and_check_status(self)

    no_csv_records = 998
    no_jsonl_records = 10
    no_gz_has_csv_records = 998
    no_gz_has_jsonl_records = 2
    no_zip_records = 40

    expected_records = (no_csv_records + no_jsonl_records + no_gz_has_csv_records
                        + no_gz_has_jsonl_records + no_zip_records)

    with open(utils.get_resources_path(
            "output_csv_records.json", ALL_SUPPORTED_FOLDER_PATH)) as json_file:
        expected_csv_records = simplejson.load(
            json_file, use_decimal=True).get("records", [])

    with open(utils.get_resources_path(
            "output_jsonl_records.json", ALL_SUPPORTED_FOLDER_PATH)) as json_file:
        expected_jsonl_records = simplejson.load(
            json_file, use_decimal=True).get("records", [])

    with open(utils.get_resources_path(
            "output_gz_csv_records.json", ALL_SUPPORTED_FOLDER_PATH)) as json_file:
        expected_gz_has_csv_records = simplejson.load(
            json_file, use_decimal=True).get("records", [])

    with open(utils.get_resources_path(
            "output_gz_jsonl_records.json", ALL_SUPPORTED_FOLDER_PATH)) as json_file:
        expected_gz_has_jsonl_records = simplejson.load(
            json_file, use_decimal=True).get("records", [])

    with open(utils.get_resources_path(
            "output_zip_records.json", ALL_SUPPORTED_FOLDER_PATH)) as json_file:
        expected_zip_records = simplejson.load(
            json_file, use_decimal=True).get("records", [])

    synced_records = runner.get_records_from_target_output()

    csv_upsert_messages = [
        m for m in synced_records.get('all_support_csv').get('messages')
        if m['action'] == 'upsert'
    ]
    jsonl_upsert_messages = [
        m for m in synced_records.get('all_support_jsonl').get('messages')
        if m['action'] == 'upsert'
    ]
    gz_with_csv_upsert_messages = [
        m for m in synced_records.get('all_support_gz_has_csv').get('messages')
        if m['action'] == 'upsert'
    ]
    gz_with_jsonl_upsert_messages = [
        m for m in synced_records.get('all_support_gz_has_jsonl').get('messages')
        if m['action'] == 'upsert'
    ]
    zip_upsert_messages = [
        m for m in synced_records.get('all_support_zip').get('messages')
        if m['action'] == 'upsert'
    ]

    csv_records = [message.get('data') for message in csv_upsert_messages]
    jsonl_records = [message.get('data') for message in jsonl_upsert_messages]
    gz_has_csv_records = [message.get('data') for message in gz_with_csv_upsert_messages]
    gz_has_jsonl_records = [message.get('data') for message in gz_with_jsonl_upsert_messages]
    zip_records = [message.get('data') for message in zip_upsert_messages]

    no_records = (len(csv_records) + len(jsonl_records) + len(gz_has_csv_records)
                  + len(gz_has_jsonl_records) + len(zip_records))

    self.assertEqual(expected_records, no_records)

    self.assertEqual(expected_csv_records, csv_records)
    self.assertEqual(expected_jsonl_records, jsonl_records)
    self.assertEqual(expected_gz_has_csv_records, gz_has_csv_records)
    self.assertEqual(expected_gz_has_jsonl_records, gz_has_jsonl_records)
    self.assertEqual(expected_zip_records, zip_records)