def run_test(self):
    conn_id = connections.ensure_connection(self)

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalogs(conn_id)
    found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(), found_catalog_names)

    for tap_stream_id in self.expected_check_streams():
        found_stream = [c for c in catalog if c['tap_stream_id'] == tap_stream_id][0]
        schema_and_metadata = menagerie.get_annotated_schema(conn_id, found_stream['stream_id'])
        main_metadata = schema_and_metadata["metadata"]
        stream_metadata = [mdata for mdata in main_metadata if mdata["breadcrumb"] == []]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[tap_stream_id],
            set(stream_metadata[0]['metadata']['table-key-properties']))

    for stream_catalog in catalog:
        annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema['annotated-schema'], [])

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_first_sync_streams(), self.expected_pks())

    # Verify that the full table was synced
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(
            self.expected_first_sync_row_counts()[tap_stream_id],
            record_count_by_stream[tap_stream_id])

def select_all_streams_and_fields(self,
                                  conn_id,
                                  catalogs,
                                  select_all_fields: bool = True,
                                  select_default_fields: bool = False):
    """Select all streams and all fields within streams"""
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        non_selected_properties = []
        if not select_all_fields:
            # get a list of all properties so that none are selected
            non_selected_properties = set(
                schema.get('annotated-schema', {}).get('properties', {}).keys())

            if select_default_fields and self.is_custom_report(catalog['stream_name']):
                non_selected_properties = non_selected_properties.difference(
                    self.custom_report_minimum_valid_field_selection(catalog['stream_name']))

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], non_selected_properties)

def _select_streams_and_fields(self, conn_id, catalogs, select_default_fields,
                               select_pagination_fields):
    """Select all streams and all fields within streams"""
    for catalog in catalogs:
        schema_and_metadata = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        metadata = schema_and_metadata['metadata']
        properties = set(md['breadcrumb'][-1]
                         for md in metadata
                         if len(md['breadcrumb']) > 0 and md['breadcrumb'][0] == 'properties')

        # build the set of properties to leave un-selected; if neither flag is
        # set, deselect all properties so that none are selected
        if select_default_fields:
            non_selected_properties = properties.difference(
                self.expected_default_fields()[catalog['stream_name']])
        elif select_pagination_fields:
            non_selected_properties = properties.difference(
                self.expected_pagination_fields()[catalog['stream_name']])
        else:
            non_selected_properties = properties

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema_and_metadata, [], non_selected_properties)

def select_all_streams_and_fields(conn_id, catalogs, select_all_fields: bool = True):
    """Select all streams and all fields within streams"""
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        non_selected_properties = []
        if not select_all_fields:
            # get a list of all properties so that none are selected
            non_selected_properties = set(
                schema.get('annotated-schema', {}).get('properties', {}).keys())

            # HACK: This can be removed if the tap unwraps envelope
            # objects and declares replication keys as automatic
            if catalog["tap_stream_id"] == 'issues' and 'fields' in non_selected_properties:
                non_selected_properties.remove("fields")  # contains the replication key for issues
            elif catalog["tap_stream_id"] == "worklogs" and 'updated' in non_selected_properties:
                non_selected_properties.remove("updated")  # replication key for worklogs

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], non_selected_properties)

def select_found_catalogs(self,
                          conn_id,
                          catalogs,
                          only_streams=None,
                          deselect_all_fields: bool = False,
                          non_selected_props=[]):
    """Select all streams and all fields within streams"""
    for catalog in catalogs:
        if only_streams and catalog["stream_name"] not in only_streams:
            continue
        schema = menagerie.get_annotated_schema(conn_id, catalog["stream_id"])

        non_selected_properties = non_selected_props if not deselect_all_fields else []
        if deselect_all_fields:
            # get a list of all properties so that none are selected
            non_selected_properties = schema.get("annotated-schema", {}).get("properties", {})
            non_selected_properties = non_selected_properties.keys()

        additional_md = []
        connections.select_catalog_and_fields_via_metadata(
            conn_id,
            catalog,
            schema,
            additional_md=additional_md,
            non_selected_fields=non_selected_properties)

def select_all_streams_and_fields(self,
                                  conn_id,
                                  catalogs,
                                  select_all_fields: bool = True,
                                  exclude_streams=None):
    """Select all streams and all fields within streams"""
    for catalog in catalogs:
        if exclude_streams and catalog.get('stream_name') in exclude_streams:
            continue
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        non_selected_properties = []
        if not select_all_fields:
            # get a list of all properties so that none are selected
            non_selected_properties = schema.get('annotated-schema', {}).get('properties', {})
            # remove properties that are automatic
            for prop in self.expected_automatic_fields().get(catalog['stream_name'], []):
                if prop in non_selected_properties:
                    del non_selected_properties[prop]
            non_selected_properties = non_selected_properties.keys()

        additional_md = []
        connections.select_catalog_and_fields_via_metadata(
            conn_id,
            catalog,
            schema,
            additional_md=additional_md,
            non_selected_fields=non_selected_properties)

def test_organizations_dynamic_fields(self):
    """Run the tap in check mode and verify more than one page is returned for dynamic fields."""
    conn_id = connections.ensure_connection(self)

    # run and verify the tap in discovery mode
    found_catalog = self.run_and_verify_check_mode(conn_id)

    # Verify the number of dynamic fields in the organizations stream metadata
    # (need enough dynamic fields for organizations to force pagination)
    for catalog in found_catalog:
        if catalog['stream_name'] == "organizations":
            organization_fields_page_limit = 100
            schema_and_metadata = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
            schema_fields = schema_and_metadata.get('annotated-schema').get('properties').keys()
            organizations_dynamic_fields = [
                field for field in schema_fields
                if field not in self.organizations_static_fields()
            ]

            # verify the count of dynamic fields exceeds the page limit
            # for organization fields (pagination)
            self.assertGreater(len(organizations_dynamic_fields),
                               organization_fields_page_limit)

def do_test(self, conn_id): # Select our catalogs our_catalogs = [c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()] for c in our_catalogs: c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id']) c_metadata = metadata.to_map(c_annotated['metadata']) connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify actual rows were synced record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Ensure all records have a value for PK(s) records = runner.get_records_from_target_output() for stream in self.expected_sync_streams(): messages = records.get(stream,{}).get('messages',[]) if stream in ['tickets', 'groups', 'users']: self.assertGreater(len(messages), 100, msg="Stream {} has fewer than 100 records synced".format(stream)) for m in messages: pk_set = self.expected_pks()[stream] for pk in pk_set: self.assertIsNotNone(m.get('data', {}).get(pk), msg="Missing primary-key for message {}".format(m))
def set_replication_methods(self, conn_id, catalogs, replication_methods):
    replication_keys = self.expected_replication_keys()
    for catalog in catalogs:
        replication_method = replication_methods.get(catalog['stream_name'])

        if replication_method == self.INCREMENTAL:
            replication_key = list(replication_keys.get(catalog['stream_name']))[0]
            replication_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-key': replication_key,
                    "replication-method": replication_method,
                    "selected": True
                }
            }]
        else:
            replication_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-key': None,
                    "replication-method": "FULL_TABLE",
                    "selected": True
                }
            }]

        connections.set_non_discoverable_metadata(
            conn_id, catalog,
            menagerie.get_annotated_schema(conn_id, catalog['stream_id']),
            replication_md)

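# A minimal usage sketch (not from the source) showing how set_replication_methods
# might be invoked from a test; the stream names 'orders' and 'products' are
# hypothetical. Streams mapped to self.INCREMENTAL get their expected replication
# key written into the top-level metadata; any other value falls through to the
# FULL_TABLE branch above.
def _example_set_replication_methods(self, conn_id, found_catalogs):
    self.set_replication_methods(conn_id, found_catalogs, {
        'orders': self.INCREMENTAL,  # hypothetical incremental stream
        'products': 'FULL_TABLE',    # hypothetical full-table stream
    })
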
def select_all_streams_and_fields(conn_id, catalogs):
    """Select all streams and all fields within streams"""
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, schema)

def test_primary_keys(self):
    """
    Verify that the configuration can be used to set primary key fields when
      * the primary key is an empty list
      * the primary key is a single field
      * the primary key is a composite of multiple fields
    """
    found_catalogs = menagerie.get_catalogs(S3TypesAndData.conn_id)
    all_catalogs = [x for x in found_catalogs]

    for catalog in all_catalogs:
        with self.subTest(c=catalog):
            expected_key_properties = \
                S3TypesAndData.expected_pks()[catalog["stream_name"]]

            metadata_and_annotated_schema = menagerie.get_annotated_schema(
                S3TypesAndData.conn_id, catalog['stream_id'])

            # verify that expected_key_properties show as automatic in metadata
            metadata = metadata_and_annotated_schema["metadata"]
            actual_key_properties = {
                item.get("breadcrumb", ["", ""])[1]
                for item in metadata
                if item.get("metadata").get("inclusion") == "automatic"
            }
            self.assertEqual(actual_key_properties, expected_key_properties)

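# For reference, a sketch of the Singer-style metadata list the test above walks;
# the field names are hypothetical. Stream-level metadata hangs off an empty
# breadcrumb, field-level entries use ["properties", "<field_name>"], and key
# properties are marked with "inclusion": "automatic".
_example_metadata = [
    {"breadcrumb": [], "metadata": {"table-key-properties": ["id"], "selected": True}},
    {"breadcrumb": ["properties", "id"], "metadata": {"inclusion": "automatic"}},
    {"breadcrumb": ["properties", "name"], "metadata": {"inclusion": "available"}},
]
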
def run_test(self, only_automatic_fields=False):
    expected_streams = self.streams_to_select()
    conn_id = connections.ensure_connection(self)

    runner.run_check_mode(self, conn_id)

    expected_stream_fields = dict()
    found_catalogs = menagerie.get_catalogs(conn_id)
    for catalog in found_catalogs:
        stream_name = catalog['stream_name']
        catalog_entry = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        if stream_name not in expected_streams:
            continue

        # select catalog fields
        self.select_found_catalogs(
            conn_id, [catalog],
            only_streams=[stream_name],
            deselect_all_fields=only_automatic_fields,
            non_selected_props=[] if only_automatic_fields
            else self.non_selected_fields[stream_name])

        # add expected fields for assertion
        fields_from_field_level_md = [md_entry['breadcrumb'][1]
                                      for md_entry in catalog_entry['metadata']
                                      if md_entry['breadcrumb'] != []]
        if only_automatic_fields:
            expected_stream_fields[stream_name] = (
                self.expected_primary_keys()[stream_name]
                | self.expected_replication_keys()[stream_name])
        else:
            expected_stream_fields[stream_name] = (
                set(fields_from_field_level_md) - set(self.non_selected_fields[stream_name]))

    self.run_and_verify_sync(conn_id)

    synced_records = runner.get_records_from_target_output()
    for stream in expected_streams:
        with self.subTest(stream=stream):
            expected_primary_keys = self.expected_primary_keys()[stream]

            # get expected keys
            expected_keys = expected_stream_fields[stream]

            # collect all actual values
            messages = synced_records.get(stream)

            # collect actual synced fields
            actual_keys = [set(message['data'].keys())
                           for message in messages['messages']
                           if message['action'] == 'upsert'][0]

            fields = self.fields_to_remove.get(stream) or []
            expected_keys = expected_keys - set(fields)

            # verify expected and actual fields
            self.assertEqual(expected_keys, actual_keys,
                             msg='Selected keys in catalog are not as expected')

            # Verify we did not duplicate any records across pages
            records_pks_set = {
                tuple(message.get('data').get(primary_key)
                      for primary_key in expected_primary_keys)
                for message in messages.get('messages')}
            records_pks_list = [
                tuple(message.get('data').get(primary_key)
                      for primary_key in expected_primary_keys)
                for message in messages.get('messages')]
            self.assertCountEqual(records_pks_set, records_pks_list,
                                  msg="We have duplicate records for {}".format(stream))

def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # select all catalogs, verifying that primary keys are automatic
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        if c['stream_name'] in self.expected_sync_streams().keys():
            stream = c['stream_name']
            pks = self.expected_sync_streams()[stream]

            for pk in pks:
                mdata = next((m for m in catalog_entry['metadata']
                              if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk),
                             None)
                print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

            connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    first_record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, set(self.expected_sync_streams().keys()),
        self.expected_sync_streams())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  first_record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(first_record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Verify that automatic fields are all emitted with records
    synced_records = runner.get_records_from_target_output()
    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        self.assertGreater(len(record_messages), 0,
                           msg="stream {} did not sync any records.".format(stream_name))
        for record_keys in record_messages:
            self.assertEqual(
                self.expected_sync_streams().get(stream_name, set()) - record_keys, set())

def perform_and_verify_table_and_field_selection(
        self,  # TODO clean this up and select_all_streams_and_fields
        conn_id,
        test_catalogs,
        select_all_fields=True,
        select_default_fields=False):
    """
    Perform table and field selection based off of the streams to select
    set and field selection parameters.

    Verify this results in the expected streams selected and all or no
    fields selected for those streams.
    """
    # Select all available fields or select no fields from all testable streams
    self.select_all_streams_and_fields(
        conn_id=conn_id,
        catalogs=test_catalogs,
        select_all_fields=select_all_fields,
        select_default_fields=select_default_fields)

    catalogs = menagerie.get_catalogs(conn_id)

    # Ensure our selection affects the catalog
    expected_selected = [tc.get('stream_name') for tc in test_catalogs]
    for cat in catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])

        # Verify all testable streams are selected
        selected = catalog_entry.get('annotated-schema').get('selected')
        print("Validating selection on {}: {}".format(cat['stream_name'], selected))
        if cat['stream_name'] not in expected_selected:
            self.assertFalse(selected, msg="Stream selected, but not testable.")
            continue  # Skip remaining assertions if we aren't selecting this stream
        self.assertTrue(selected, msg="Stream not selected.")

        if select_all_fields:
            # Verify all fields within each selected stream are selected
            for field, field_props in catalog_entry.get(
                    'annotated-schema').get('properties').items():
                field_selected = field_props.get('selected')
                print("\tValidating selection on {}.{}: {}".format(
                    cat['stream_name'], field, field_selected))
                self.assertTrue(field_selected, msg="Field not selected.")
        else:
            if not self.is_custom_report(cat['stream_name']):
                # Verify only automatic fields are selected
                expected_automatic_fields = self.expected_automatic_fields().get(
                    cat['stream_name'])
                selected_fields = self.get_selected_fields_from_metadata(
                    catalog_entry['metadata'])
                self.assertEqual(expected_automatic_fields, selected_fields)

def test_catalog_without_properties(self):
    self.setUpTestEnvironment()

    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(len(found_catalogs), 1,
                     msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_streams().issubset(found_catalog_names)
    self.assertTrue(subset,
                    msg="Expected check streams are not subset of discovered catalog")

    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_streams()]

    # Select our catalogs
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(self.conn_id, c['stream_id'])
        connections.select_catalog_and_fields_via_metadata(self.conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    synced_records = runner.get_records_from_target_output()
    upsert_messages = [m for m in synced_records.get('csv_with_empty_lines').get('messages')
                       if m['action'] == 'upsert']

    records = [message.get('data') for message in upsert_messages]

    # Empty lines should be ignored in emitted records.
    expected_records = [
        {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}],
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
         '_sdc_source_lineno': 2},
        {'id': 2, 'name': 'Bob',
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
         '_sdc_source_lineno': 3},
        {'id': 3,
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
         '_sdc_source_lineno': 4},
        {'id': 4, 'name': 'Alice',
         '_sdc_extra': [{'no_headers': ['Ben', '5']}, {'name': 'Barak'}],
         '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
         '_sdc_source_lineno': 5},
    ]

    self.assertListEqual(expected_records, records)

def test_run(self):
    conn_id = connections.ensure_connection(self)
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select all catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog,
            menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    future_time = "2050-01-01T00:00:00.000000Z"

    # set state with bookmarks in the future
    future_bookmarks = {
        "currently_syncing": None,
        "bookmarks": {
            "contacts": {"offset": {}, "versionTimestamp": future_time},
            "subscription_changes": {"startTimestamp": future_time, "offset": {}},
            "campaigns": {"offset": {}},
            "forms": {"updatedAt": future_time},
            "deals": {"offset": {}, "hs_lastmodifieddate": future_time},
            "workflows": {"updatedAt": future_time},
            "owners": {"updatedAt": future_time},
            "contact_lists": {"updatedAt": future_time, "offset": {}},
            "email_events": {"startTimestamp": future_time, "offset": {}},
            "companies": {"offset": {}, "hs_lastmodifieddate": future_time},
            "engagements": {"lastUpdated": future_time, "offset": {}}
        }
    }

    menagerie.set_state(conn_id, future_bookmarks)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    # because the bookmarks were set into the future, we should NOT actually replicate
    # any data, minus campaigns and deal_pipelines because those endpoints do NOT
    # support bookmarks
    streams_with_bookmarks = self.expected_sync_streams()
    streams_with_bookmarks.remove('campaigns')
    streams_with_bookmarks.remove('deal_pipelines')
    bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
    self.assertEqual(len(bad_streams), 0,
                     msg="still pulled down records from {} despite future bookmarks".format(bad_streams))

    state = menagerie.get_state(conn_id)

    # NB: Companies and engagements won't set a bookmark in the future.
    state["bookmarks"].pop("companies")
    state["bookmarks"].pop("engagements")
    future_bookmarks["bookmarks"].pop("companies")
    future_bookmarks["bookmarks"].pop("engagements")

    self.assertEqual(state, future_bookmarks,
                     msg="state should not have been modified because we didn't replicate any data")
    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())

def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    all_excluded_fields = {}
    # select all catalogs
    for c in found_catalogs:
        if c['stream_name'] == 'ads':
            continue

        discovered_schema = menagerie.get_annotated_schema(
            conn_id, c['stream_id'])['annotated-schema']
        all_excluded_fields[c['stream_name']] = list(
            set(discovered_schema.keys()) -
            self.expected_automatic_fields().get(c['stream_name'], set()))[:5]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, discovered_schema,
            non_selected_fields=all_excluded_fields[c['stream_name']])

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()
    self.assertTrue('ads' not in synced_records.keys())

    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        for record_keys in record_messages:
            # The intersection should be empty
            self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))

def test_run(self):
    conn_id = connections.ensure_connection(self, payload_hook=None)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(subset,
                    msg="Expected check streams are not subset of discovered catalog")

    # Select some catalogs
    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, schema, [], [])

    # Verify that all streams sync at least one row for initial sync.
    # This test is also verifying access token expiration handling. If the test
    # fails with an authentication error, the refresh token was not replaced
    # after expiring.
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    zero_count_streams = {k for k, v in record_count_by_stream.items() if v == 0}
    self.assertFalse(zero_count_streams,
                     msg="The following streams did not sync any rows {}".format(zero_count_streams))

def test_run(self):
    conn_id = connections.ensure_connection(self)
    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # Select only the expected streams' tables
    expected_streams = self.expected_streams()
    catalog_entries = [ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams]
    self.select_all_streams_and_fields(conn_id, catalog_entries, select_all_fields=False)

    # Verify our selection worked as expected
    catalogs_selection = menagerie.get_catalogs(conn_id)
    for cat in catalogs_selection:
        catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])

        # Verify the expected stream tables are selected
        selected = catalog_entry.get('annotated-schema').get('selected')
        print("Validating selection on {}: {}".format(cat['stream_name'], selected))
        if cat['stream_name'] not in expected_streams:
            self.assertFalse(selected, msg="Stream selected, but not testable.")
            continue  # Skip remaining assertions if we aren't selecting this stream
        self.assertTrue(selected, msg="Stream not selected.")

        # Verify only automatic fields are selected
        expected_automatic_fields = self.expected_automatic_fields().get(cat['tap_stream_id'])
        selected_fields = self.get_selected_fields_from_metadata(catalog_entry['metadata'])
        self.assertEqual(expected_automatic_fields, selected_fields,
                         msg='for stream {}, expected: {} actual: {}'.format(
                             cat['stream_name'], expected_automatic_fields, selected_fields))

    # Run a sync job using orchestrator
    sync_record_count = self.run_and_verify_sync(conn_id)
    synced_records = runner.get_records_from_target_output()

    # Assert the records for each stream
    for stream in self.expected_streams():
        with self.subTest(stream=stream):
            data = synced_records.get(stream)
            if not data:
                print('WARNING: Add data for {}'.format(stream))
                continue
            record_messages_keys = [set(row['data'].keys()) for row in data['messages']]
            expected_keys = self.expected_automatic_fields().get(stream)

            # Verify that only the automatic fields are sent to the target
            for actual_keys in record_messages_keys:
                self.assertEqual(actual_keys.symmetric_difference(expected_keys), set(),
                                 msg="Expected automatic fields and nothing else.")

            # Verify the sync meets or exceeds the default record count
            record_count = sync_record_count.get(stream, 0)
            self.assertLessEqual(1, record_count)

def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0,
                     msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating that the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

def test_run(self):
    conn_id = connections.ensure_connection(self)

    # Run the tap in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # Verify the check's exit status
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # Verify that there are catalogs found
    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_check_streams().issubset(found_catalog_names)
    self.assertTrue(subset,
                    msg="Expected check streams are not subset of discovered catalog")

    # Select some catalogs
    our_catalogs = [c for c in found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for catalog in our_catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, schema)

    # Clear state and run sync
    menagerie.set_state(conn_id, {})
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

def perform_and_verify_table_and_field_selection(self, conn_id, test_catalogs,
                                                 select_all_fields=True):
    """
    Perform table and field selection based off of the streams to select
    set and field selection parameters.

    Verify this results in the expected streams selected and all or no
    fields selected for those streams.
    """
    # Select all available fields or select no fields from all testable streams
    self.select_all_streams_and_fields(
        conn_id=conn_id,
        catalogs=test_catalogs,
        select_all_fields=select_all_fields
    )

    catalogs = menagerie.get_catalogs(conn_id)

    # Ensure our selection affects the catalog
    expected_selected = [tc.get('stream_name') for tc in test_catalogs]
    for cat in catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])

        # Verify all testable streams are selected
        top_level_md = [md_entry for md_entry in catalog_entry['metadata']
                        if md_entry['breadcrumb'] == []]
        selected = top_level_md[0]['metadata'].get('selected')
        print("Validating selection on {}: {}".format(cat['stream_name'], selected))
        if cat['stream_name'] not in expected_selected:
            self.assertFalse(selected, msg="Stream selected, but not testable.")
            continue  # Skip remaining assertions if we aren't selecting this stream
        self.assertTrue(selected, msg="Stream not selected.")

        if select_all_fields:
            # Verify all fields within each selected stream are selected
            field_level_md = [md_entry for md_entry in catalog_entry['metadata']
                              if md_entry['breadcrumb'] != []]
            for field_md in field_level_md:
                field = field_md['breadcrumb'][1]
                field_selected = field_md['metadata'].get('selected')
                print("\tValidating selection on {}.{}: {}".format(
                    cat['stream_name'], field, field_selected))
                self.assertTrue(field_selected, msg="Field not selected.")
        else:
            # Verify only automatic fields are selected
            expected_automatic_fields = self.expected_automatic_fields().get(cat['stream_name'])
            selected_fields = self.get_selected_fields_from_metadata(catalog_entry['metadata'])

            # BUG TDL-14241 | Replication keys are not automatic
            if cat['stream_name'] == "file_metadata":
                expected_automatic_fields.remove('modifiedTime')

            self.assertEqual(expected_automatic_fields, selected_fields)

def perform_and_verify_adjusted_selection(self, conn_id, test_catalogs,
                                          select_all_fields, specific_fields):
    """
    Perform table and field selection based off of the streams to select
    set and field selection parameters.

    Verify this results in the expected streams selected and all or no
    fields selected for those streams.
    """
    # Select specific fields from all testable streams
    self.select_specific_fields(conn_id=conn_id,
                                catalogs=test_catalogs,
                                select_all_fields=select_all_fields,
                                specific_fields=specific_fields)

    catalogs = menagerie.get_catalogs(conn_id)

    # Ensure our selection affects the catalog
    expected_selected = [tc.get('tap_stream_id') for tc in test_catalogs]
    for cat in catalogs:
        with self.subTest(cat=cat):
            catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])

            # Verify intended streams are selected
            selected = catalog_entry.get('annotated-schema').get('selected')
            print("Validating selection on {}: {}".format(cat['tap_stream_id'], selected))
            if cat['stream_name'] not in expected_selected:
                continue  # Skip remaining assertions if we aren't selecting this stream
            self.assertTrue(selected, msg="Stream not selected.")

            if select_all_fields:
                # Verify all fields within each selected stream are selected
                for field, field_props in catalog_entry.get(
                        'annotated-schema').get('properties').items():
                    field_selected = field_props.get('selected')
                    print("\tValidating selection on {}.{}: {}".format(
                        cat['stream_name'], field, field_selected))
                    self.assertTrue(field_selected, msg="Field not selected.")
            else:
                for field, field_props in catalog_entry.get(
                        'annotated-schema').get('properties').items():
                    field_selected = field_props.get('selected')
                    if field_selected:
                        print("\tValidating selection on {}.{}: {}".format(
                            cat['stream_name'], field, field_selected))

def test_data_type_sampling(self):
    """
    Verify that each data type can be sampled and determined correctly.

    A file for stream `test_data_types_no_coercion` was set up with one
    column for each test. Tests cover each data type:
      * integer
      * number
      * date-time
      * string

    Integers are tested for the boundary conditions of signed and unsigned
    big-ints. Strings are tested for length, including a null string and
    65536 chars. Numbers are tested for float and double representations at
    the borders, which are exponents for extremely large and small positive
    and negative numbers, plus zero; numbers are also tested for precision.
    Date-times are tested at the borders of allowed Python date-times in
    multiple formats, including just dates, just times, and date-times with
    and without timezone.

    The test below uses subtests so that each data type is tested and
    reported on individually.
    """
    found_catalogs = menagerie.get_catalogs(S3TypesAndData.conn_id)

    # only testing the data types streams for now; may want to test all of them
    # or add more tests for different things for other catalogs
    data_type_catalogs = [
        x for x in found_catalogs
        if x["stream_name"] in ("test_data_types_no_coercion", "test_switching_data_types")
    ]

    for data_type in DataTypes:
        for catalog in data_type_catalogs:
            with self.subTest(dt=(data_type, catalog)):
                # verify each data type is sampled correctly in the annotated-schema
                expected_properties = S3TypesAndData.expected_properties_for_data_types(
                    data_type, catalog['stream_name'])
                metadata_and_annotated_schema = menagerie.get_annotated_schema(
                    S3TypesAndData.conn_id, catalog['stream_id'])
                properties = metadata_and_annotated_schema["annotated-schema"]["properties"]
                actual_properties = {
                    k for k, v in properties.items()
                    if v.get(data_type.value[0]) == data_type.value[1]
                }
                self.assertEqual(expected_properties, actual_properties)

def select_all_streams_and_fields(conn_id, catalogs, select_all_fields: bool = True):
    """Select all streams and all fields within streams"""
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        non_selected_properties = []
        if not select_all_fields:
            # get a list of all properties so that none are selected
            non_selected_properties = schema.get('annotated-schema', {}).get(
                'properties', {}).keys()

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], non_selected_properties)

def test_run(self):
    conn_id = self.create_connection()

    # Select our catalogs
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        c_metadata = metadata.to_map(c_annotated['metadata'])
        connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for PK(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream).get('messages')
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="missing primary-key value in record: {}".format(m))

    bookmarks = menagerie.get_state(conn_id)['bookmarks']
    self.assertTrue('orders' in bookmarks)

def test_run(self):
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0,
                       msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    # selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
    # menagerie.post_annotated_catalogs(conn_id, selected_catalogs)
    for c in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, menagerie.get_annotated_schema(conn_id, c['stream_id']))

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # bookmarks for the 4 streams should be 2015-03-16
    states = menagerie.get_state(conn_id)["bookmarks"]
    end_date = self.get_properties()["end_date"].split()[0]
    for k, v in states.items():
        if "insights" in k:
            bm_date = v.get("date_start")
            self.assertEqual(end_date, bm_date)
    print("bookmarks match end_date of {}".format(end_date))

def select_specific_fields(conn_id, catalogs, select_all_fields: bool = True,
                           specific_fields: dict = {}):
    """Select all streams; select only the specified fields within each stream."""
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        # default to an empty set so the call below is safe when all fields
        # are selected
        non_selected_properties_adjusted = []
        if not select_all_fields:
            # get a list of all properties and remove the specified (measure) fields
            non_selected_properties = set(
                schema.get('annotated-schema', {}).get('properties', {}).keys())
            spec_fields = specific_fields.get(catalog['stream_name'], set())
            non_selected_properties_adjusted = non_selected_properties.difference(spec_fields)

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], non_selected_properties_adjusted)

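# A minimal usage sketch (not from the source) for select_specific_fields; the
# 'reports' stream and its field names are hypothetical. Every discovered
# property except the listed ones ends up in the non-selected set for that stream.
def _example_select_specific_fields(conn_id, found_catalogs):
    select_specific_fields(conn_id, found_catalogs,
                           select_all_fields=False,
                           specific_fields={'reports': {'id', 'date', 'clicks'}})
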
def do_test(self, conn_id):
    # Get the streams for organizations and users
    streams = [c for c in self.found_catalogs
               if c['stream_name'] in ['organizations', 'users']]

    # Build a list of tuples where the first element is the stream name minus its
    # trailing "s" (e.g. "organization") and the second is the annotated schema
    schemas = [(s['stream_name'][:-1],
                menagerie.get_annotated_schema(conn_id, s['stream_id']))
               for s in streams]

    # Loop over them
    for schema in schemas:
        properties = schema[1]['annotated-schema']['properties']

        # Ensure that "organization_fields" or "user_fields" are objects in the
        # annotated schema with their own set of properties
        self.assertIsNotNone(
            properties.get('{}_fields'.format(schema[0]), {}).get('properties'),
            msg='{}_fields not present in schema!'.format(schema[0]))

def select_found_catalogs(self, found_catalogs):
    # selected = [menagerie.select_catalog(self.conn_id, c) for c in found_catalogs]
    # menagerie.post_annotated_catalogs(self.conn_id, selected)
    for catalog in found_catalogs:
        schema = menagerie.get_annotated_schema(self.conn_id, catalog['stream_id'])

        non_selected_properties = []
        additional_md = []

        connections.select_catalog_and_fields_via_metadata(
            self.conn_id,
            catalog,
            schema,
            additional_md=additional_md,
            non_selected_fields=non_selected_properties)