def select_all_streams_and_fields(conn_id, catalogs, select_all_fields: bool = True):
    """Select every stream; with select_all_fields False, deselect every
    discovered property except the stream's replication key."""
    # HACK: This mapping can be removed if the tap unwraps envelope
    # objects and declares replication keys as automatic.
    replication_key_by_stream = {
        'issues': 'fields',      # This contains replication key for issues
        'worklogs': 'updated',   # Replication key for worklogs
    }
    for entry in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, entry['stream_id'])
        deselected = []
        if not select_all_fields:
            # get a list of all properties so that none are selected
            deselected = set(
                annotated.get('annotated-schema', {}).get('properties', {}).keys())
            # keep the replication key selected where one applies
            deselected.discard(replication_key_by_stream.get(entry["tap_stream_id"]))
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, annotated, [], deselected)
def select_all_streams_and_fields(self, conn_id, catalogs, select_all_fields: bool = True, select_default_fields: bool = False):
    """Select every stream; optionally deselect all fields, keeping the
    minimum valid field selection for custom-report streams when asked."""
    for entry in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, entry['stream_id'])
        if select_all_fields:
            to_deselect = []
        else:
            # deselect every discovered property ...
            to_deselect = set(
                annotated.get('annotated-schema', {}).get('properties', {}).keys())
            stream_name = entry['stream_name']
            if select_default_fields and self.is_custom_report(stream_name):
                # ... except the minimum valid selection for custom reports
                to_deselect -= set(
                    self.custom_report_minimum_valid_field_selection(stream_name))
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, annotated, [], to_deselect)
def select_found_catalogs(self, conn_id, found_catalogs, only_streams=None):
    """Select the discovered catalogs (optionally limited to only_streams)
    together with all of their fields."""
    wanted = [c for c in found_catalogs
              if not only_streams or c["tap_stream_id"] in only_streams]
    entries = []
    for catalog in wanted:
        schema = menagerie.select_catalog(conn_id, catalog)
        entries.append({
            "key_properties": catalog.get("key_properties"),
            "schema": schema,
            "tap_stream_id": catalog.get("tap_stream_id"),
            "replication_method": catalog.get("replication_method"),
            "replication_key": catalog.get("replication_key"),
        })
    for entry in entries:
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, {"annotated-schema": entry['schema']})
def _select_streams_and_fields(self, conn_id, catalogs, select_default_fields, select_pagination_fields):
    """Select every stream, keeping only its default fields, only its
    pagination fields, or no fields at all."""
    for catalog in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        # field-level metadata entries carry breadcrumbs of the form
        # ['properties', <field-name>]; collect the discovered field names
        discovered_fields = {
            entry['breadcrumb'][-1]
            for entry in annotated['metadata']
            if entry['breadcrumb'] and entry['breadcrumb'][0] == 'properties'
        }
        stream_name = catalog['stream_name']
        if select_default_fields:
            keep = self.expected_default_fields()[stream_name]
        elif select_pagination_fields:
            keep = self.expected_pagination_fields()[stream_name]
        else:
            keep = set()
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, annotated, [], discovered_fields - set(keep))
def select_found_catalogs(self, conn_id, catalogs, only_streams=None, deselect_all_fields: bool = False, non_selected_props=None):
    """Select streams (optionally limited to only_streams) and their fields.

    Args:
        conn_id: tap-tester connection id.
        catalogs: discovered catalog entries.
        only_streams: optional collection of stream names to restrict selection to.
        deselect_all_fields: when True, deselect every discovered property.
        non_selected_props: explicit properties to deselect when
            deselect_all_fields is False (defaults to none).
    """
    # Fix: the default used to be a mutable [] shared across calls;
    # use None and create a fresh list per invocation.
    if non_selected_props is None:
        non_selected_props = []
    for catalog in catalogs:
        if only_streams and catalog["stream_name"] not in only_streams:
            continue
        schema = menagerie.get_annotated_schema(conn_id, catalog["stream_id"])
        if deselect_all_fields:
            # get a list of all properties so that none are selected
            non_selected_properties = schema.get(
                "annotated-schema", {}).get("properties", {}).keys()
        else:
            non_selected_properties = non_selected_props
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema,
            additional_md=[], non_selected_fields=non_selected_properties)
def do_test(self, conn_id):
    """Select the expected streams, run a sync, and verify exit codes,
    replicated row counts, per-stream volume, and primary-key presence.

    Args:
        conn_id: tap-tester connection id.
    """
    # Select our catalogs (only the streams this test expects to sync)
    our_catalogs = [c for c in self.found_catalogs
                    if c.get('tap_stream_id') in self.expected_sync_streams()]
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        # Fix: removed unused local `c_metadata = metadata.to_map(...)`
        connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # Verify actual rows were synced (sum replaces the reduce(lambda ...) idiom)
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = sum(record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0,
                       msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Ensure all records have a value for PK(s)
    records = runner.get_records_from_target_output()
    for stream in self.expected_sync_streams():
        messages = records.get(stream, {}).get('messages', [])
        if stream in ['tickets', 'groups', 'users']:
            # these streams are expected to carry more than 100 records
            self.assertGreater(len(messages), 100,
                               msg="Stream {} has fewer than 100 records synced".format(stream))
        for m in messages:
            pk_set = self.expected_pks()[stream]
            for pk in pk_set:
                self.assertIsNotNone(m.get('data', {}).get(pk),
                                     msg="Missing primary-key for message {}".format(m))
def select_all_streams_and_fields(self, conn_id, catalogs, select_all_fields: bool = True, exclude_streams=None):
    """Select streams (minus any excluded ones); select either all fields
    or only the automatic ones."""
    for catalog in catalogs:
        stream_name = catalog.get('stream_name')
        if exclude_streams and stream_name in exclude_streams:
            continue
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        if select_all_fields:
            to_deselect = []
        else:
            discovered = schema.get('annotated-schema', {}).get('properties', {})
            # remove properties that are automatic
            # NOTE: this intentionally mutates the nested schema dict in place,
            # matching the original behavior — the pruned schema is what gets
            # passed to the selection call below.
            for prop in self.expected_automatic_fields().get(stream_name, []):
                discovered.pop(prop, None)
            to_deselect = discovered.keys()
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema,
            additional_md=[], non_selected_fields=to_deselect)
def select_all_streams_and_fields(conn_id, catalogs):
    """Select every discovered stream along with all of its fields."""
    for entry in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, entry['stream_id'])
        connections.select_catalog_and_fields_via_metadata(conn_id, entry, annotated)
def test_run(self):
    """Seed every stream's bookmark far into the future, sync, and verify
    that no bookmarked stream replicates data and state is left unchanged."""
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # select all catalogs
    for catalog in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    future_time = "2050-01-01T00:00:00.000000Z"

    # seed state with bookmarks in the far future so nothing should replicate
    future_bookmarks = {"currently_syncing" : None,
                        "bookmarks": {"contacts" : {"offset" : {}, "versionTimestamp" : future_time},
                                      "subscription_changes" : {"startTimestamp" : future_time, "offset" : {}},
                                      "campaigns" : {"offset" : {}},
                                      "forms" : {"updatedAt" : future_time},
                                      "deals" : {"offset" : {}, "hs_lastmodifieddate" : future_time},
                                      "workflows" : {"updatedAt" : future_time},
                                      "owners" : {"updatedAt" : future_time},
                                      "contact_lists" : {"updatedAt" : future_time, "offset" : {}},
                                      "email_events" : {"startTimestamp" : future_time, "offset" : {}},
                                      "companies" : {"offset" : {}, "hs_lastmodifieddate" : future_time},
                                      "engagements" : {"lastUpdated" : future_time, "offset" : {}}}}

    menagerie.set_state(conn_id, future_bookmarks)

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    # because the bookmarks were set into the future, we should NOT actually replicate any data.
    # minus campaigns, and deal_pipelines because those endpoints do NOT suppport bookmarks
    streams_with_bookmarks = self.expected_sync_streams()
    streams_with_bookmarks.remove('campaigns')
    streams_with_bookmarks.remove('deal_pipelines')
    bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
    self.assertEqual(len(bad_streams), 0, msg="still pulled down records from {} despite future bookmarks".format(bad_streams))

    state = menagerie.get_state(conn_id)

    # NB: Companies and engagements won't set a bookmark in the future.
    state["bookmarks"].pop("companies")
    state["bookmarks"].pop("engagements")
    future_bookmarks["bookmarks"].pop("companies")
    future_bookmarks["bookmarks"].pop("engagements")

    self.assertEqual(state, future_bookmarks, msg="state should not have been modified because we didn't replicate any data")

    bookmarks = state.get('bookmarks')
    # NOTE(review): these two locals are not used afterwards — possibly leftover
    bookmark_streams = set(state.get('bookmarks').keys())
def test_catalog_without_properties(self):
    """Sync a CSV containing empty lines and verify the empty lines are
    skipped while every real row is emitted with the expected data."""
    self.setUpTestEnvironment()

    runner.run_check_job_and_check_status(self)

    found_catalogs = menagerie.get_catalogs(self.conn_id)
    self.assertEqual(len(found_catalogs), 1,
                     msg="unable to locate schemas for connection {}".format(self.conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    subset = self.expected_streams().issubset(found_catalog_names)
    self.assertTrue(
        subset, msg="Expected check streams are not subset of discovered catalog")

    our_catalogs = [c for c in found_catalogs if c.get(
        'tap_stream_id') in self.expected_streams()]

    # Select our catalogs
    for c in our_catalogs:
        c_annotated = menagerie.get_annotated_schema(
            self.conn_id, c['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, c, c_annotated, [], [])

    # Clear state before our run
    menagerie.set_state(self.conn_id, {})

    # Run a sync job using orchestrator
    sync_job_name = runner.run_sync_mode(self, self.conn_id)

    # Verify tap and target exit codes
    exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    synced_records = runner.get_records_from_target_output()
    upsert_messages = [m for m in synced_records.get(
        'csv_with_empty_lines').get('messages') if m['action'] == 'upsert']

    records = [message.get('data') for message in upsert_messages]

    # Empty line should be ignored in emitted records. `_sdc_extra` holds
    # spill-over values beyond the header row (see rows 2 and 5 below).
    expected_records = [
        {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 2},
        {'id': 2, 'name': 'Bob', '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 3},
        {'id': 3, '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 4},
        {'id': 4, 'name': 'Alice', '_sdc_extra': [{'no_headers': ['Ben', '5']}, {
            'name': 'Barak'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
         '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 5}
    ]

    self.assertListEqual(expected_records, records)
def test_run(self):
    """Verify discovery matches expectations, primary keys are marked
    automatic in metadata, and every synced record carries its automatic
    fields."""
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

    diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are OK")

    # select all catalogs
    for c in found_catalogs:
        catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])

        if c['stream_name'] in self.expected_sync_streams().keys():
            stream = c['stream_name']
            pks = self.expected_sync_streams()[stream]

            for pk in pks:
                # each primary key must be marked inclusion == 'automatic'
                # (field-level metadata breadcrumbs are ['properties', <field>])
                mdata = next((m for m in catalog_entry['metadata']
                              if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None)
                print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

        connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams())
    replicated_row_count = reduce(lambda accum,c : accum + c, first_record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # Verify that automatic fields are all emitted with records
    synced_records = runner.get_records_from_target_output()
    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name))

        for record_keys in record_messages:
            # every expected field for the stream must be present in each record
            self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
def test_run(self):
    """Deselect a handful of fields per stream (skipping 'ads' entirely),
    sync, and verify excluded fields never appear in emitted records."""
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

    diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    all_excluded_fields = {}

    # select all catalogs
    for c in found_catalogs:
        if c['stream_name'] == 'ads':
            continue

        discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema']
        # exclude up to five non-automatic fields per stream
        # NOTE(review): this takes keys of the annotated-schema dict itself rather
        # than of its 'properties' — confirm these keys are actual field names
        all_excluded_fields[c['stream_name']] = list(set(discovered_schema.keys()) - self.expected_automatic_fields().get(c['stream_name'], set()))[:5]

        connections.select_catalog_and_fields_via_metadata(
            conn_id, c, discovered_schema, non_selected_fields=all_excluded_fields[c['stream_name']])

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # This should be validating the the PKs are written in each record
    record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    synced_records = runner.get_records_from_target_output()

    # 'ads' was never selected, so it must not appear in the output
    self.assertTrue('ads' not in synced_records.keys())

    for stream_name, data in synced_records.items():
        record_messages = [set(row['data'].keys()) for row in data['messages']]
        for record_keys in record_messages:
            # The intersection should be empty
            self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
def test_run(self): conn_id = connections.ensure_connection(self, payload_hook=None) # Run the tap in check mode check_job_name = runner.run_check_mode(self, conn_id) # Verify the check's exit status exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Verify that there are catalogs found found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_check_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") # # # Select some catalogs our_catalogs = [ c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for catalog in our_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) connections.select_catalog_and_fields_via_metadata( conn_id, catalog, schema, [], []) # # Verify that all streams sync at least one row for initial sync # # This test is also verifying access token expiration handling. If test fails with # # authentication error, refresh token was not replaced after expiring. menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) zero_count_streams = { k for k, v in record_count_by_stream.items() if v == 0 } self.assertFalse( zero_count_streams, msg="The following streams did not sync any rows {}".format( zero_count_streams))
def test_run(self): conn_id = connections.ensure_connection(self) # Run the tap in check mode check_job_name = runner.run_check_mode(self, conn_id) # Verify the check's exit status exit_status = menagerie.get_exit_status(conn_id, check_job_name) menagerie.verify_check_exit_status(self, exit_status, check_job_name) # Verify that there are catalogs found found_catalogs = menagerie.get_catalogs(conn_id) self.assertGreater( len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id)) found_catalog_names = set( map(lambda c: c['tap_stream_id'], found_catalogs)) subset = self.expected_check_streams().issubset(found_catalog_names) self.assertTrue( subset, msg="Expected check streams are not subset of discovered catalog") # Select some catalogs our_catalogs = [ c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for catalog in our_catalogs: schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id']) connections.select_catalog_and_fields_via_metadata( conn_id, catalog, schema) # Clear State and run sync menagerie.set_state(conn_id, {}) sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify rows were synced record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count))
def select_all_streams_and_fields(conn_id, catalogs, select_all_fields: bool = True):
    """Select every stream; deselect all of its discovered properties when
    select_all_fields is False."""
    for entry in catalogs:
        annotated = menagerie.get_annotated_schema(conn_id, entry['stream_id'])
        if select_all_fields:
            deselected = []
        else:
            # deselect every discovered property
            deselected = annotated.get('annotated-schema', {}).get(
                'properties', {}).keys()
        connections.select_catalog_and_fields_via_metadata(
            conn_id, entry, annotated, [], deselected)
def test_run(self): conn_id = self.create_connection() # Select our catalogs our_catalogs = [ c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams() ] for c in our_catalogs: c_annotated = menagerie.get_annotated_schema( conn_id, c['stream_id']) c_metadata = metadata.to_map(c_annotated['metadata']) connections.select_catalog_and_fields_via_metadata( conn_id, c, c_annotated, [], []) # Clear state before our run menagerie.set_state(conn_id, {}) # Run a sync job using orchestrator sync_job_name = runner.run_sync_mode(self, conn_id) # Verify tap and target exit codes exit_status = menagerie.get_exit_status(conn_id, sync_job_name) menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) # Verify actual rows were synced record_count_by_stream = runner.examine_target_output_file( self, conn_id, self.expected_sync_streams(), self.expected_pks()) replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values()) self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format( record_count_by_stream)) print("total replicated row count: {}".format(replicated_row_count)) # Ensure all records have a value for PK(s) records = runner.get_records_from_target_output() for stream in self.expected_sync_streams(): messages = records.get(stream).get('messages') for m in messages: pk_set = self.expected_pks()[stream] for pk in pk_set: self.assertIsNotNone(m.get('data', {}).get(pk), msg="oh no! {}".format(m)) bookmarks = menagerie.get_state(conn_id)['bookmarks'] self.assertTrue('orders' in bookmarks)
def test_run(self):
    """Sync all streams and verify that each insights stream's bookmark
    (date_start) lands on the configured end_date."""
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    found_catalogs = menagerie.get_catalogs(conn_id)
    self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

    diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
    self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
    print("discovered schemas are kosher")

    # select all catalogs
    #selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
    #menagerie.post_annotated_catalogs(conn_id, selected_catalogs)
    for c in found_catalogs:
        connections.select_catalog_and_fields_via_metadata(conn_id, c, menagerie.get_annotated_schema(conn_id, c['stream_id']))

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum,c : accum + c, record_count_by_stream.values())
    self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    # bookmarks for the insights streams should equal the configured end_date
    states = menagerie.get_state(conn_id)["bookmarks"]
    end_date = self.get_properties()["end_date"].split()[0]
    for k, v in states.items():
        if "insights" in k:
            bm_date = v.get("date_start")
            self.assertEqual(end_date, bm_date)
    print("bookmarks match end_date of {}".format(end_date))
def select_found_catalogs(self, found_catalogs):
    """Select each discovered catalog with every field kept selected."""
    for entry in found_catalogs:
        annotated = menagerie.get_annotated_schema(self.conn_id, entry['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            self.conn_id, entry, annotated,
            additional_md=[], non_selected_fields=[])
def select_specific_fields(conn_id, catalogs, select_all_fields: bool = True, specific_fields=None):
    """Select every stream; with select_all_fields False keep only the
    per-stream fields named in specific_fields selected.

    Args:
        conn_id: tap-tester connection id.
        catalogs: discovered catalog entries.
        select_all_fields: when True, no fields are deselected.
        specific_fields: mapping of stream_name -> fields to keep selected
            (defaults to no per-stream overrides).
    """
    # Fix: the default used to be a mutable {} shared across calls.
    if specific_fields is None:
        specific_fields = {}
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        non_selected_properties = []
        if not select_all_fields:
            # get a list of all properties and remove the requested fields
            non_selected_properties = set(schema.get('annotated-schema', {}).get(
                'properties', {}).keys())
            spec_fields = specific_fields.get(catalog['stream_name'], set())
            # Fix: compute the adjusted set only on this branch and fall back to
            # [] otherwise, so the all-fields path never references stale data.
            non_selected_properties = non_selected_properties.difference(spec_fields)

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, [], non_selected_properties)
def test_run(self):
    """Sync the expected streams and verify each stream's bookmark matches
    the max value seen in the records, offsets are cleared, and no
    unexpected bookmarks or currently_syncing remain in state."""
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # Select all Catalogs
    for catalog in found_catalogs:
        if catalog['tap_stream_id'] in self.expected_sync_streams():
            connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

    #clear state
    menagerie.set_state(conn_id, {})

    record_count_by_stream = self.run_and_verify_sync(conn_id)

    max_bookmarks_from_records = runner.get_most_recent_records_from_target(self, self.expected_bookmarks(), self.get_properties()['start_date'])

    # subscription_changes and email_events are expected to bookmark at the
    # start of the current UTC day rather than a record value
    start_of_today = utils.strftime(datetime.datetime(datetime.datetime.utcnow().year, datetime.datetime.utcnow().month, datetime.datetime.utcnow().day, 0, 0, 0, 0, datetime.timezone.utc))
    max_bookmarks_from_records['subscription_changes'] = start_of_today
    max_bookmarks_from_records['email_events'] = start_of_today

    #if we didn't replicate data, the bookmark should be the start_date
    for k in self.expected_bookmarks().keys():
        if max_bookmarks_from_records.get(k) is None:
            max_bookmarks_from_records[k] = utils.strftime(datetime.datetime(2017, 5, 1, 0, 0, 0, 0, datetime.timezone.utc))

    state = menagerie.get_state(conn_id)
    bookmarks = state.get('bookmarks')
    bookmark_streams = set(state.get('bookmarks').keys())

    #verify bookmarks and offsets
    for k,v in sorted(list(self.expected_bookmarks().items())):
        for w in v:
            bk_value = bookmarks.get(k,{}).get(w)
            self.assertEqual(utils.strptime_with_tz(bk_value),
                             utils.strptime_with_tz(max_bookmarks_from_records[k]),
                             "Bookmark {} ({}) for stream {} should have been updated to {}".format(bk_value, w, k, max_bookmarks_from_records[k]))
            print("bookmark {}({}) updated to {} from max record value {}".format(k, w, bk_value, max_bookmarks_from_records[k]))

    for k,v in self.expected_offsets().items():
        self.assertEqual(bookmarks.get(k,{}).get('offset', {}), v, msg="unexpected offset found for stream {} {}. state: {}".format(k, v, state))
        print("offsets {} cleared".format(k))

    # only the acceptable set of streams may have bookmarks in state
    diff = bookmark_streams.difference(self.acceptable_bookmarks())
    self.assertEqual(len(diff), 0, msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(diff, self.acceptable_bookmarks(), bookmarks))

    self.assertEqual(state.get('currently_syncing'), None,"Unexpected `currently_syncing` bookmark value: {} Expected: None".format(state.get('currently_syncing')))
def run_test(self):
    """Discover, verify table-key-properties metadata, select everything,
    sync, and verify per-stream row counts match expectations."""
    conn_id = connections.ensure_connection(self)

    # run in discovery mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    catalog = menagerie.get_catalogs(conn_id)
    found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

    # assert we find the correct streams
    self.assertEqual(self.expected_check_streams(), found_catalog_names)

    for tap_stream_id in self.expected_check_streams():
        found_stream = [
            c for c in catalog if c['tap_stream_id'] == tap_stream_id
        ][0]
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, found_stream['stream_id'])
        main_metadata = schema_and_metadata["metadata"]
        # table-level metadata entries have an empty breadcrumb
        stream_metadata = [
            mdata for mdata in main_metadata if mdata["breadcrumb"] == []
        ]

        # assert that the pks are correct
        self.assertEqual(
            self.expected_pks()[tap_stream_id],
            set(stream_metadata[0]['metadata']['table-key-properties']))

    for stream_catalog in catalog:
        annotated_schema = menagerie.get_annotated_schema(
            conn_id, stream_catalog['stream_id'])
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, stream_catalog, annotated_schema['annotated-schema'], [])

    # Run sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    messages_by_stream = runner.get_records_from_target_output()

    # assert that each of the streams that we synced are the ones that we expect to see
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_first_sync_streams(),
        self.expected_pks())

    # Verify that the full table was synced
    for tap_stream_id in self.expected_first_sync_streams():
        self.assertEqual(
            self.expected_first_sync_row_counts()[tap_stream_id],
            record_count_by_stream[tap_stream_id])
def select_specific_catalog(self, found_catalogs, catalog_to_select):
    """Select the first catalog whose tap_stream_id matches
    catalog_to_select, keeping all of its fields; do nothing on no match."""
    match = next((c for c in found_catalogs
                  if c['tap_stream_id'] == catalog_to_select), None)
    if match is None:
        return
    annotated = menagerie.get_annotated_schema(self.conn_id, match['stream_id'])
    connections.select_catalog_and_fields_via_metadata(
        self.conn_id, match, annotated,
        additional_md=[], non_selected_fields=[])
def test_run(self):
    """Sync all testable streams and compare the fields emitted in records
    against the schema, tolerating the KNOWN_MISSING_FIELDS per stream."""
    conn_id = connections.ensure_connection(self)

    found_catalogs = self.run_and_verify_check_mode(conn_id)

    # Select only the expected streams tables
    expected_streams = self.testable_streams()
    catalog_entries = [
        ce for ce in found_catalogs
        if ce['tap_stream_id'] in expected_streams
    ]
    for catalog_entry in catalog_entries:
        stream_schema = menagerie.get_annotated_schema(
            conn_id, catalog_entry['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog_entry, stream_schema)

    # Run sync
    first_record_count_by_stream = self.run_and_verify_sync(conn_id)
    replicated_row_count = sum(first_record_count_by_stream.values())
    synced_records = runner.get_records_from_target_output()

    # Test by Stream
    for stream in self.testable_streams():
        with self.subTest(stream=stream):
            expected_fields = set(
                synced_records.get(stream)['schema']['properties'].keys())
            print('Number of expected keys ', len(expected_fields))
            actual_fields = set(
                runner.examine_target_output_for_fields()[stream])
            print('Number of actual keys ', len(actual_fields))
            print('Number of known missing keys ',
                  len(KNOWN_MISSING_FIELDS[stream]))

            # NOTE(review): "new fields" here are known-missing fields that
            # actually showed up in the output — confirm the warning wording
            unexpected_fields = actual_fields & KNOWN_MISSING_FIELDS[stream]
            if unexpected_fields:
                print('WARNING: Found new fields: {}'.format(
                    unexpected_fields))
            # schema fields must equal emitted fields plus the known-missing set
            self.assertSetEqual(
                expected_fields, actual_fields | KNOWN_MISSING_FIELDS[stream])
def select_streams_and_fields(self, conn_id, catalog, select_all_fields: bool = False):
    """Select the stream with either every field or none, attaching
    table-level replication metadata for the configured default method."""
    annotated = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

    # Build the table-level (empty breadcrumb) replication metadata.
    method = self.default_replication_method
    if method is self.FULL_TABLE:
        replication_md = {"replication-method": self.FULL_TABLE}
    elif method is self.INCREMENTAL:
        replication_md = {
            "replication-method": self.INCREMENTAL,
            "replication-key": "our_integer"
        }
    else:
        replication_md = {"replication-method": self.LOG_BASED}
    additional_md = [{"breadcrumb": [], "metadata": replication_md}]

    if select_all_fields:
        deselected = []
    else:
        # deselect every discovered property
        deselected = annotated.get('annotated-schema', {}).get(
            'properties', {}).keys()

    connections.select_catalog_and_fields_via_metadata(
        conn_id, catalog, annotated, additional_md, deselected)
def test_run(self):
    """End-to-end FULL_TABLE replication test against Postgres.

    Performs three syncs on the same table:
      1. initial sync of the 3 seeded records (activate_version fencing both ends),
      2. a second sync with no db changes — same 3 records, new table version,
      3. a third sync after inserts/updates/deletes, verifying that rows deleted
         before the sync are NOT replicated under full-table replication.
    After every sync the bookmark is asserted to carry only a version (no lsn
    or replication-key state).
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify discovery produced (at least) 1 expected catalog
    found_catalogs = [
        found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
        if found_catalog['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(len(found_catalogs), 1)

    # verify the tap discovered the expected streams
    found_catalog_names = {
        catalog['tap_stream_id']
        for catalog in found_catalogs
    }
    self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(test_table_name, test_catalog['stream_name'])
    print("discovered streams are correct")

    # perform table selection
    print('selecting {} and all fields within the table'.format(
        test_table_name))
    schema_and_metadata = menagerie.get_annotated_schema(
        conn_id, test_catalog['stream_id'])
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'FULL_TABLE'
        }
    }]
    _ = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog, schema_and_metadata, additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run sync job 1 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_1 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    # (activate_version bracketing the upserts is the full-table pattern)
    self.assertEqual(5, len(messages))
    self.assertEqual('activate_version', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the persisted schema matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records match expectations
    self.assertDictEqual(self.expected_records[0], messages[1]['data'])
    self.assertDictEqual(self.expected_records[1], messages[2]['data'])
    self.assertDictEqual(self.expected_records[2], messages[3]['data'])
    print("records are correct")

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations:
    # full-table bookmarks carry only a table version — no lsn and no
    # replication-key state
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_1, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN and get the same 3 records
    #----------------------------------------------------------------------

    # run sync job 2 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_2 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    self.assertEqual(3, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    # (only a trailing activate_version on resyncs)
    self.assertEqual(4, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('activate_version', messages[3]['action'])

    # verify the new table version increased on the second sync
    self.assertGreater(table_version_2, table_version_1)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[0], messages[0]['data'])
    self.assertDictEqual(self.expected_records[1], messages[1]['data'])
    self.assertDictEqual(self.expected_records[2], messages[2]['data'])

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_2, bookmark['version'])

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN following various manipulations to the data
    #----------------------------------------------------------------------

    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

            # NB | We will perform the following actions prior to the next sync:
            #      [Action (EXPECTED RESULT)]
            #      Insert a record
            #      Insert a record to be updated prior to sync
            #      Insert a record to be deleted prior to sync (NOT REPLICATED)
            #      Update an existing record
            #      Update a newly inserted record
            #      Delete an existing record
            #      Delete a newly inserted record

            # inserting...
            # a new record
            nyc_tz = pytz.timezone('America/New_York')
            our_time_offset = "-04:00"
            our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(6, 6, 6)
            our_time_tz = our_time.isoformat() + our_time_offset
            our_date = datetime.date(1970, 7, 1)
            my_uuid = str(uuid.uuid1())
            # quote_ident is used for column names containing spaces
            self.inserted_records.append({
                'our_varchar': "our_varchar 2",
                'our_varchar_10': "varchar_10",
                'our_text': "some text 2",
                'our_integer': 44101,
                'our_smallint': 2,
                'our_bigint': 1000001,
                'our_decimal': decimal.Decimal('9876543210.02'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '1',
                'our_json': json.dumps({'nymn': 77}),
                'our_jsonb': json.dumps({'burgers': 'good++'}),
                'our_uuid': my_uuid,
                'our_citext': 'cyclops 2',
                'our_store': 'dances=>"floor",name=>"betty"',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': '$0.98789'
            })
            self.expected_records.append({
                'id': 4,
                'our_varchar': "our_varchar 2",
                'our_varchar_10': "varchar_10",
                'our_text': "some text 2",
                'our_integer': 44101,
                'our_smallint': 2,
                'our_bigint': 1000001,
                'our_decimal': decimal.Decimal('9876543210.02'),
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'OUR DATE': '1970-07-01T00:00:00+00:00',
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': True,
                'our_json': '{"nymn": 77}',
                'our_jsonb': '{"burgers": "good++"}',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_citext': self.inserted_records[-1]['our_citext'],
                'our_store': {
                    "name": "betty",
                    "dances": "floor"
                },
                'our_cidr': self.inserted_records[-1]['our_cidr'],
                'our_inet': self.inserted_records[-1]['our_inet'],
                'our_mac': self.inserted_records[-1]['our_mac'],
                'our_money': '$0.99',
                'our_alignment_enum': None,
            })

            # a new record which we will then update prior to sync
            our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(12, 11, 10)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1999, 9, 9)
            my_uuid = str(uuid.uuid1())
            self.inserted_records.append({
                'our_varchar': "our_varchar 4",
                'our_varchar_10': "varchar_3",
                'our_text': "some text 4",
                'our_integer': 55200,
                'our_smallint': 1,
                'our_bigint': 100000,
                'our_decimal': decimal.Decimal('1234567899.99'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '0',
                'our_json': json.dumps('some string'),
                'our_jsonb': json.dumps(['burgers are good']),
                'our_uuid': my_uuid,
                'our_store': 'size=>"small",name=>"betty"',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
            })
            self.expected_records.append({
                'our_decimal': decimal.Decimal('1234567899.99'),
                'our_text': 'some text 4',
                'our_bit': False,
                'our_integer': 55200,
                'our_double': decimal.Decimal('1.1'),
                'id': 5,
                'our_json': self.inserted_records[-1]['our_json'],
                'our_boolean': True,
                'our_jsonb': self.inserted_records[-1]['our_jsonb'],
                'our_bigint': 100000,
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'our_store': {
                    "name": "betty",
                    "size": "small"
                },
                'our_smallint': 1,
                'OUR DATE': '1999-09-09T00:00:00+00:00',
                'our_varchar': 'our_varchar 4',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_real': decimal.Decimal('1.2'),
                'our_varchar_10': 'varchar_3',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
                'our_alignment_enum': None,
            })

            # a new record to be deleted prior to sync
            our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
            nyc_tz = pytz.timezone('America/New_York')
            our_ts_tz = nyc_tz.localize(our_ts)
            our_time = datetime.time(12, 11, 10)
            our_time_tz = our_time.isoformat() + "-04:00"
            our_date = datetime.date(1999, 9, 9)
            my_uuid = str(uuid.uuid1())
            self.inserted_records.append({
                'our_varchar': "our_varchar 4",
                'our_varchar_10': "varchar_3",
                'our_text': "some text 4",
                'our_integer': 55200,
                'our_smallint': 1,
                'our_bigint': 100000,
                'our_decimal': decimal.Decimal('1234567899.99'),
                quote_ident('OUR TS', cur): our_ts,
                quote_ident('OUR TS TZ', cur): our_ts_tz,
                quote_ident('OUR TIME', cur): our_time,
                quote_ident('OUR TIME TZ', cur): our_time_tz,
                quote_ident('OUR DATE', cur): our_date,
                'our_double': decimal.Decimal('1.1'),
                'our_real': decimal.Decimal('1.2'),
                'our_boolean': True,
                'our_bit': '0',
                'our_json': json.dumps('some string'),
                'our_jsonb': json.dumps(['burgers are good']),
                'our_uuid': my_uuid,
                'our_store': 'size=>"small",name=>"betty"',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
            })
            self.expected_records.append({
                'our_decimal': decimal.Decimal('1234567899.99'),
                'our_text': 'some text 4',
                'our_bit': False,
                'our_integer': 55200,
                'our_double': decimal.Decimal('1.1'),
                'id': 6,
                'our_json': self.inserted_records[-1]['our_json'],
                'our_boolean': True,
                'our_jsonb': self.inserted_records[-1]['our_jsonb'],
                'our_bigint': 100000,
                'OUR TS': self.expected_ts(our_ts),
                'OUR TS TZ': self.expected_ts_tz(our_ts_tz),
                'OUR TIME': str(our_time),
                'OUR TIME TZ': str(our_time_tz),
                'our_store': {
                    "name": "betty",
                    "size": "small"
                },
                'our_smallint': 1,
                'OUR DATE': '1999-09-09T00:00:00+00:00',
                'our_varchar': 'our_varchar 4',
                'our_uuid': self.inserted_records[-1]['our_uuid'],
                'our_real': decimal.Decimal('1.2'),
                'our_varchar_10': 'varchar_3',
                'our_citext': 'cyclops 3',
                'our_cidr': '192.168.101.128/25',
                'our_inet': '192.168.101.128/24',
                'our_mac': '08:00:2b:01:02:04',
                'our_money': None,
                'our_alignment_enum': None,
            })

            db_utils.insert_record(cur, test_table_name,
                                   self.inserted_records[3])
            db_utils.insert_record(cur, test_table_name,
                                   self.inserted_records[4])
            db_utils.insert_record(cur, test_table_name,
                                   self.inserted_records[5])

            # updating ...
            # an existing record
            canon_table_name = db_utils.canonicalized_table_name(
                cur, test_schema_name, test_table_name)
            record_pk = 1
            our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            updated_data = {
                "OUR TS TZ": our_ts_tz,
                "our_double": decimal.Decimal("6.6"),
                "our_money": "$0.00"
            }
            # keep the expected record in step with the db update
            self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(
                our_ts_tz)
            self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
            self.expected_records[0]["our_money"] = "$0.00"
            db_utils.update_record(cur, canon_table_name, record_pk,
                                   updated_data)

            # a newly inserted record
            canon_table_name = db_utils.canonicalized_table_name(
                cur, test_schema_name, test_table_name)
            record_pk = 5
            our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
            our_ts_tz = nyc_tz.localize(our_ts)
            updated_data = {
                "OUR TS TZ": our_ts_tz,
                "our_double": decimal.Decimal("6.6"),
                "our_money": "$0.00"
            }
            self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz(
                our_ts_tz)
            self.expected_records[4]["our_double"] = decimal.Decimal("6.6")
            self.expected_records[4]["our_money"] = "$0.00"
            db_utils.update_record(cur, canon_table_name, record_pk,
                                   updated_data)

            # deleting
            # an existing record
            record_pk = 2
            db_utils.delete_record(cur, canon_table_name, record_pk)

            # a newly inserted record
            record_pk = 6
            db_utils.delete_record(cur, canon_table_name, record_pk)

    #----------------------------------------------------------------------
    # invoke the sync job AGAIN after various manipulations
    #----------------------------------------------------------------------

    # run sync job 3 and verify exit codes
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # get records
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    records_by_stream = runner.get_records_from_target_output()
    table_version_3 = records_by_stream[test_table_name]['table_version']
    messages = records_by_stream[test_table_name]['messages']

    # verify the expected number of records were replicated
    # (deleted rows are simply absent under full-table replication)
    self.assertEqual(4, record_count_by_stream[test_table_name])

    # verify the message actions match expectations
    self.assertEqual(5, len(messages))
    self.assertEqual('upsert', messages[0]['action'])
    self.assertEqual('upsert', messages[1]['action'])
    self.assertEqual('upsert', messages[2]['action'])
    self.assertEqual('upsert', messages[3]['action'])
    self.assertEqual('activate_version', messages[4]['action'])

    # verify the new table version increased on the second sync
    self.assertGreater(table_version_3, table_version_2)

    # verify the persisted schema still matches expectations
    self.assertEqual(expected_schemas[test_table_name],
                     records_by_stream[test_table_name]['schema'])

    # NB | This is a little tough to track mentally so here's a breakdown of
    #      the order of operations by expected records indexes:
    #      Prior to Sync 1
    #        insert 0, 1, 2
    #      Prior to Sync 2
    #        No db changes
    #      Prior to Sync 3
    #        insert 3, 4, 5
    #        update 0, 4
    #        delete 1, 5
    #      Resulting Synced Records: 2, 3, 0, 4

    # verify replicated records still match expectations
    self.assertDictEqual(self.expected_records[2],
                         messages[0]['data'])  # existing insert
    self.assertDictEqual(self.expected_records[3],
                         messages[1]['data'])  # new insert
    self.assertDictEqual(self.expected_records[0],
                         messages[2]['data'])  # existing update
    self.assertDictEqual(self.expected_records[4],
                         messages[3]['data'])  # new insert / update

    # grab bookmarked state
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][
        'dev-public-postgres_full_table_replication_test']

    # verify state and bookmarks meet expectations
    self.assertIsNone(state['currently_syncing'])
    self.assertIsNone(bookmark.get('lsn'))
    self.assertIsNone(bookmark.get('replication_key'))
    self.assertIsNone(bookmark.get('replication_key_value'))
    self.assertEqual(table_version_3, bookmark['version'])
def binlog_json_test(self):
    """LOG_BASED (binlog) replication test for a MySQL table with a JSON column.

    Flow: discovery -> select stream as LOG_BASED -> initial full-table sync
    (1 seeded record) -> empty binlog sync -> insert one very large JSON row
    -> binlog sync again, verifying the upsert arrives, the table version is
    stable, and the log_file/log_pos bookmark advances (allowing for binlog
    file rotation).
    """
    print("RUNNING {}\n\n".format(self.name()))

    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    expected_check_streams = {self.tap_stream_id()}
    expected_sync_streams = {self.table_name()}
    expected_pks = {self.table_name(): {'id'}}

    # verify the tap discovered the right streams
    found_catalogs = [
        catalog for catalog in menagerie.get_catalogs(conn_id)
        if catalog['tap_stream_id'] in expected_check_streams
    ]
    self.assertGreaterEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = expected_check_streams.symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog = found_catalogs[0]
    self.assertEqual(self.table_name(), test_catalog['stream_name'])
    print("discovered streams are correct")

    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'LOG_BASED'
        }
    }]
    selected_metadata = connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog,
        menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
        additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run initial full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    self.maxDiff = None
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)
    self.assertEqual(record_count_by_stream, {self.table_name(): 1})

    records_for_stream = runner.get_records_from_target_output()[
        self.table_name()]
    messages_for_stream = records_for_stream['messages']
    message_actions = [rec['action'] for rec in messages_for_stream]
    self.assertEqual(message_actions,
                     ['activate_version', 'upsert', 'activate_version'])

    # ensure some log_file and log_pos state was persisted
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]
    self.assertIsNotNone(bookmark['log_file'])
    self.assertIsNotNone(bookmark['log_pos'])
    expected_log_file = bookmark['log_file']
    expected_log_pos = bookmark['log_pos']

    # grab version, log_file and log_pos from state to check later
    expected_table_version = records_for_stream['table_version']
    self.assertEqual(expected_table_version, bookmark['version'])

    # check for expected records
    upsert_records = [
        m['data'] for m in messages_for_stream if m['action'] == 'upsert'
    ]
    self.assertEqual([expected_rec_1], upsert_records)

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version is unchanged after the binlog sync
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]
    self.assertEqual(expected_table_version, bookmark['version'])

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    # record count should be empty as we did not persist anything to the gate
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)
    self.assertEqual(record_count_by_stream, {})

    # insert a new huge row (2560 keys so the JSON value exceeds typical
    # packet/column thresholds)
    data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)],
                literal=True)
    rec = {'id': 2, 'our_json': json.dumps(data)}

    with db_utils.get_db_connection(
            self.get_properties(), self.get_credentials()).cursor() as cur:
        self.insert_record(cur, rec)

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version from state is unchanged
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id()]
    self.assertEqual(expected_table_version, bookmark['version'])

    # Either the log_file is the same but the log_pos has increased or the log_file
    # has rotated and the numeric suffix has increased
    if expected_log_file == bookmark['log_file']:
        self.assertGreater(bookmark['log_pos'], expected_log_pos)
    else:
        expected_log_file_suffix = re.search('^.*\.(\d+)$',
                                             expected_log_file).groups()[0]
        updated_log_file_suffix = re.search(
            '^.*\.(\d+)$', bookmark['log_file']).groups()[0]
        self.assertGreater(int(updated_log_file_suffix),
                           int(expected_log_file_suffix))

    expected_log_file = bookmark['log_file']
    expected_log_pos = bookmark['log_pos']

    expected_rec_2 = copy.deepcopy(rec)

    # check for expected records
    records_for_stream = runner.get_records_from_target_output()[
        self.table_name()]
    messages_for_stream = records_for_stream['messages']
    message_actions = [rec['action'] for rec in messages_for_stream]
    self.assertEqual(message_actions, ['upsert'])

    upsert_records = [
        m['data'] for m in messages_for_stream if m['action'] == 'upsert'
    ]
    # _sdc_deleted_at is added by the tap for binlog records; drop it before
    # comparing against the raw inserted row
    del upsert_records[0]['_sdc_deleted_at']

    expected_json = json.loads(expected_rec_2.get('our_json', {}))
    actual_json = json.loads(upsert_records[0].get('our_json', {}))
    self.assertTrue(len(actual_json.keys()) > 0)
    self.assertEqual(expected_json, actual_json)
def test_run(self):
    """FULL_TABLE replication test for a Postgres view (`chicken_view`).

    Verifies discovery metadata for the view (is-view, no table key
    properties, per-column sql datatypes), selects the view with an explicit
    `view-key-properties` of ['id'], syncs, and checks the single replicated
    record, message actions, schema, and the bookmark version.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    chicken_catalog = found_catalogs[0]
    self.assertEqual('chicken_view', chicken_catalog['stream_name'])
    print("discovered streams are correct")

    print('checking discoverd metadata for ROOT-CHICKEN_VIEW')
    md = menagerie.get_annotated_schema(
        conn_id, chicken_catalog['stream_id'])['metadata']

    # views are discovered with no table-key-properties; every column is
    # 'available' (not automatic) and selected-by-default
    self.assertEqual(
        {
            (): {
                'database-name': 'postgres',
                'is-view': True,
                'row-count': 0,
                'schema-name': 'public',
                'table-key-properties': []
            },
            ('properties', 'fk_id'): {
                'inclusion': 'available',
                'sql-datatype': 'bigint',
                'selected-by-default': True
            },
            ('properties', 'name'): {
                'inclusion': 'available',
                'sql-datatype': 'character varying',
                'selected-by-default': True
            },
            ('properties', 'age'): {
                'inclusion': 'available',
                'sql-datatype': 'integer',
                'selected-by-default': True
            },
            ('properties', 'size'): {
                'inclusion': 'available',
                'sql-datatype': 'character varying',
                'selected-by-default': True
            },
            ('properties', 'id'): {
                'inclusion': 'available',
                'sql-datatype': 'integer',
                'selected-by-default': True
            }
        }, metadata.to_map(md))

    # 'ID' selected as view-key-properties
    replication_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-key': None,
            "replication-method": "FULL_TABLE",
            'view-key-properties': ["id"]
        }
    }]
    connections.select_catalog_and_fields_via_metadata(
        conn_id, chicken_catalog,
        menagerie.get_annotated_schema(conn_id,
                                       chicken_catalog['stream_id']),
        replication_md)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(record_count_by_stream, {'chicken_view': 1})

    records_by_stream = runner.get_records_from_target_output()
    table_version = records_by_stream['chicken_view']['table_version']

    # full-table pattern: activate_version fences the upsert on both sides
    self.assertEqual(
        records_by_stream['chicken_view']['messages'][0]['action'],
        'activate_version')
    self.assertEqual(
        records_by_stream['chicken_view']['messages'][1]['action'],
        'upsert')
    self.assertEqual(
        records_by_stream['chicken_view']['messages'][2]['action'],
        'activate_version')

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    actual_chicken_record = records_by_stream['chicken_view']['messages'][
        1]['data']
    expected_chicken_record = {
        'id': 1,
        'fk_id': 1,
        'name': 'fred',
        'age': 99,
        'size': 'big'
    }
    self.assertEqual(
        actual_chicken_record,
        expected_chicken_record,
        msg=
        "Expected `various_types` upsert record data to be {}, but target output {}"
        .format(expected_chicken_record, actual_chicken_record))

    print("records are correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)
    chicken_bookmark = state['bookmarks']['postgres-public-chicken_view']
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    self.assertEqual(
        chicken_bookmark['version'],
        table_version,
        msg="expected bookmark for stream ROOT-CHICKEN to match version")
def test_run(self):
    """End-to-end check + sync test for the MySQL tap.

    Verifies discovery output (catalogs and per-field metadata), selects
    streams with per-stream replication metadata, runs an initial sync and
    checks record counts/contents/state, then alters the source table
    (adds a column, inserts a row) and verifies a second sync picks up the
    schema change while the incremental table's version is preserved.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    for c in found_catalogs:
        catalog_props_to_check = ['stream_name', 'tap_stream_id']
        stream = c['stream_name']
        for prop in catalog_props_to_check:
            self.assertEqual(
                c[prop],
                expected_catalogs[stream][prop],
                msg=
                "unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`"
                .format(prop, stream, expected_catalogs[stream][prop],
                        c[prop]))

    print("discovered streams are correct")

    print('checking discoverd metadata for tap_tester_mysql_0-incremental')
    incremental_catalog = [
        c for c in found_catalogs
        if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental'
    ][0]

    # full metadata comparison: one stream-level breadcrumb plus one
    # breadcrumb per column, sorted so ordering is deterministic
    md = menagerie.get_annotated_schema(
        conn_id, incremental_catalog['stream_id'])['metadata']
    incremental_stream_metadata = {
        'database-name': 'tap_tester_mysql_0',
        'row-count': 3,
        'is-view': False,
        'selected-by-default': False,
        'table-key-properties': ['c_pk']
    }
    self.assertEqual(
        sorted(md, key=lambda x: x['breadcrumb']),
        [{
            'breadcrumb': [],
            'metadata': incremental_stream_metadata
        }, {
            'breadcrumb': ['properties', 'c_dt'],
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'datetime'
            }
        }, {
            'breadcrumb': ['properties', 'c_pk'],
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'int(11)'
            }
        }, {
            'breadcrumb': ['properties', 'c_varchar'],
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'varchar(255)'
            }
        }, {
            'breadcrumb': ['properties', 'c_varchar_to_deselect'],
            'metadata': {
                'selected-by-default': True,
                'sql-datatype': 'varchar(255)'
            }
        }])

    print('checking discovered metadata for tap_tester_mysql_1-view')
    view_catalog = [
        c for c in found_catalogs
        if c['tap_stream_id'] == 'tap_tester_mysql_1-view'
    ][0]
    view_catalog_key_properties_md = [{
        'breadcrumb': [],
        'metadata': {
            'view-key-properties': ['c_pk']
        }
    }]

    # views have no discoverable key properties, so set them manually
    connections.set_non_discoverable_metadata(
        conn_id, view_catalog,
        menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']),
        view_catalog_key_properties_md)
    md = menagerie.get_annotated_schema(
        conn_id, view_catalog['stream_id'])['metadata']
    view_stream_metadata = {
        'database-name': 'tap_tester_mysql_1',
        'is-view': True,
        'selected-by-default': False,
        'view-key-properties': ['c_pk']
    }
    self.assertEqual(sorted(md, key=lambda x: x['breadcrumb']), [{
        'breadcrumb': [],
        'metadata': view_stream_metadata
    }, {
        'breadcrumb': ['properties', 'c_pk'],
        'metadata': {
            'selected-by-default': True,
            'sql-datatype': 'int(11)'
        }
    }, {
        'breadcrumb': ['properties', 'c_varchar'],
        'metadata': {
            'selected-by-default': True,
            'sql-datatype': 'varchar(255)'
        }
    }])

    #No selected-by-default MD for c_year because it is an unsupported type
    various_types_catalog = [
        c for c in found_catalogs
        if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types'
    ][0]
    md = menagerie.get_annotated_schema(
        conn_id, various_types_catalog['stream_id'])['metadata']
    c_year_md = [
        x for x in md if x['breadcrumb'] == ['properties', 'c_year']
    ]
    self.assertEqual(c_year_md, [{
        'breadcrumb': ['properties', 'c_year'],
        'metadata': {
            'selected-by-default': False,
            'sql-datatype': 'year(4)'
        }
    }])

    ##select_simple_example
    catalogs_to_select = [
        c for c in found_catalogs
        if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example'
    ]

    # select each stream with replication metadata appropriate to its type;
    # one varchar field is deselected on the incremental stream so we can
    # later assert it is absent from the output
    for a_catalog in catalogs_to_select:
        additional_md = []
        unselected_fields = []
        if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental':
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-key': 'c_dt',
                    'replication-method': 'INCREMENTAL'
                }
            }]
            unselected_fields = ['c_varchar_to_deselect']
        elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view':
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'view-key-properties': ['c_pk'],
                    'replication-method': 'FULL_TABLE'
                }
            }]
        else:
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'FULL_TABLE'
                }
            }]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, a_catalog,
            menagerie.get_annotated_schema(conn_id, a_catalog['stream_id']),
            additional_md, unselected_fields)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    expected_row_count = 8  # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1}
    self.assertEqual(
        replicated_row_count,
        expected_row_count,
        msg="failed to replicate correct number of rows: {}".format(
            record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    records_by_stream = runner.get_records_from_target_output()

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify that activate version messages were sent in the proper position
        self.assertEqual(
            recs['messages'][0]['action'],
            'activate_version',
            msg=
            "Expected first message sent for stream `{}` to have action `activate_version`"
            .format(stream))

        # verify the persisted schema was correct
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    # verify that the target output the proper numeric and date representations
    expected_various_types_records = [{
        'c_time': '1970-01-01T12:34:56.000000Z',
        'c_mediumint': 8388607,
        'c_smallint': 32767,
        'c_tinyint': 127,
        'c_date': '2017-09-13T00:00:00.000000Z',
        'c_bigint': 9223372036854775807,
        'c_decimal': -1,
        'c_int': 2147483647,
        'c_bit': True,
        'c_decimal_2': Decimal('123456789.0'),
        'c_pk': 1,
        'c_double': Decimal("1.234"),
        'c_float': Decimal("1.234"),
        'c_decimal_2_unsigned': Decimal("1.23"),
        'c_tinyint_1': True
    }, {
        'c_time': '1970-01-01T12:34:57.000000Z',
        'c_mediumint': -8388608,
        'c_smallint': -32768,
        'c_tinyint': -128,
        'c_date': '2017-09-14T00:00:00.000000Z',
        'c_bigint': -9223372036854775808,
        'c_decimal': 0,
        'c_int': -2147483648,
        'c_bit': False,
        'c_decimal_2': Decimal("123456790.0"),
        'c_pk': 2,
        'c_double': Decimal("2.234"),
        'c_float': Decimal("2.234"),
        'c_decimal_2_unsigned': Decimal("0.23"),
        'c_tinyint_1': False
    }, {
        'c_time': '1970-01-01T12:34:57.000000Z',
        'c_mediumint': -8388608,
        'c_smallint': -32768,
        'c_tinyint': -128,
        'c_date': '2017-09-14T00:00:00.000000Z',
        'c_bigint': -9223372036854775808,
        'c_decimal': 0,
        'c_int': -2147483648,
        'c_bit': None,
        'c_decimal_2': Decimal("123456790.0"),
        'c_pk': 3,
        'c_double': Decimal("2.234"),
        'c_float': Decimal("2.234"),
        'c_decimal_2_unsigned': Decimal("0.23"),
        'c_tinyint_1': None
    }]

    # messages[0] is activate_version, so the upserts are messages 1-3
    actual_various_types_records = [
        r['data']
        for r in records_by_stream['various_types']['messages'][1:4]
    ]

    self.assertEqual(
        actual_various_types_records,
        expected_various_types_records,
        msg=
        "Expected `various_types` upsert record data to be {}, but target output {}"
        .format(expected_various_types_records,
                actual_various_types_records))

    # verify that deselected property was not output
    expected_incremental_record = {
        'c_pk': 1,
        'c_dt': '2017-01-01T00:00:00.000000Z',
        'c_varchar': 'a'
    }

    actual_incremental_record = records_by_stream['incremental'][
        'messages'][1]['data']

    self.assertEqual(
        actual_incremental_record,
        expected_incremental_record,
        msg=
        "Expected first `incremental` upsert record data to be {}, but target output {}"
        .format(expected_incremental_record, actual_incremental_record))

    print("records are correct")

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)
    bookmarks = state['bookmarks']

    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")

    for k, v in bookmarks.items():
        if k == 'tap_tester_mysql_0-incremental':
            # incremental streams carry a version plus replication-key info
            self.assertIsNotNone(
                v['version'],
                msg="expected bookmark for stream `{}` to have a version set"
                .format(k))
            self.assertEqual(
                v['replication_key_value'],
                '2017-01-01T00:00:02.000000Z',
                msg=
                "incorrect replication_key_value in bookmark for stream `{}`"
                .format(k))
            self.assertEqual(
                v['replication_key'],
                'c_dt',
                msg=
                "incorrect replication_key specified in bookmark for stream `{}`"
                .format(k))
        else:
            # full-table streams only track full-table completion
            self.assertFalse(
                'version' in v,
                msg=
                "expected bookmark for stream `{}` to not have a version key"
                .format(k))
            self.assertTrue(
                'initial_full_table_complete' in v,
                msg=
                "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                .format(k))

    print("state and bookmarks are correct")

    # saved so we can verify the version survives the second sync below
    incremental_table_initial_table_version = bookmarks[
        'tap_tester_mysql_0-incremental']['version']

    #----------------------------------------------------------------------
    # invoke the sync job again after some modifications
    #----------------------------------------------------------------------

    print("adding a column to an existing table in the source db")
    connection = db_utils.get_db_connection(self.get_properties(),
                                            self.get_credentials())
    with connection.cursor() as cursor:
        add_column_sql = '''
            ALTER TABLE tap_tester_mysql_0.incremental
              ADD COLUMN favorite_number INTEGER;
            INSERT INTO tap_tester_mysql_0.incremental VALUES (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999);
        '''
        cursor.execute(add_column_sql)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreater(
        len(found_catalogs),
        0,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    replicated_row_count = reduce(lambda accum, c: accum + c,
                                  record_count_by_stream.values())
    expected_row_count = 7  # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1}
    self.assertEqual(
        replicated_row_count,
        expected_row_count,
        msg="failed to replicate correct number of rows: {}".format(
            record_count_by_stream))
    print("total replicated row count: {}".format(replicated_row_count))

    records_by_stream = runner.get_records_from_target_output()

    # NOTE(review): this value is shadowed by the re-assignment inside the
    # loop below (which omits the 'selected' key) before first use
    expected_schema_of_new_column = {
        'maximum': 2147483647,
        'selected': True,
        'inclusion': 'available',
        'type': ['null', 'integer'],
        'minimum': -2147483648
    }

    # verifications about individual records
    for stream, recs in records_by_stream.items():
        # verify that a activate version messages were sent in the proper position
        if stream == 'incremental':
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'activate_version',
                msg=
                "Expected first message sent for stream `{}` not to have action `activate_version`"
                .format(stream))
            expected_schema_of_new_column = {
                'maximum': 2147483647,
                'inclusion': 'available',
                'type': ['null', 'integer'],
                'minimum': -2147483648
            }
            self.assertEqual(
                records_by_stream[stream]['schema']['properties']
                ['favorite_number'],
                expected_schema_of_new_column,
                msg=
                "Expected newly-added column to be present in schema for stream `{}`, but it was not."
                .format(stream))
        else:
            self.assertEqual(
                records_by_stream[stream]['messages'][0]['action'],
                'upsert',
                msg=
                "Expected first message sent for stream `{}` to have action `upsert`"
                .format(stream))
            self.assertEqual(
                records_by_stream[stream]['messages'][-1]['action'],
                'activate_version',
                msg=
                "Expected last message sent for stream `{}` to have action `activate_version`"
                .format(stream))

    state = menagerie.get_state(conn_id)
    bookmarks = state['bookmarks']
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")

    for k, v in bookmarks.items():
        if k == 'tap_tester_mysql_0-incremental':
            self.assertIsNotNone(
                v['version'],
                msg="expected bookmark for stream `{}` to have a version set"
                .format(k))
            # bookmark advanced to the timestamp of the row inserted above
            self.assertEqual(
                v['replication_key_value'],
                '2017-01-01T00:00:03.000000Z',
                msg=
                "incorrect replication_key_value in bookmark for stream `{}`"
                .format(k))
            self.assertEqual(
                v['replication_key'],
                'c_dt',
                msg=
                "incorrect replication_key specified in bookmark for stream `{}`"
                .format(k))
        else:
            self.assertFalse(
                'version' in v,
                msg=
                "expected bookmark for stream `{}` to not have a version key"
                .format(k))
            self.assertTrue(
                'initial_full_table_complete' in v,
                msg=
                "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                .format(k))

    print("state and bookmarks are correct")

    # verify incremental table_version didn't change
    incremental_table_new_table_version = bookmarks[
        'tap_tester_mysql_0-incremental']['version']

    self.assertEqual(
        incremental_table_initial_table_version,
        incremental_table_new_table_version,
        msg=
        "Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations."
    )
def test_run(self):
    """Logical (LOG_BASED) replication test for the Postgres tap.

    Discovers the cows and chickens test tables, selects them with
    LOG_BASED replication, runs an initial sync and validates records,
    state, lsn, and table versions; then inserts two more rows into each
    table and verifies the follow-up sync replicates exactly those rows,
    that the lsn bookmarks advance, and that table versions are unchanged.
    """
    conn_id = connections.ensure_connection(self)

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    # verify the tap discovered the right streams
    found_catalogs = [
        fc for fc in menagerie.get_catalogs(conn_id)
        if fc['tap_stream_id'] in self.expected_check_streams()
    ]
    self.assertGreaterEqual(
        len(found_catalogs),
        2,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = self.expected_check_streams().symmetric_difference(
        found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    test_catalog_cows = list(
        filter(
            lambda c: c['stream_name'
                        ] == 'postgres_logical_replication_test_cows',
            found_catalogs))[0]
    self.assertEqual('postgres_logical_replication_test_cows',
                     test_catalog_cows['stream_name'])
    test_catalog_chickens = list(
        filter(
            lambda c: c['stream_name'
                        ] == 'postgres_logical_replication_test_chickens',
            found_catalogs))[0]
    self.assertEqual('postgres_logical_replication_test_chickens',
                     test_catalog_chickens['stream_name'])
    print("discovered streams are correct")

    # select both streams with log-based replication
    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'LOG_BASED'
        }
    }]
    connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog_cows,
        menagerie.get_annotated_schema(conn_id,
                                       test_catalog_cows['stream_id']),
        additional_md)
    connections.select_catalog_and_fields_via_metadata(
        conn_id, test_catalog_chickens,
        menagerie.get_annotated_schema(conn_id,
                                       test_catalog_chickens['stream_id']),
        additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(
        record_count_by_stream, {
            'postgres_logical_replication_test_cows': 1,
            'postgres_logical_replication_test_chickens': 1
        })
    records_by_stream = runner.get_records_from_target_output()

    # initial sync wraps the single upsert in activate_version messages
    table_version_cows = records_by_stream[
        'postgres_logical_replication_test_cows']['table_version']
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_cows']
        ['messages'][0]['action'], 'activate_version')
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_cows']
        ['messages'][1]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_cows']
        ['messages'][2]['action'], 'activate_version')

    table_version_chickens = records_by_stream[
        'postgres_logical_replication_test_chickens']['table_version']
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_chickens']
        ['messages'][0]['action'], 'activate_version')
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_chickens']
        ['messages'][1]['action'], 'upsert')
    self.assertEqual(
        records_by_stream['postgres_logical_replication_test_chickens']
        ['messages'][2]['action'], 'activate_version')

    # verify state and bookmarks
    state = menagerie.get_state(conn_id)

    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    bookmark_cows = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_cows']
    self.assertIsNotNone(bookmark_cows['lsn'],
                         msg="expected bookmark for stream to have an lsn")
    lsn_cows_1 = bookmark_cows['lsn']
    self.assertEqual(bookmark_cows['version'],
                     table_version_cows,
                     msg="expected bookmark for stream to match version")

    bookmark_chickens = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_chickens']
    self.assertIsNotNone(bookmark_chickens['lsn'],
                         msg="expected bookmark for stream to have an lsn")
    lsn_chickens_1 = bookmark_chickens['lsn']
    self.assertEqual(bookmark_chickens['version'],
                     table_version_chickens,
                     msg="expected bookmark for stream to match version")

    #----------------------------------------------------------------------
    # invoke the sync job again after adding records
    #----------------------------------------------------------------------
    print("inserting 2 more cows and 2 more chickens")

    with db_utils.get_test_connection('dev') as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            # insert another cow
            self.cows_rec_2 = {'cow_name': "betty cow", 'cow_age': 21}
            insert_record(cur, test_table_name_cows, self.cows_rec_2)
            # update that cow's expected values
            self.cows_rec_2['id'] = 2
            self.cows_rec_2['_sdc_deleted_at'] = None

            # insert another chicken
            self.chicken_rec_2 = {
                'chicken_name': "burt chicken",
                'chicken_age': 14
            }
            insert_record(cur, test_table_name_chickens,
                          self.chicken_rec_2)
            # update that cow's expected values
            self.chicken_rec_2['id'] = 2
            self.chicken_rec_2['_sdc_deleted_at'] = None

            # and repeat...

            self.cows_rec_3 = {'cow_name': "cindy cow", 'cow_age': 10}
            insert_record(cur, test_table_name_cows, self.cows_rec_3)
            self.cows_rec_3['id'] = 3
            self.cows_rec_3['_sdc_deleted_at'] = None

            self.chicken_rec_3 = {
                'chicken_name': "carl chicken",
                'chicken_age': 4
            }
            insert_record(cur, test_table_name_chickens,
                          self.chicken_rec_3)
            self.chicken_rec_3['id'] = 3
            self.chicken_rec_3['_sdc_deleted_at'] = None

    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, self.expected_sync_streams(), self.expected_pks())
    self.assertEqual(
        record_count_by_stream, {
            'postgres_logical_replication_test_cows': 2,
            'postgres_logical_replication_test_chickens': 2
        })
    records_by_stream = runner.get_records_from_target_output()
    chicken_messages = records_by_stream[
        "postgres_logical_replication_test_chickens"]['messages']
    cow_messages = records_by_stream[
        "postgres_logical_replication_test_cows"]['messages']

    # only the newly inserted rows should appear, in insertion order
    self.assertDictEqual(self.cows_rec_2, cow_messages[0]['data'])
    self.assertDictEqual(self.chicken_rec_2, chicken_messages[0]['data'])
    self.assertDictEqual(self.cows_rec_3, cow_messages[1]['data'])
    self.assertDictEqual(self.chicken_rec_3, chicken_messages[1]['data'])

    print("inserted record is correct")

    state = menagerie.get_state(conn_id)
    self.assertIsNone(state['currently_syncing'],
                      msg="expected state's currently_syncing to be None")
    cows_bookmark = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_cows']
    self.assertIsNotNone(
        cows_bookmark['lsn'],
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
    )
    lsn_cows_2 = cows_bookmark['lsn']
    # lsn must move forward (or stay) after consuming new WAL entries
    self.assertTrue(lsn_cows_2 >= lsn_cows_1)

    chickens_bookmark = state['bookmarks'][
        'dev-public-postgres_logical_replication_test_chickens']
    self.assertIsNotNone(
        chickens_bookmark['lsn'],
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
    )
    lsn_chickens_2 = chickens_bookmark['lsn']
    self.assertTrue(lsn_chickens_2 >= lsn_chickens_1)

    #table_version does NOT change
    self.assertEqual(
        chickens_bookmark['version'],
        table_version_chickens,
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to match version"
    )
    #table_version does NOT change
    self.assertEqual(
        cows_bookmark['version'],
        table_version_cows,
        msg=
        "expected bookmark for stream public-postgres_logical_replication_test to match version"
    )
def binlog_edge_test(self, expected_records=None):
    """
    Test binlog replication edge cases
    • Verify an initial sync returns expected records of various datatypes
    • Verify we bookmark correctly when a transaction spans multiple files
    • Insert and delete a record prior to sync. Verify both events are replicated
    • Insert and update a record prior to sync. Verify both events are replicated
    • Verify a valid log_file and log_pos state are persisted after each sync
    """
    # FIX: the signature previously used a mutable default (`expected_records=[]`).
    # The list is mutated below (`+=` and item assignment), so the default would
    # accumulate records across calls. A None sentinel preserves the interface.
    if expected_records is None:
        expected_records = []

    conn_id = connections.ensure_connection(self)

    # prior to first sync update a record...
    updated_timestamp = datetime.datetime.now()
    updated_id = 1
    expected_records[1]['our_timestamp_2'] = datetime.datetime.strftime(
        updated_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

    # insert a record and...
    inserted_record = self.generate_record_n(len(expected_records))
    expected_records += [inserted_record]  # TODO need to format

    # delete a record
    deleted_id = 2

    with db_utils.get_db_connection(
            self.get_properties(), self.get_credentials()).cursor() as cur:
        cur.execute(
            "UPDATE {}.{} SET our_timestamp_2 = '{}' WHERE id = {}".format(
                self.database_name(), self.table_name_1(),
                updated_timestamp, updated_id))
        self.insert_record(cur, inserted_record, self.table_name_1())
        delete_time = datetime.datetime.now()
        cur.execute("DELETE FROM {}.{} WHERE id = {}".format(
            self.database_name(), self.table_name_1(), deleted_id))

    print(
        "\n\nMySQL DB Actions." + \
        "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
        "\nEVENTS: {} records updated".format(1) + \
        "\n {} records deleted\n\n".format(1)
    )

    # run in check mode
    check_job_name = runner.run_check_mode(self, conn_id)

    # verify check exit codes
    exit_status = menagerie.get_exit_status(conn_id, check_job_name)
    menagerie.verify_check_exit_status(self, exit_status, check_job_name)

    t1 = self.table_name_1()
    t2 = self.table_name_2()
    expected_check_streams = {
        self.tap_stream_id(t1),
        self.tap_stream_id(t2)
    }
    expected_sync_streams = {t1, t2}
    expected_pks = {t1: {'id'}, t2: {'id'}}

    # verify the tap discovered the right streams
    found_catalogs = [
        catalog for catalog in menagerie.get_catalogs(conn_id)
        if catalog['tap_stream_id'] in expected_check_streams
    ]

    self.assertGreaterEqual(
        len(found_catalogs),
        1,
        msg="unable to locate schemas for connection {}".format(conn_id))

    found_catalog_names = set(
        map(lambda c: c['tap_stream_id'], found_catalogs))
    diff = expected_check_streams.symmetric_difference(found_catalog_names)
    self.assertEqual(
        len(diff),
        0,
        msg="discovered schemas do not match: {}".format(diff))

    # verify that persisted streams have the correct properties
    self.assertEqual(self.table_name_1(), found_catalogs[0]['stream_name'])
    self.assertEqual(self.table_name_2(), found_catalogs[1]['stream_name'])
    print("discovered streams are correct")

    additional_md = [{
        "breadcrumb": [],
        "metadata": {
            'replication-method': 'LOG_BASED'
        }
    }]
    for catalog in found_catalogs:
        schema = menagerie.get_annotated_schema(conn_id,
                                                catalog['stream_id'])
        # FIX: previously `catalog` was passed twice and the annotated
        # `schema` fetched above was never used; the third argument of
        # select_catalog_and_fields_via_metadata is the annotated schema
        # (matching every other call site in this file).
        _ = connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, additional_md)

    # clear state
    menagerie.set_state(conn_id, {})

    # run initial full table sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    self.maxDiff = None
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)

    # BUG missing deleted record | https://stitchdata.atlassian.net/browse/SRCE-4258
    # self.assertEqual({self.table_name_1(): len(expected_records)}, record_count_by_stream)

    records_for_stream = runner.get_records_from_target_output()[
        self.table_name_1()]
    messages_for_stream = records_for_stream['messages']
    message_actions = [rec['action'] for rec in messages_for_stream]

    # verify activate version messages are present
    self.assertEqual('activate_version', message_actions[0])
    self.assertEqual('activate_version', message_actions[-1])

    # ensure some log_file and log_pos state was persisted
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id(t1)]
    self.assertIsNotNone(bookmark['log_file'])
    self.assertIsNotNone(bookmark['log_pos'])

    expected_log_file = bookmark['log_file']
    expected_log_pos = bookmark['log_pos']

    # grab version, log_file and log_pos from state to check later
    expected_table_version = records_for_stream['table_version']
    self.assertEqual(expected_table_version, bookmark['version'])

    # check for expected records
    upsert_records = [
        m['data'] for m in messages_for_stream if m['action'] == 'upsert'
    ]

    # we need to compare record by record since there are so many.
    # a failure comparing expected_records to upsert_records would result in
    # an output message greater in length than a standard tmux buffer
    # BUG missing datetime precision | https://stitchdata.atlassian.net/browse/SRCE-4257
    # for expected_record in expected_records:
    #     upsert_record = [rec for rec in upsert_records
    #                      if rec['id'] == expected_record['id']]
    #     self.assertEqual(1, len(upsert_record),
    #                      msg="multiple upsert_recs with same pk: {}".format(upsert_record))
    #     self.assertEqual(expected_record, upsert_record.pop())

    # TODO add check for _sdc_delete_at for deleted record once bug addressed

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id(t1)]
    self.assertEqual(expected_table_version, bookmark['version'])

    # verify the persisted schema was correct
    records_by_stream = runner.get_records_from_target_output()
    for stream, recs in records_by_stream.items():
        self.assertEqual(
            recs['schema'],
            expected_schemas[stream],
            msg=
            "Persisted schema did not match expected schema for stream `{}`."
            .format(stream))

    # record count should be empty as we did not persist anything to the gate
    record_count_by_stream = runner.examine_target_output_file(
        self, conn_id, expected_sync_streams, expected_pks)

    self.assertEqual(record_count_by_stream, {})

    # Create 1 more record prior to 2nd sync
    new_record = self.generate_record_n(len(expected_records))
    with db_utils.get_db_connection(
            self.get_properties(), self.get_credentials()).cursor() as cur:
        self.insert_record(cur, new_record, self.table_name_1())

    print(
        "\n\nMySQL DB Actions." + \
        "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
        "\nEVENTS: {} records inserted".format(1)
    )

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version from state is unchanged
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id(t1)]
    self.assertEqual(expected_table_version, bookmark['version'])

    # Either the log_file is the same but the log_pos has increased or the log_file
    # has rotated and the numeric suffix has increased
    if expected_log_file == bookmark['log_file']:
        print("PATH A")
        self.assertGreater(bookmark['log_pos'], expected_log_pos)
    else:
        # raw strings used for the regex patterns (identical patterns,
        # avoids invalid-escape-sequence warnings)
        expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                             expected_log_file).groups()[0]
        updated_log_file_suffix = re.search(
            r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]
        print("PATH B")
        self.assertGreater(int(updated_log_file_suffix),
                           int(expected_log_file_suffix))

    # Execute delete across tables using join prior to 3rd sync
    deleted_id = 4
    with db_utils.get_db_connection(
            self.get_properties(), self.get_credentials()).cursor() as cur:
        delete_time = datetime.datetime.now()
        # DELETE T1, T2
        # FROM T1
        # INNER JOIN T2 ON T1.key = T2.key
        # WHERE condition;
        db = self.database_name()
        db_t1 = db + "." + t1
        db_t2 = db + "." + t2
        t1_key = db_t1 + ".id"
        t2_key = db_t2 + ".id"
        statement = "DELETE {}, {} ".format(db_t1, db_t2) + \
                    "FROM {} ".format(t1) + \
                    "INNER JOIN {} ON {} = {} ".format(db_t2, t1_key, t2_key) + \
                    "WHERE {} = {}".format(t1_key, deleted_id)
        cur.execute(statement)

    print(
        "\n\nMySQL DB Actions." + \
        "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_2()) + \
        "\nTABLE: {}".format(self.table_name_2()) + \
        "\nEVENTS: {} records deleted\n\n".format(1)
    )

    # run binlog sync
    sync_job_name = runner.run_sync_mode(self, conn_id)

    # verify tap and target exit codes
    exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
    menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

    # check that version from state is unchanged
    state = menagerie.get_state(conn_id)
    bookmark = state['bookmarks'][self.tap_stream_id(t1)]
    self.assertEqual(expected_table_version, bookmark['version'])

    target_records = runner.get_records_from_target_output()
    records_stream_1 = target_records[self.table_name_1()]
    upsert_records_1 = [
        m['data'] for m in records_stream_1['messages']
        if m['action'] == 'upsert'
    ]
    records_stream_2 = target_records[self.table_name_2()]
    upsert_records_2 = [
        m['data'] for m in records_stream_2['messages']
        if m['action'] == 'upsert'
    ]

    # make sure the record is in the target for both tables with a delete time
    deleted_at_t1 = upsert_records_1[0].get('_sdc_deleted_at')
    deleted_at_t1_timestamp = utils.strptime_to_utc(
        deleted_at_t1).timestamp()
    self.assertIsNotNone(deleted_at_t1)

    deleted_at_t2 = upsert_records_2[0].get('_sdc_deleted_at')
    deleted_at_t2_timestamp = utils.strptime_to_utc(
        deleted_at_t2).timestamp()
    self.assertIsNotNone(deleted_at_t2)

    # the delete times should be equal since it was a single transaction
    self.assertEqual(deleted_at_t1_timestamp, deleted_at_t2_timestamp)

    time_delta = delete_time.timestamp() - deleted_at_t1_timestamp
    print("Delete time vs record: difference in seconds", time_delta)
    self.assertLess(time_delta, 3)  # time delta less than 3 seconds in magnitude