def automatic_fields_test(self, conn_id):
        """Just testing we can sync with no fields selected. And that automatic fields still get synced."""

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify discovery produced (at least) 1 expected catalog
        found_catalogs = [
            found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
            if found_catalog['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreaterEqual(len(found_catalogs), 1)

        # verify the tap discovered the expected streams
        found_catalog_names = {
            catalog['tap_stream_id']
            for catalog in found_catalogs
        }
        self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]
        self.assertEqual(test_table_name, test_catalog['stream_name'])
        print("discovered streams are correct")

        # perform table selection
        print('selecting {} and NO FIELDS within the table'.format(
            test_table_name))
        self.select_streams_and_fields(conn_id,
                                       test_catalog,
                                       select_all_fields=False)

        # clear state
        menagerie.set_state(conn_id, {})

        # run sync job 1 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(),
            self.expected_primary_keys())
        records_by_stream = runner.get_records_from_target_output()
        messages = records_by_stream[test_table_name]['messages']

        # expected values
        expected_primary_keys = self.expected_primary_keys()[test_table_name]
        expected_replication_keys = self.expected_replication_keys(
        )[test_table_name]
        expected_automatic_fields = expected_primary_keys.union(
            expected_replication_keys)

        # collect actual values
        record_messages_keys = [
            set(message['data'].keys()) for message in messages[1:-1]
        ]

        # verify the message actions match expectations for all replication methods
        self.assertEqual(4, len(messages))
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('activate_version', messages[3]['action'])

        # Verify that you get some records for each stream
        self.assertGreater(record_count_by_stream[test_table_name], 0)

        # Verify that only the automatic fields are sent to the target
        for actual_fields in record_messages_keys:
            self.assertSetEqual(expected_automatic_fields, actual_fields)
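For context, a minimal sketch of the select_streams_and_fields helper called above follows. It is hypothetical (the real base-class implementation may differ): when select_all_fields is False it deselects every discovered field, and the tap still replicates the automatic fields (primary keys and replication keys), which is exactly what this test asserts.

def select_streams_and_fields(self, conn_id, catalog, select_all_fields=True):
    """Hypothetical sketch: select the stream and, when select_all_fields is False,
    mark every field as not selected. Automatic fields are still emitted by the tap."""
    schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
    non_selected_fields = []
    if not select_all_fields:
        # deselect everything; primary keys / replication keys are replicated regardless
        non_selected_fields = list(schema.get('annotated-schema', {}).get('properties', {}).keys())
    connections.select_catalog_and_fields_via_metadata(
        conn_id, catalog, schema, [], non_selected_fields)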
Example No. 2
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [fc for fc
                          in menagerie.get_catalogs(conn_id)
                          if fc['tap_stream_id'] in self.expected_check_streams()]


        self.assertGreaterEqual(len(found_catalogs),
                                1,
                                msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual('postgres_logical_replication_test', test_catalog['stream_name'])

        print("discovered streams are correct")

        additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}]
        # don't select our_text_2
        _ = connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog,
                                                               menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
                                                               additional_md,
                                                               ['our_text_2'])

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())


        self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test': 4})
        records_by_stream = runner.get_records_from_target_output()

        table_version = records_by_stream['postgres_logical_replication_test']['table_version']

        self.assertEqual(records_by_stream['postgres_logical_replication_test']['messages'][0]['action'],
                         'activate_version')

        self.assertEqual(records_by_stream['postgres_logical_replication_test']['messages'][1]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_logical_replication_test']['messages'][2]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_logical_replication_test']['messages'][3]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_logical_replication_test']['messages'][4]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_logical_replication_test']['messages'][5]['action'],
                         'activate_version')

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)


        bookmark = state['bookmarks']['logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(bookmark['lsn'],
                             msg="expected bookmark for stream to have an lsn")
        lsn_1 = bookmark['lsn']

        self.assertEqual(bookmark['version'], table_version,
                         msg="expected bookmark for stream to match version")


        #----------------------------------------------------------------------
        # invoke the sync job again after adding a record
        #----------------------------------------------------------------------
        print("inserting a record 5")

        with db_utils.get_test_connection(test_db) as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                #insert fixture data 3
                our_ts = datetime.datetime(1993, 3, 3, 3, 3, 3, 333333)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(3, 4, 5)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1933, 3, 3)
                my_uuid = str(uuid.uuid1())

                #STRINGS:
                #OUR TS: '1993-03-03 03:03:03.333333'
                #OUR TS TZ: '1993-03-03 08:03:03.333333+00'
                #'OUR TIME': '03:04:05'
                #'OUR TIME TZ': '03:04:05+00'
                self.rec_5 = {'our_varchar' : "our_varchar 5", # str
                              'our_varchar_10' : "varchar13", # str
                              'our_text' : "some text 3", #str
                              'our_text_2' : "NOT SELECTED",
                              'our_integer' : 96000, #int
                              'our_smallint' : 3, # int
                              'our_bigint' : 3000000, #int
                              'our_decimal' : decimal.Decimal('1234567890.03'), #1234567890.03 / our_decimal is a <class 'float'>
                              quote_ident('OUR TS', cur) : our_ts,              # str '1993-03-03 03:03:03.333333'
                              quote_ident('OUR TS TZ', cur) : our_ts_tz,        #str '1993-03-03 08:03:03.333333+00'
                              quote_ident('OUR TIME', cur) : our_time,          # str '03:04:05'
                              quote_ident('OUR TIME TZ', cur) : our_time_tz,    # str '03:04:05+00'
                              quote_ident('OUR DATE', cur) : our_date,          #1933-03-03 / OUR DATE is a <class 'str'>
                              'our_double' : 3.3,                               #3.3 / our_double is a <class 'float'>
                              'our_real' : 6.6,                                 #6.6 / our_real is a <class 'float'>
                              'our_boolean' : True,                             #boolean
                              'our_bit' : '1',                                  #string
                              'our_json' : json.dumps({'secret' : 33}),         #string
                              'our_jsonb' : json.dumps(['burgers make me hungry']),
                              'our_uuid' : my_uuid, #string
                              'our_store' : 'jumps=>"high",name=>"betty"', #string
                              'our_citext': 'maGICKal 3',
                              'our_cidr' : '192.168.102.128/32',
                              'our_inet': '192.168.102.128/32',
                              'our_mac' : '08:00:2b:01:02:05',
                              'our_money':     '$412.1234'
                }

                insert_record(cur, test_table_name, self.rec_5)

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test': 1 })
        records_by_stream = runner.get_records_from_target_output()

        self.assertTrue(len(records_by_stream) > 0)

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertDictEqual(recs['schema'], expected_schemas[stream])


        self.assertEqual(1, len(records_by_stream['postgres_logical_replication_test']['messages']))
        actual_record_2 = records_by_stream['postgres_logical_replication_test']['messages'][0]['data']
        actual_sdc_lsn_2 = int(actual_record_2['_sdc_lsn'])
        del actual_record_2['_sdc_lsn']

        expected_inserted_record = {'our_text': 'some text 3',
                                    'our_real': decimal.Decimal('6.6'),
                                    '_sdc_deleted_at': None,
                                    'our_store' : {'name' : 'betty', 'jumps' : 'high' },
                                    'our_bigint': 3000000,
                                    'our_varchar': 'our_varchar 5',
                                    'our_double': decimal.Decimal('3.3'),
                                    'our_bit': True,
                                    'our_uuid': self.rec_5['our_uuid'],
                                    'OUR TS': '1993-03-03T03:03:03.333333+00:00',
                                    'OUR TS TZ': '1993-03-03T08:03:03.333333+00:00',
                                    'OUR TIME': '03:04:05',
                                    'OUR TIME TZ': '03:04:05-04:00',
                                    'OUR DATE': '1933-03-03T00:00:00+00:00',
                                    'our_decimal': decimal.Decimal('1234567890.03'),
                                    'id': 5,
                                    'our_varchar_10': 'varchar13',
                                    'our_json': '{"secret": 33}',
                                    'our_jsonb': self.rec_5['our_jsonb'],
                                    'our_smallint': 3,
                                    'our_integer': 96000,
                                    'our_boolean': True,
                                    'our_citext': 'maGICKal 3',
                                    'our_cidr': self.rec_5['our_cidr'],
                                    'our_inet': '192.168.102.128',
                                    'our_mac': self.rec_5['our_mac'],
                                    'our_alignment_enum' : None,
                                    'our_money'          :'$412.12'
        }
        self.assertDictEqual(expected_inserted_record, actual_record_2)

        self.assertEqual(records_by_stream['postgres_logical_replication_test']['messages'][0]['action'], 'upsert')
        print("inserted record is correct")

        state = menagerie.get_state(conn_id)
        chicken_bookmark = state['bookmarks']['logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(chicken_bookmark['lsn'],
                             msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn")
        lsn_2 = chicken_bookmark['lsn']

        self.assertGreaterEqual(lsn_2, lsn_1)

        #table_version does NOT change
        self.assertEqual(chicken_bookmark['version'], table_version,
                         msg="expected bookmark for stream public-postgres_logical_replication_test to match version")

        #----------------------------------------------------------------------
        # invoke the sync job again after deleting a record
        #----------------------------------------------------------------------
        print("delete row from source db")
        with db_utils.get_test_connection(test_db) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("DELETE FROM {} WHERE id = 3".format(canonicalized_table_name(test_schema_name, test_table_name, cur)))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # verify the inserted record's lsn is less than or equal to the bookmarked lsn
        self.assertGreaterEqual(lsn_2, actual_sdc_lsn_2)
        expected_record_count = 1 if actual_sdc_lsn_2 < lsn_2 else 2
        self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test': expected_record_count })

        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(recs['schema'],
                             expected_schemas[stream],
                             msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

        # if there are 2 records...
        if expected_record_count == 2:
            # the 1st message will be the previous insert
            insert_message = records_by_stream['postgres_logical_replication_test']['messages'][0]['data']
            del insert_message['_sdc_lsn']

            self.assertDictEqual(insert_message, expected_inserted_record)

        # the last message will be the delete
        delete_message = records_by_stream['postgres_logical_replication_test']['messages'][expected_record_count - 1]
        self.assertEqual(delete_message['action'], 'upsert')

        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 3)
        print("deleted record is correct")

        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks']['logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(bookmark['lsn'],
                             msg="expected bookmark for stream postgres_logical_replication_test to have an lsn")

        lsn_3 = bookmark['lsn']
        self.assertGreaterEqual(lsn_3, lsn_2)
        #----------------------------------------------------------------------
        # invoke the sync job again after deleting a record using the 'id IN (SELECT ...)' format
        #----------------------------------------------------------------------
        print("delete row from source db")
        with db_utils.get_test_connection(test_db) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("DELETE FROM {} WHERE id IN (SELECT id FROM {} WHERE id=2)".format(canonicalized_table_name(test_schema_name, test_table_name, cur), canonicalized_table_name(test_schema_name, test_table_name, cur)))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())


        self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test': 2 })
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(recs['schema'],
                             expected_schemas[stream],
                             msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

        #first record will be the previous delete
        delete_message = records_by_stream['postgres_logical_replication_test']['messages'][0]
        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 3)



        #the 2nd message will be the more recent delete
        delete_message = records_by_stream['postgres_logical_replication_test']['messages'][1]
        self.assertEqual(delete_message['action'], 'upsert')

        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 2)
        print("deleted record is correct")

        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks']['logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(bookmark['lsn'],
                             msg="expected bookmark for stream postgres_logical_replication_test to have an lsn")

        lsn_4 = bookmark['lsn']
        self.assertGreaterEqual(lsn_4, lsn_3)


        #table_version does NOT change
        self.assertEqual(bookmark['version'], table_version,
                         msg="expected bookmark for stream postgres_logical_replication_test to match version")
        #----------------------------------------------------------------------
        # invoke the sync job again after deleting a record using the 'id IN (<id>, <id>)' format
        #----------------------------------------------------------------------
        print("delete row from source db")
        with db_utils.get_test_connection(test_db) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("DELETE FROM {} WHERE id IN (4, 5)".format(canonicalized_table_name(test_schema_name, test_table_name, cur)))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())


        self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test': 3 })
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(recs['schema'],
                             expected_schemas[stream],
                             msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

        #first record will be the previous delete
        delete_message = records_by_stream['postgres_logical_replication_test']['messages'][0]
        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 2)



        #the 2nd message will be the more recent delete
        delete_message = records_by_stream['postgres_logical_replication_test']['messages'][1]
        self.assertEqual(delete_message['action'], 'upsert')

        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 4)
        print("deleted record is correct")

        #the 3rd message will be the more recent delete
        delete_message = records_by_stream['postgres_logical_replication_test']['messages'][2]
        self.assertEqual(delete_message['action'], 'upsert')

        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 5)
        print("deleted record is correct")


        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks']['logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")

        self.assertIsNotNone(bookmark['lsn'],
                             msg="expected bookmark for stream postgres_logical_replication_test to have an lsn")

        lsn_5 = bookmark['lsn']
        self.assertGreaterEqual(lsn_5, lsn_4)


        #table_version does NOT change
        self.assertEqual(bookmark['version'], table_version,
                         msg="expected bookmark for stream postgres_logical_replication_test to match version")

        #----------------------------------------------------------------------
        # invoke the sync job again after updating a record
        #----------------------------------------------------------------------
        print("updating row from source db")
        with db_utils.get_test_connection(test_db) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute("UPDATE {} SET our_varchar = 'THIS HAS BEEN UPDATED', our_money = '$56.811', our_decimal = 'NaN', our_real = '+Infinity', our_double = 'NaN' WHERE id = 1".format(canonicalized_table_name(test_schema_name, test_table_name, cur)))

        sync_job_name = runner.run_sync_mode(self, conn_id)
        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())
        self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test': 3 })
        records_by_stream = runner.get_records_from_target_output()
        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(recs['schema'],
                             expected_schemas[stream],
                             msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))



        self.assertEqual(len(records_by_stream['postgres_logical_replication_test']['messages']), 3)
        #first record will be the previous first delete
        delete_message = records_by_stream['postgres_logical_replication_test']['messages'][0]
        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 4)

        #second record will be the previous second delete
        delete_message = records_by_stream['postgres_logical_replication_test']['messages'][1]
        sdc_deleted_at = delete_message['data'].get('_sdc_deleted_at')
        self.assertIsNotNone(sdc_deleted_at)
        self.assertEqual(delete_message['data']['id'], 5)

        #third record will be the new update
        updated_message = records_by_stream['postgres_logical_replication_test']['messages'][2]
        del updated_message['data']['_sdc_lsn']

        self.assertEqual(updated_message['action'], 'upsert')

        expected_updated_rec = {'our_varchar' : 'THIS HAS BEEN UPDATED',
                                'id' : 1,
                                'our_varchar_10' : "varchar_10",
                                'our_text' : "some text",
                                'our_integer' : 44100,
                                'our_smallint' : 1,
                                'our_bigint' : 1000000,
                                'our_decimal' : None,
                                'OUR TS': '1997-02-02T02:02:02.722184+00:00',
                                'OUR TS TZ' : '1997-02-02T07:02:02.722184+00:00',
                                'OUR TIME' : '12:11:10',
                                'OUR TIME TZ' : '12:11:10-04:00',
                                'OUR DATE': '1998-03-04T00:00:00+00:00',
                                'our_double' : None,
                                'our_real' : None,
                                'our_boolean' : True,
                                'our_bit' : False,
                                'our_json' : '{"secret": 55}',
                                'our_jsonb' : self.rec_1['our_jsonb'],
                                'our_uuid' : self.rec_1['our_uuid'],
                                '_sdc_deleted_at' : None,
                                'our_store' : {'name' : 'betty', 'size' : 'small' },
                                'our_citext': 'maGICKal',
                                'our_cidr': self.rec_1['our_cidr'],
                                'our_inet': self.rec_1['our_inet'],
                                'our_mac': self.rec_1['our_mac'],
                                'our_alignment_enum' : 'bad',
                                'our_money' : '$56.81'
        }

        self.assertDictEqual(expected_updated_rec, updated_message['data'])
        print("updated record is correct")

        #check state again
        state = menagerie.get_state(conn_id)
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")
        chicken_bookmark = state['bookmarks']['logical_1-public-postgres_logical_replication_test']
        self.assertIsNotNone(chicken_bookmark['lsn'],
                             msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn")
        lsn_6 = chicken_bookmark['lsn']
        self.assertGreaterEqual(lsn_6, lsn_5)

        #table_version does NOT change
        self.assertEqual(chicken_bookmark['version'], table_version,
                         msg="expected bookmark for stream public-postgres_logical_replication_test to match version")


        #----------------------------------------------------------------------
        # invoke the sync job one last time. should only get the PREVIOUS update
        #----------------------------------------------------------------------
        sync_job_name = runner.run_sync_mode(self, conn_id)
        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())
        # we will get the previous update record again
        self.assertEqual(record_count_by_stream, {'postgres_logical_replication_test': 1})
        # TODO the next line is not grabbing the record from the latest sync, opening potential for false negatives
        update_message = records_by_stream['postgres_logical_replication_test']['messages'][2]
        self.assertEqual(update_message['action'], 'upsert')

        self.assertEqual(set(update_message['data'].keys()), set(expected_updated_rec.keys()),
                         msg="keys for expected_record_1 are wrong: {}".format(set(update_message['data'].keys()).symmetric_difference(set(expected_updated_rec.keys()))))


        for k,v in update_message['data'].items():
            self.assertEqual(v, expected_updated_rec[k], msg="{} != {} for key {}".format(v, expected_updated_rec[k], k))


        #check state again
        state = menagerie.get_state(conn_id)
        chicken_bookmark = state['bookmarks']['logical_1-public-postgres_logical_replication_test']
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")
        self.assertIsNotNone(chicken_bookmark['lsn'],
                             msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn")
        lsn_7 = chicken_bookmark['lsn']
        self.assertGreaterEqual(lsn_7, lsn_6)

        #table_version does NOT change
        self.assertEqual(chicken_bookmark['version'], table_version,
                         msg="expected bookmark for stream public-postgres_logical_replication_test to match version")
Example No. 3
    def test_run(self):
        """
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                # TODO - test schema matches expectations based on data type, nullable, not nullable, datetimes as string +, etc
                #   This needs to be consistent based on replication method so you can change replication methods
                table_version = records_by_stream[stream]['table_version']

                # verify on the first sync you get activate version message before and after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:-1]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                # TODO - change this to something for mssql once binlog (cdc) is finalized and we know what it is
                self.assertIsNone(
                    bookmark.get('lsn'),
                    msg=
                    "expected bookmark for stream to have NO lsn because we are using full-table replication"
                )

                self.assertEqual(
                    bookmark['version'],
                    table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN and get the same records
        # NOTE: THIS IS ONLY DONE IN THIS TEST. It also tests that we don't send an
        # activate_version message before completion and that the table version is incremented
        # ----------------------------------------------------------------------
        # TODO - update the table to add a column and ensure that discovery adds the new column
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                # TODO - test schema matches expectations based on data type, nullable, not nullable, datetimes as string +, etc
                #   This needs to be consistent based on replication method so you can change replication methods
                # {'action': 'upsert', 'sequence': 1560362044666000001, 'data': {'MySmallIntColumn': 0, 'pk': 1, 'MyIntColumn': 0, 'MyBigIntColumn': 0}}

                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'upsert')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][0:-1]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][0:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]
                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")

                self.assertIsNone(
                    bookmark.get('lsn'),
                    msg=
                    "expected bookmark for stream to have NO lsn because we are using full-table replication"
                )
                self.assertGreater(
                    new_table_version,
                    table_version,
                    msg=
                    "table version {} didn't increate from {} on the second run"
                    .format(new_table_version, table_version))
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
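Both this example and the next rely on BaseTapTest.select_all_streams_and_fields to select every discovered stream with the replication metadata attached. A minimal sketch of that helper follows (hypothetical; the shared base class in the tap's test suite may differ).

@staticmethod
def select_all_streams_and_fields(conn_id, catalogs, additional_md=None):
    """Hypothetical sketch: select every discovered stream and all of its fields,
    attaching any extra metadata (e.g. replication-method) supplied by the caller."""
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema, additional_md or [], [])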
Example No. 4
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'replication_key_column'
            }
        }]

        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        non_selected_properties = []

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get an activate_version
                # message before and after all of the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # sql server dates are emitted as datetimes at midnight UTC
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[1] for row in stream_expected_data[self.VALUES]
                    if row[1] is not None
                ])
                self.assertEqual(bookmark['replication_key_value'],
                                 expected_bookmark.isoformat())
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "dates_and_times"
        column_name = [
            "pk", "replication_key_column", "date_and_time",
            "bigger_range_and_precision_datetime", "datetime_with_timezones",
            "datetime_no_seconds", "its_time"
        ]
        insert_value = [
            (5, date(9999, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (6, date(2018, 12, 29),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        update_value = [
            (3, date(9999, 12, 31),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=10))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (4, date(2018, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=6))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        delete_value = [(2, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
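        # re-declare the expected rows with the offset-aware datetime values converted to UTC,
        # since that is how they are compared against the records emitted to the target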

        insert_value = [
            (5, date(9999, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (6, date(2018, 12, 29),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        update_value = [
            (3, date(9999, 12, 31),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=10))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (4, date(2018, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=6))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]

        insert_value = insert_value[:-1]  # only repl_key >= gets included
        update_value = update_value[:-1]
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][
            "values"] = [(
                1, date(9999, 12, 29),
                datetime(9999, 12, 31, 23, 59, 59, 997000,
                         tzinfo=timezone.utc),
                datetime(9999, 12, 31, 23, 59, 59, 999000,
                         tzinfo=timezone.utc),
                datetime(
                    9999, 12, 31, 10, 14, tzinfo=timezone(
                        timedelta(hours=14))).astimezone(timezone.utc),
                datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
                time(23, 59, 59, tzinfo=timezone.utc))
                         ] + update_value + insert_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]
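                # expected upserts (built below): the selected columns zipped with each expected row,
                # ordered by replication key (rows with a NULL key sort first)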

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:-1]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # sql server dates are emitted as UTC datetimes at midnight
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="expected no log_version bookmark for incremental replication")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="expected no full-table bookmark for incremental replication")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[1] for row in stream_expected_data[self.VALUES]
                    if row[1] is not None
                ])
                self.assertEqual(bookmark['replication_key_value'],
                                 expected_bookmark.isoformat())
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
Example No. 5
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [fc for fc
                          in menagerie.get_catalogs(conn_id)
                          if fc['tap_stream_id'] in self.expected_check_streams()]


        self.assertGreaterEqual(len(found_catalogs),
                                1,
                                msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual(test_table_name, test_catalog['stream_name'])

        print("discovered streams are correct")
        additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}]
        _ = connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog,
                                                               menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
                                                               additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        print("inserting a record")
        our_ts_tz = None
        our_date = None
        our_uuid = str(uuid.uuid1())
        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                # insert fixture data
                our_ts = datetime.datetime(1997, 2, 2, 2, 2, 2, 722184)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_date = datetime.date(1998, 3, 4)
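                # rec_1 values below use PostgreSQL array-literal syntax; doubled braces denote two-dimensional arrays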

                self.rec_1 = {
                    'our_bit_array'         : '{{0,1,1}}',
                    'our_boolean_array'     : '{true}',
                    'our_cidr_array'        : '{{192.168.100.128/25}}',
                    'our_citext_array'      : '{{maGICKal 2}}',
                    'our_date_array'        : '{{{}}}'.format(our_date),
                    'our_decimal_array'     : '{{{}}}'.format(decimal.Decimal('1234567890.01')),
                    'our_double_array'      : '{{1.232323}}',
                    'our_enum_array'        : '{{bad}}',
                    'our_float_array'       : '{{5.23}}',
                    'our_hstore_array'      : """{{"size=>small","name=>betty"}}""",
                    'our_inet_array'        : '{{192.168.100.128/24}}',
                    'our_int_array'         : '{{1,2,3},{4,5,6}}',
                    'our_json_array'        : [psycopg2.extras.Json({'secret' : 55})],
                    'our_jsonb_array'       : [psycopg2.extras.Json({'secret' : 69})],
                    'our_mac_array'         : '{{08:00:2b:01:02:03}}',
                    'our_money_array'       : '{{$412.1234}}',
                    'our_real_array'        : '{{76.33}}',
                    'our_smallint_array'    : '{{10,20,30},{40,50,60}}',
                    'our_string_array'      : '{{one string, two strings}}',
                    'our_text_array'        : '{{three string, four}}',
                    'our_time_array'        : '{{03:04:05}}',
                    'our_ts_tz_array'       : '{{{}}}'.format(our_ts_tz),
                    'our_uuid_array'        : '{{{}}}'.format(our_uuid)}


                insert_record(cur, test_table_name, self.rec_1)


        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())
        self.assertEqual(record_count_by_stream, { test_table_name: 1 })
        records_by_stream = runner.get_records_from_target_output()
        self.assertTrue(len(records_by_stream) > 0)

        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(recs['schema'],
                             expected_schemas[stream],
                             msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))

        self.assertEqual(3, len(records_by_stream[test_table_name]['messages']))
        self.assertEqual(records_by_stream[test_table_name]['messages'][0]['action'],
                         'activate_version')
        self.assertEqual(records_by_stream[test_table_name]['messages'][1]['action'],
                         'upsert')
        self.assertEqual(records_by_stream[test_table_name]['messages'][2]['action'],
                         'activate_version')
        actual_record_1 = records_by_stream[test_table_name]['messages'][1]['data']

        expected_inserted_record = {'id': 1,
                                    'our_bit_array'         : [[False, True, True]],
                                    'our_boolean_array'     : [True],
                                    'our_cidr_array'        : [['192.168.100.128/25']],
                                    'our_citext_array'      : [['maGICKal 2']],
                                    'our_date_array'        : ['1998-03-04T00:00:00+00:00'],
                                    'our_decimal_array'     : [decimal.Decimal('1234567890.01')],
                                    'our_double_array'      : [[decimal.Decimal('1.232323')]],
                                    'our_enum_array'        : [['bad']],
                                    'our_float_array'       : [[decimal.Decimal('5.23')]],
                                    'our_hstore_array'      : [[{'size' : 'small' }, {'name' : 'betty'} ]],
                                    'our_inet_array'        : [['192.168.100.128/24']],
                                    'our_int_array'         : [[1,2,3],[4,5,6]],
                                    'our_json_array'        : [json.dumps({'secret' : 55})],
                                    'our_jsonb_array'       : [json.dumps({'secret' : 69})],
                                    'our_mac_array'         : [['08:00:2b:01:02:03']],
                                    'our_money_array'       : [['$412.12']],
                                    'our_real_array'        : [[decimal.Decimal('76.33')]],
                                    'our_smallint_array'    : [[10,20,30],[40,50,60]],
                                    'our_string_array'      : [['one string', 'two strings']],
                                    'our_text_array'        : [['three string', 'four']],
                                    'our_time_array'        : [['03:04:05']],
                                    'our_ts_tz_array'       : ['1997-02-02T07:02:02.722184+00:00'],
                                    'our_uuid_array'        : ['{}'.format(our_uuid)]
        }

        self.assertEqual(set(actual_record_1.keys()), set(expected_inserted_record.keys()),
                         msg="keys for expected_inserted_record are wrong: {}".format(set(actual_record_1.keys()).symmetric_difference(set(expected_inserted_record.keys()))))

        for k in actual_record_1.keys():
            self.assertEqual(actual_record_1[k], expected_inserted_record[k], msg="{} != {} for key {}".format(actual_record_1[k], expected_inserted_record[k], k))

        print("inserted record is correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)

        bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_array_test']
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")

        self.assertIsNone(bookmark.get('lsn'),
                          msg="expected bookmark for stream to have NO lsn because we are using full-table replication")
Example No. 6
    def test_run(self):
        """
        Verify that a full sync can capture all data and send it in the correct format
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version = records_by_stream[stream]['table_version']

                # verify on the first sync you get activate version message before and after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]
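                # the expected upserts below pair each column name with the corresponding value in the expected row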

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:-1]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, Decimal):
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "decimal value is not represented as a number"
                                )
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                # TODO - change this to something for mssql once binlog (cdc) is finalized and we know what it is
                self.assertIsNone(
                    bookmark.get('lsn'),
                    msg=
                    "expected bookmark for stream to have NO lsn because we are using full-table replication"
                )

                self.assertEqual(
                    bookmark['version'],
                    table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = {
                    "selected": True,
                    "type": "object",
                    "properties": {
                        k: dict(**self.DATATYPE_SCHEMAS[v["sql-datatype"]],
                                selected=True,
                                inclusion=v["inclusion"])
                        for fd in stream_expected_data[self.FIELDS]
                        for k, v in fd.items()
                    }
                }

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))
Example No. 7
    def test_run(self):
        # SYNC 1
        conn_id = self.ensure_connection()

        # Run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        # Select only the expected streams tables
        expected_streams = self.expected_streams()
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]
        self.select_all_streams_and_fields(conn_id, catalog_entries)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)
        first_sync_records = runner.get_records_from_target_output()
        first_sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        first_sync_bookmarks = menagerie.get_state(conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # UPDATE STATE BETWEEN SYNCS
        new_state = dict()
        new_state['bookmarks'] = {
            key: {
                'LastUpdatedTime': value
            }
            for key, value in self.calculated_states_by_stream(
                first_sync_bookmarks).items()
        }
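        # illustrative shape of the simulated state (stream names depend on the tap under test):
        #   {'bookmarks': {'<stream>': {'LastUpdatedTime': '<ISO-8601 timestamp>'}}}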
        menagerie.set_state(conn_id, new_state)

        # SYNC 2
        sync_job_name = runner.run_sync_mode(self, conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        second_sync_bookmarks = menagerie.get_state(conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Test by stream
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                # record counts
                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)

                # record messages
                first_sync_messages = first_sync_records.get(
                    stream, {
                        'messages': []
                    }).get('messages')
                second_sync_messages = second_sync_records.get(
                    stream, {
                        'messages': []
                    }).get('messages')

                # replication key is an object (MetaData.LastUpdatedTime) in sync records
                # but just the sub level replication key is used in setting bookmarks
                top_level_replication_key = 'MetaData'
                sub_level_replication_key = 'LastUpdatedTime'
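                # e.g. a record's replication key value lives at message['data']['MetaData']['LastUpdatedTime']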

                # bookmarked states (top level objects)
                first_bookmark_key_value = first_sync_bookmarks.get(
                    'bookmarks').get(stream)
                second_bookmark_key_value = second_sync_bookmarks.get(
                    'bookmarks').get(stream)

                # Verify the first sync sets a bookmark of the expected form
                self.assertIsNotNone(first_bookmark_key_value)
                self.assertIsNotNone(
                    first_bookmark_key_value.get(sub_level_replication_key))

                # Verify the second sync sets a bookmark of the expected form
                self.assertIsNotNone(second_bookmark_key_value)
                self.assertIsNotNone(
                    second_bookmark_key_value.get(sub_level_replication_key))

                # bookmarked states (actual values)
                first_bookmark_value = first_bookmark_key_value.get(
                    sub_level_replication_key)
                second_bookmark_value = second_bookmark_key_value.get(
                    sub_level_replication_key)
                # bookmarked values as utc for comparing against records
                first_bookmark_value_utc = self.convert_state_to_utc(
                    first_bookmark_value)
                second_bookmark_value_utc = self.convert_state_to_utc(
                    second_bookmark_value)

                # Verify the second sync bookmark is Equal to the first sync bookmark
                self.assertEqual(second_bookmark_value, first_bookmark_value
                                 )  # assumes no changes to data during test

                # Verify the second sync records respect the previous (simulated) bookmark value
                simulated_bookmark_value = new_state['bookmarks'][stream][
                    sub_level_replication_key]
                for message in second_sync_messages:
                    replication_key_value = message.get('data').get(
                        top_level_replication_key).get(
                            sub_level_replication_key)
                    self.assertGreaterEqual(
                        replication_key_value,
                        simulated_bookmark_value,
                        msg=
                        "Second sync records do not repect the previous bookmark."
                    )

                # Verify the first sync bookmark value is the max replication key value for a given stream
                for message in first_sync_messages:
                    replication_key_value = message.get('data').get(
                        top_level_replication_key).get(
                            sub_level_replication_key)
                    self.assertLessEqual(
                        replication_key_value,
                        first_bookmark_value_utc,
                        msg=
                        "First sync bookmark was set incorrectly, a record with a greater rep key value was synced"
                    )

                # Verify the second sync bookmark value is the max replication key value for a given stream
                for message in second_sync_messages:
                    replication_key_value = message.get('data').get(
                        top_level_replication_key).get(
                            sub_level_replication_key)
                    self.assertLessEqual(
                        replication_key_value,
                        second_bookmark_value_utc,
                        msg=
                        "Second sync bookmark was set incorrectly, a record with a greater rep key value was synced"
                    )

                # Verify the number of records in the second sync is less than in the first
                self.assertLess(second_sync_count, first_sync_count)

                # Verify at least 1 record was replicated in the second sync
                self.assertGreater(
                    second_sync_count,
                    0,
                    msg="We are not fully testing bookmarking for {}".format(
                        stream))
Example No. 8
    def test_run(self):

        conn_id = connections.ensure_connection(self, payload_hook=None)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")
        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], [])

        # Verify that all streams sync at least one row for the initial sync.
        # This test also verifies access-token expiration handling: if the test fails with
        # an authentication error, the refresh token was not replaced after expiring.
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        zero_count_streams = {
            k
            for k, v in record_count_by_stream.items() if v == 0
        }
        self.assertFalse(
            zero_count_streams,
            msg="The following streams did not sync any rows {}".format(
                zero_count_streams))

        # Verify that bookmark values are correct after incremental sync
        bookmark_props = configuration['bookmark']
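        # `configuration` is assumed to be a test-level settings dict providing the bookmark's
        # stream name ('bookmark_dict'), key ('bookmark_key'), and expected value ('bookmark_timestamp')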
        current_state = menagerie.get_state(conn_id)
        test_bookmark = current_state['bookmarks'][
            bookmark_props['bookmark_dict']][bookmark_props['bookmark_key']]
        print(test_bookmark)
        self.assertTrue(
            test_bookmark == bookmark_props['bookmark_timestamp'],
            msg="The bookmark value does not match the expected result")
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [fc for fc
                          in menagerie.get_catalogs(conn_id)
                          if fc['tap_stream_id'] in self.expected_check_streams()]


        self.assertGreaterEqual(len(found_catalogs),
                                1,
                                msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]
        print('Catalog', test_catalog)
        self.assertEqual('postgres_full_table_replication_test', test_catalog['stream_name'])

        print("discovered streams are correct")

        print('checking discovered metadata for public-postgres_full_table_replication_test...')
        md = menagerie.get_annotated_schema(conn_id, test_catalog['stream_id'])['metadata']

        self.assertEqual(
            {('properties', 'our_varchar'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'character varying'},
             ('properties', 'our_boolean'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'boolean'},
             ('properties', 'our_real'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'real'},
             ('properties', 'our_uuid'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'uuid'},
             ('properties', 'our_bit'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'bit'},
             ('properties', 'OUR TS TZ'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'timestamp with time zone'},
             ('properties', 'our_varchar_10'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'character varying'},
             ('properties', 'our_store'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'hstore'},
             ('properties', 'our_citext'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'citext'},
             ('properties', 'OUR TIME'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'time without time zone'},
             ('properties', 'our_decimal'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'numeric'},
             ('properties', 'OUR TS'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'timestamp without time zone'},
             ('properties', 'our_jsonb'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'jsonb'},
             ('properties', 'OUR TIME TZ'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'time with time zone'},
             ('properties', 'our_text'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'text'},
             ('properties', 'OUR DATE'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'date'},
             ('properties', 'our_double'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'double precision'},
             (): {'is-view': False, 'schema-name': 'public', 'table-key-properties': ['id'], 'database-name': 'dev', 'row-count': 0},
             ('properties', 'our_bigint'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'bigint'},
             ('properties', 'id'): {'inclusion': 'automatic', 'selected-by-default': True, 'sql-datatype': 'integer'},
             ('properties', 'our_json'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'json'},
             ('properties', 'our_smallint'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'smallint'},
             ('properties', 'our_integer'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'integer'},
             ('properties', 'our_inet'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'inet'},
             ('properties', 'our_cidr'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'cidr'},
             ('properties', 'our_mac'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'macaddr'},
             ('properties', 'our_alignment_enum'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'alignment'},
             ('properties', 'our_money'): {'inclusion': 'available', 'selected-by-default': True, 'sql-datatype': 'money'}},
            metadata.to_map(md))

        additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog,
                                                                               menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
                                                                               additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())


        self.assertEqual(record_count_by_stream, { 'postgres_full_table_replication_test': 3})
        records_by_stream = runner.get_records_from_target_output()

        table_version = records_by_stream['postgres_full_table_replication_test']['table_version']

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][0]['action'],
                         'activate_version')

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][1]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][2]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][3]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][4]['action'],
                         'activate_version')

        # verifications about individual records
        for table_name, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(recs['schema'],
                             expected_schemas[table_name],
                             msg="Persisted schema did not match expected schema for table `{}`.".format(table_name))

        expected_record_1 = {'our_decimal': decimal.Decimal('.01'),
                             'our_text': 'some text',
                             'our_bit': False,
                             'our_integer': 44100,
                             'our_double': decimal.Decimal('1.1'),
                             'id': 1,
                             'our_json': '{"secret": 55}',
                             'our_boolean': True,
                             'our_jsonb': '{"burgers": "good"}',
                             'our_bigint': 1000000,
                             'OUR TS': '1997-02-02T02:02:02.722184+00:00',
                             'OUR TS TZ': '1997-02-02T07:02:02.722184+00:00',
                             'OUR TIME': '12:11:10',
                             'OUR TIME TZ': '12:11:10-04:00',
                             'our_store': {"name" : "betty", "size" :"small"},
                             'our_smallint': 1,
                             'OUR DATE': '1998-03-04T00:00:00+00:00',
                             'our_varchar': 'our_varchar',
                             'our_uuid': self.rec_1['our_uuid'],
                             'our_real': decimal.Decimal('1.2'),
                             'our_varchar_10': 'varchar_10',
                             'our_citext'    : self.rec_1['our_citext'],
                             'our_inet'    : self.rec_1['our_inet'],
                             'our_cidr'    : self.rec_1['our_cidr'],
                             'our_mac'    : self.rec_1['our_mac'],
                             'our_alignment_enum' : self.rec_1['our_alignment_enum'],
                             'our_money'      : '$100.11'
        }

        expected_record_2 = {'our_decimal': decimal.Decimal('.02'),
                             'OUR TIME': '10:09:08',
                             'our_text': 'some text 2',
                             'our_bit': True,
                             'our_integer': 44101,
                             'our_double': decimal.Decimal('1.1'),
                             'id': 2,
                             'our_json': '["nymn 77"]',
                             'our_boolean': True,
                             'our_jsonb': '{"burgers": "good++"}',
                             'our_bigint': 1000001,
                             'OUR TIME TZ': '10:09:08-04:00',
                             'our_store': {"name" : "betty", "dances" :"floor"},
                             'OUR TS TZ': '1987-03-03T08:03:03.733184+00:00',
                             'our_smallint': 2,
                             'OUR DATE': '1964-07-01T00:00:00+00:00',
                             'our_varchar': 'our_varchar 2',
                             'OUR TS': '1987-03-03T03:03:03.733184+00:00',
                             'our_uuid': self.rec_2['our_uuid'],
                             'our_real': decimal.Decimal('1.2'),
                             'our_varchar_10': 'varchar_10',
                             'our_citext'    : self.rec_2['our_citext'],
                             'our_inet'    : self.rec_2['our_inet'],
                             'our_cidr'    : self.rec_2['our_cidr'],
                             'our_mac'     : self.rec_2['our_mac'],
                             'our_alignment_enum' : None,
                             'our_money':    None
        }

        actual_record_1 = records_by_stream['postgres_full_table_replication_test']['messages'][1]
        self.assertEqual(set(actual_record_1['data'].keys()), set(expected_record_1.keys()),
                         msg="keys for expected_record_1 are wrong: {}".format(set(actual_record_1.keys()).symmetric_difference(set(expected_record_1.keys()))))

        for k,v in actual_record_1['data'].items():
            self.assertEqual(actual_record_1['data'][k], expected_record_1[k], msg="{} != {} for key {}".format(actual_record_1['data'][k], expected_record_1[k], k))

        actual_record_2 = records_by_stream['postgres_full_table_replication_test']['messages'][2]
        self.assertEqual(set(actual_record_2['data'].keys()), set(expected_record_2.keys()),
                         msg="keys for expected_record_2 are wrong: {}".format(set(actual_record_2.keys()).symmetric_difference(set(expected_record_2.keys()))))

        for k,v in actual_record_2['data'].items():
            self.assertEqual(actual_record_2['data'][k], expected_record_2[k], msg="{} != {} for key {}".format(actual_record_2['data'][k], expected_record_2[k], k))

        # We cast NaN, +Inf, and -Inf to NULL as wal2json does not support them, so now we are at least consistent(ly wrong)
        expected_record_3 = {'our_decimal' : None,
                             'our_double' : None,
                             'our_real' : None}
        actual_record_3 = records_by_stream['postgres_full_table_replication_test']['messages'][3]
        for k,v in expected_record_3.items():
            self.assertEqual(actual_record_3['data'][k], v, msg="{} != {} for key {}".format(actual_record_3['data'][k], v, k))


        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)

        bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_test']
        self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None")


        self.assertIsNone(bookmark.get('lsn'),
                          msg="expected bookmark for stream to have NO lsn because we are using full-table replication")
        self.assertEqual(bookmark['version'], table_version,
                         msg="expected bookmark for stream to match version")

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN and get the same 3 records
        #----------------------------------------------------------------------
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        self.assertEqual(record_count_by_stream, { 'postgres_full_table_replication_test': 3})
        records_by_stream = runner.get_records_from_target_output()

        new_table_version = records_by_stream['postgres_full_table_replication_test']['table_version']

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][0]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][1]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][2]['action'],
                         'upsert')

        self.assertEqual(records_by_stream['postgres_full_table_replication_test']['messages'][3]['action'],
                         'activate_version')

        new_table_version = records_by_stream['postgres_full_table_replication_test']['table_version']

        self.assertGreater(new_table_version, table_version,
                           msg="table version {} didn't increase from {} on the second run".format(new_table_version, table_version))

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(recs['schema'],
                             expected_schemas[stream],
                             msg="Persisted schema did not match expected schema for stream `{}`.".format(stream))
Example No. 10
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        # Select all Catalogs
        for catalog in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count =  reduce(lambda accum,c : accum + c, record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        max_bookmarks_from_records = runner.get_most_recent_records_from_target(self, self.expected_bookmarks(), self.get_properties()['start_date'])

        start_of_today =  utils.strftime(datetime.datetime(datetime.datetime.utcnow().year, datetime.datetime.utcnow().month, datetime.datetime.utcnow().day, 0, 0, 0, 0, datetime.timezone.utc))
        max_bookmarks_from_records['subscription_changes'] = start_of_today
        max_bookmarks_from_records['email_events'] = start_of_today
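        # subscription_changes and email_events bookmark at the start of the current UTC day,
        # so override their record-derived maxima with start_of_today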


        #if we didn't replicate data, the bookmark should be the start_date
        for k in self.expected_bookmarks().keys():
            if max_bookmarks_from_records.get(k) is None:
                max_bookmarks_from_records[k] = utils.strftime(datetime.datetime(2017, 5, 1, 0, 0, 0, 0, datetime.timezone.utc))

        state = menagerie.get_state(conn_id)
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())

        #verify bookmarks and offsets
        for k,v in sorted(list(self.expected_bookmarks().items())):
            for w in v:
                bk_value = bookmarks.get(k,{}).get(w)
                self.assertEqual(utils.strptime_with_tz(bk_value), utils.strptime_with_tz(max_bookmarks_from_records[k]), "Bookmark {} ({}) for stream {} should have been updated to {}".format(bk_value, w, k, max_bookmarks_from_records[k]))
                print("bookmark {}({}) updated to {} from max record value {}".format(k, w, bk_value, max_bookmarks_from_records[k]))

        for k,v in self.expected_offsets().items():
            self.assertEqual(bookmarks.get(k,{}).get('offset', {}), v, msg="unexpected offset found for stream {} {}. state: {}".format(k, v, state))
            print("offsets {} cleared".format(k))

        diff = bookmark_streams.difference(self.acceptable_bookmarks())
        self.assertEqual(len(diff), 0, msg="Unexpected bookmarks: {} Expected: {} Actual: {}".format(diff, self.acceptable_bookmarks(), bookmarks))

        self.assertEqual(state.get('currently_syncing'), None,
                         msg="Unexpected `currently_syncing` bookmark value: {} Expected: None".format(state.get('currently_syncing')))
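
The assertions above lean on a few expectation helpers that are not shown in this snippet. A minimal sketch of what they might return is below; the stream names come from this test, but the bookmark key names and offset streams are assumptions for illustration only.

    def expected_bookmarks(self):
        # map each stream to the bookmark key(s) the tap is expected to write
        return {
            'subscription_changes': ['startTimestamp'],  # assumed key name
            'email_events': ['startTimestamp'],          # assumed key name
        }

    def expected_offsets(self):
        # streams whose offsets should be cleared once a sync completes
        return {'contacts': {}, 'companies': {}}  # assumed stream names
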
    def test_run(self):
        """
        Verify that a bookmark doesn't exist for the stream
        Verify that the second sync includes the same number or more records than the first sync
        Verify that all records in the first sync are included in the second sync
        Verify that the sync only sent records to the target for selected streams (catalogs)

        PREREQUISITE
        For EACH stream that is fully replicated there are multiple rows of data with
            different values for the replication key
        """
        CREATED_RECORDS = {x: [] for x in self.expected_streams()}
        UPDATED_RECORDS = {x: [] for x in self.expected_streams()}

        # Ensure data exists prior to test for all full table streams
        expected_records_1 = {x: [] for x in self.expected_streams()}
        for stream in self.expected_full_table_streams():
            existing_objects = self.client.get_all(stream)
            assert existing_objects, "Test data is not properly set for {}, test will fail.".format(stream)
            print("Data exists for stream: {}".format(stream))
            for obj in existing_objects:
                expected_records_1[stream].append(obj)

        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Select all full table streams and no fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        full_streams = {key for key, value in self.expected_replication_method().items()
                        if value == self.FULL}
        our_catalogs = [catalog for catalog in found_catalogs if
                        catalog.get('tap_stream_id') in full_streams]
        self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(set(first_sync_record_count.keys()), full_streams,
                         msg="Expect first_sync_record_count keys {} to equal full_streams {},"
                         " first_sync_record_count was {}".format(
                             first_sync_record_count.keys(),
                             full_streams,
                             first_sync_record_count))

        first_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a first sync
        first_sync_records = runner.get_records_from_target_output()

        # Create 1 new record for every full table stream
        N = 1  # number of creates/updates between syncs
        expected_records_2 = {x: [] for x in self.expected_streams()}
        for stream in self.streams_creatable():
            for _ in range(N):
                print("CREATING A RECORD FOR STREAM: {}".format(stream))
                new_object = self.client.create(stream)
                expected_records_2[stream].append(new_object)
                CREATED_RECORDS[stream].append(new_object)

        # Update 1 existing record for every full table stream
        for stream in self.streams_creatable():
            for _ in range(N):
                print("UDPATING A RECORD FOR STREAM: {}".format(stream))
                # eid = expected_records_1.get(stream)[-1] # most recent record prior to test
                updated_object = self.client.update(stream)
                expected_records_2[stream].append(updated_object)
                UPDATED_RECORDS[stream].append(updated_object)

        # adjust expectations to include expected_records_1
        for stream in self.streams_creatable():
            for record in expected_records_1.get(stream):
                if record.get('eid') in [ex_rec.get('eid') for ex_rec in expected_records_2.get(stream, [])]:
                    continue  # don't add a record to expectations twice
                expected_records_2[stream].append(record)

        # Run a second sync job using orchestrator
        second_sync_record_count = self.run_sync(conn_id)

        # Get the set of records from a second sync
        second_sync_records = runner.get_records_from_target_output()

        # Loop over first_sync_records and compare against second_sync_records;
        # each iteration of the loop checks both syncs for a given stream,
        # e.g. first_sync_records["ads"] vs second_sync_records["ads"]
        for stream in full_streams:
            with self.subTest(stream=stream):
                # RECORD COUNT
                record_count_1 = first_sync_record_count.get(stream, 0)
                record_count_2 = second_sync_record_count.get(stream, 0)
                # ACTUAL RECORDS
                records_from_sync_1 = set(row.get('data', {}).get('eid')
                                          for row in first_sync_records.get(stream, {}).get('messages', []))
                records_from_sync_2 = set(row.get('data', {}).get('eid')
                                          for row in second_sync_records.get(stream, {}).get('messages', []))
                # EXPECTED_RECORDS
                expected_records_from_sync_1 = set(record.get('eid') for record in expected_records_1.get(stream, []))
                expected_records_from_sync_2 = set(record.get('eid') for record in expected_records_2.get(stream, []))
                
                # verify there is no bookmark values from state
                state_value = first_sync_state.get("bookmarks", {}).get(stream)
                self.assertIsNone(state_value)

                # verify that there is more than 1 record of data - setup necessary
                self.assertGreater(record_count_1, 1, msg="Data isn't set up to be able to test full sync")

                # verify that you get the same or more data the 2nd time around
                self.assertGreaterEqual(record_count_2, record_count_1,
                                        msg="second sync didn't have more records, full sync not verified")

                # verify all expected records were replicated for first sync
                self.assertEqual(
                        set(), records_from_sync_1.symmetric_difference(expected_records_from_sync_1),
                        msg="1st Sync records do not match expectations.\n" +
                        "MISSING RECORDS: {}\n".format(expected_records_from_sync_1.symmetric_difference(records_from_sync_1)) +
                        "ADDITIONAL RECORDS: {}".format(records_from_sync_1.symmetric_difference(expected_records_from_sync_1))
                )

                # verify all data from 1st sync included in 2nd sync
                self.assertEqual(set(), records_from_sync_1.difference(records_from_sync_2),
                                 msg="Data in 1st sync missing from 2nd sync")

                # testing streams with new and updated data
                if stream in self.streams_creatable():

                    # verify that the record count has increased by N record in the 2nd sync, where
                    # N = the number of new records created between sync 1 and sync 2
                    self.assertEqual(record_count_2, record_count_1 + N,
                                     msg="Expected {} new records to be captured by the 2nd sync.\n".format(N) +
                                     "Record Count 1: {}\nRecord Count 2: {}".format(record_count_1, record_count_2)
                    )

                    # verify that the newly created and updated records are captured by the 2nd sync
                    self.assertEqual(
                        set(), records_from_sync_2.symmetric_difference(expected_records_from_sync_2),
                        msg="2nd Sync records do not match expectations.\n" +
                        "MISSING RECORDS: {}\n".format(expected_records_from_sync_2.difference(records_from_sync_2)) +
                        "ADDITIONAL RECORDS: {}".format(records_from_sync_2.difference(expected_records_from_sync_2))
                    )

                    # verify that the updated records are correctly captured by the 2nd sync
                    expected_updated_records = set(record.get('eid') for record in expected_records_2.get(stream, [])
                                                   if "UPDATED" in record.get('name', '').upper())
                    if stream == 'segments': # Account for 'display name' in segments
                        expected_updated_records.update(set(record.get('eid') for record in expected_records_2.get(stream, [])
                                                            if "UPDATED" in record.get('display_name', '').upper()))
                    if expected_updated_records:
                        updated_records_from_sync_2 = set(row.get('data', {}).get('eid')
                                                          for row in second_sync_records.get(stream, {}).get('messages', [])
                                                          if "UPDATED" in row.get('data', {}).get('name', '').upper())
                        if stream == 'segments': # Account for 'display name' in segments
                            updated_records_from_sync_2.update(set(row.get('data', {}).get('eid')
                                                                   for row in second_sync_records.get(stream, {}).get('messages', [])
                                                                   if "UPDATED" in row.get('data', {}).get('display_name', '').upper()))

                        # check that the updated records are present in the target
                        self.assertEqual(
                            set(), updated_records_from_sync_2.symmetric_difference(expected_updated_records),
                            msg="Failed to replicate the updated {} record(s)\n".format(stream) +
                            "MISSING RECORDS: {}\n".format(expected_updated_records.difference(updated_records_from_sync_2)) +
                            "ADDITIONAL RECORDS: {}\n".format(updated_records_from_sync_2.difference(expected_updated_records))
                        )
                        # check that the record data matches expectations
                        self.assertEqual(len(UPDATED_RECORDS.get(stream, [])), 1, msg="Expectations are invalid")
                        updated_record = UPDATED_RECORDS.get(stream, []).pop()

                        record_name = [row.get('data', {}).get('name')
                                       for row in second_sync_records.get(stream, {}).get('messages', [])
                                       if row.get('data', {}).get('eid') == updated_record.get('eid')]
                        expected_record_name = updated_record.get('name')
                        self.assertEqual(len(record_name), 1, msg="Updated record was duplicated.")

                        self.assertEqual(expected_record_name, record_name.pop(),
                                         msg="Update was not captured correctly.")
Example #12
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]

        self.assertEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        chicken_catalog = found_catalogs[0]

        self.assertEqual('chicken_view', chicken_catalog['stream_name'])
        print("discovered streams are correct")

        print('checking discovered metadata for ROOT-CHICKEN_VIEW')
        md = menagerie.get_annotated_schema(
            conn_id, chicken_catalog['stream_id'])['metadata']

        self.assertEqual(
            {
                (): {
                    'database-name': 'postgres',
                    'is-view': True,
                    'row-count': 0,
                    'schema-name': 'public',
                    'table-key-properties': []
                },
                ('properties', 'fk_id'): {
                    'inclusion': 'available',
                    'sql-datatype': 'bigint',
                    'selected-by-default': True
                },
                ('properties', 'name'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'age'): {
                    'inclusion': 'available',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                },
                ('properties', 'size'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'id'): {
                    'inclusion': 'available',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                },
                ('properties', 'updated_at'): {
                    'selected-by-default': True,
                    'inclusion': 'available',
                    'sql-datatype': 'timestamp with time zone'
                }
            }, metadata.to_map(md))

        # 'ID' selected as view-key-properties, updated_at is replication_key
        replication_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-key': 'updated_at',
                "replication-method": "INCREMENTAL",
                'view-key-properties': ["id"]
            }
        }]

        connections.select_catalog_and_fields_via_metadata(
            conn_id, chicken_catalog,
            menagerie.get_annotated_schema(conn_id,
                                           chicken_catalog['stream_id']),
            replication_md)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(record_count_by_stream, {'chicken_view': 1})
        records_by_stream = runner.get_records_from_target_output()

        table_version = records_by_stream['chicken_view']['table_version']
        self.assertEqual(2, len(records_by_stream['chicken_view']['messages']))
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][0]['action'],
            'activate_version')
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][1]['action'],
            'upsert')

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        actual_chicken_record = records_by_stream['chicken_view']['messages'][
            1]['data']

        expected_chicken_record = {
            'id': 1,
            'fk_id': 1,
            'name': 'fred',
            'age': 99,
            'updated_at': '2111-01-01T12:12:12.222111+00:00',
            'size': 'big'
        }
        self.assertEqual(
            actual_chicken_record,
            expected_chicken_record,
            msg="Expected `chicken_view` upsert record data to be {}, but target output was {}"
            .format(expected_chicken_record, actual_chicken_record))

        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)

        chicken_bookmark = state['bookmarks']['postgres-public-chicken_view']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        self.assertEqual(
            chicken_bookmark['version'],
            table_version,
            msg="expected bookmark for stream ROOT-CHICKEN to match version")
        self.assertEqual(chicken_bookmark['replication_key'], 'updated_at')
        self.assertEqual(chicken_bookmark['replication_key_value'],
                         '2111-01-01T12:12:12.222111+00:00')
        print("bookmarks are correct")
Example #13
    def test_run(self):
        # Default test setup
        # Create the connection for Zendesk
        conn_id = connections.ensure_connection(self)

        # Run a check job using orchestrator
        check_job_name = runner.run_check_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify the expected schemas were discovered
        self.found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertEqual(len(self.found_catalogs),
                         len(self.expected_check_streams()))

        # Verify the schemas discovered were exactly what we expect
        found_catalog_names = {
            catalog['tap_stream_id']
            for catalog in self.found_catalogs
            if catalog['tap_stream_id'] in self.expected_check_streams()
        }
        self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

        # Select our catalogs
        our_catalogs = [
            c for c in self.found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            # Tags table only has name and count columns; don't select count
            connections.select_catalog_and_fields_via_metadata(
                conn_id, c, c_annotated, [], ['name'])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records are retrieving the sub set of fields
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream).get('messages')
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(
                        m.get('data', {}).get(pk),
                        msg="Missing primary-key for message {}".format(m))
Example #14
    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        catalog = menagerie.get_catalog(conn_id)
        found_catalogs = menagerie.get_catalogs(conn_id)
        found_streams = {entry['tap_stream_id'] for entry in catalog['streams']}
        self.assertSetEqual(self.expected_check_streams(), found_streams)

        # verify the tap discovered stream metadata is consistent with the source database
        for tap_stream_id in self.expected_check_streams():
            with self.subTest(stream=tap_stream_id):

                # gather expectations
                stream = tap_stream_id.split('-')[1]
                expected_primary_key = self.expected_pks()[stream]
                expected_row_count = self.expected_row_counts()[stream]
                expected_replication_keys = self.expected_valid_replication_keys()[stream]

                # gather results
                found_stream = [entry for entry in catalog['streams'] if entry['tap_stream_id'] == tap_stream_id][0]
                stream_metadata = [entry['metadata'] for entry in found_stream['metadata'] if entry['breadcrumb']==[]][0]
                primary_key = set(stream_metadata.get('table-key-properties'))
                row_count = stream_metadata.get('row-count')
                replication_key = set(stream_metadata.get('valid-replication-keys'))

                # assert that the pks are correct
                self.assertSetEqual(expected_primary_key, primary_key)

                # assert that the row counts are correct
                self.assertEqual(expected_row_count, row_count)

                # assert that valid replication keys are correct
                self.assertSetEqual(replication_key, expected_replication_keys)

        #  -----------------------------------
        # ----------- Initial Sync ---------
        #  -----------------------------------

        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            rep_key = 'date_field'
            for key in self.key_names():
                if key in stream_catalog['stream_name']:
                    rep_key = key
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL',
                                                                'replication-key': rep_key}}]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id,
                                                                                   stream_catalog,
                                                                                   annotated_schema,
                                                                                   additional_md)

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        # gather expectations
        expected_schema = {'type': 'object'}

        for tap_stream_id in self.expected_sync_streams():
            with self.subTest(stream=tap_stream_id):

                # gather results
                persisted_schema = messages_by_stream[tap_stream_id]['schema']

                # assert the schema is an object
                self.assertDictEqual(expected_schema, persisted_schema)

        # verify that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # verify that the entire collection was synced by comparing row counts against the source
        for tap_stream_id in self.expected_sync_streams():
            with self.subTest(stream=tap_stream_id):

                expected_row_count = self.expected_row_counts()[tap_stream_id]
                row_count = record_count_by_stream[tap_stream_id]

                self.assertEqual(expected_row_count, row_count)

        # verify state is saved in the proper format for all streams
        state = menagerie.get_state(conn_id)
        expected_state_keys = {
            'last_replication_method',
            'replication_key_name',
            'replication_key_type',
            'replication_key_value',
            'version',
        }
        for tap_stream_id in self.expected_check_streams():
            with self.subTest(stream=tap_stream_id):
                bookmark = state['bookmarks'][tap_stream_id]

                # gather expectations
                stream = tap_stream_id.split('-')[1]
                expected_replication_keys = self.expected_valid_replication_keys()[stream]

                # gather results
                replication_key = bookmark['replication_key_name']
                replication_key_type = bookmark['replication_key_type']

                # assert that all expected bookmark keys are present
                self.assertSetEqual(expected_state_keys, set(bookmark.keys()))

                # assert all bookmark keys have values
                for key in expected_state_keys:
                    self.assertIsNotNone(bookmark[key])

                # assert incremental sync was performed
                self.assertEqual('INCREMENTAL', bookmark['last_replication_method'])

                # assert the replication key was used to save state
                self.assertIn(replication_key, expected_replication_keys)

                # assert the replication key type is a valid datatype
                self.assertIn(replication_key_type, VALID_REPLICATION_TYPES)

                self.assertIsNone(state['currently_syncing'])

        # -----------------------------------
        # ------------ Second Sync ----------
        # -----------------------------------

        # Perform data manipulations
        with get_test_connection() as client:

            # update 1 document in each of the collection
            update_doc_coll_1 = client["simple_db"]["simple_coll_1"].find_one()
            client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": update_doc_coll_1["_id"]}, {"$set": {"date_field": datetime(2020, 1, 1, 19, 29, 14, 578000)}})

            update_doc_coll_2 = client["simple_db"]["simple_coll_2"].find_one()
            client["simple_db"]["simple_coll_2"].find_one_and_update({"_id": update_doc_coll_2["_id"]}, {"$set": {"date_field": datetime(2020, 1, 1, 19, 29, 14, 578000)}})

            for key_name in self.key_names():
                coll = client["simple_db"]["simple_coll_{}".format(key_name)]
                if key_name in ('int_field', 'double_field', '64_bit_int_field'):
                    # read the current max replication-key value, then bump the first
                    # document in the collection past it so the next sync picks it up
                    doc_to_update = coll.find_one(sort=[(key_name, -1)])
                    value = doc_to_update[key_name]
                    numeric_based_coll = coll.find_one()
                    coll.find_one_and_update({"_id": numeric_based_coll["_id"]}, {"$set": {key_name: value + 3}})
                elif key_name == 'date_field':
                    date_based_coll = coll.find_one()
                    coll.find_one_and_update({"_id": date_based_coll["_id"]}, {"$set": {key_name: datetime(2021, 1, 1, 15, 30, 14, 222000)}})
                elif key_name == 'timestamp_field':
                    timestamp_based_coll = coll.find_one()
                    coll.find_one_and_update({"_id": timestamp_based_coll["_id"]}, {"$set": {key_name: bson.timestamp.Timestamp(1565897157+99, 1)}})

            # TODO : figure out how to update collections with replication key = string, uuid

            # insert two documents with date_field > bookmark for next sync
            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field": 50,
                "string_field": z_string_generator(),
                "date_field": datetime(2018, 9, 13, 19, 29, 14, 578000),
                "double_field": 51.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+50, 1),
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282050'),
                "64_bit_int_field": 34359738368 + 50
            })
            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field": 51,
                "string_field": z_string_generator(),
                "date_field": datetime(2018, 9, 18, 19, 29, 14, 578000),
                "double_field": 52.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+51, 1),
                "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282051'),
                "64_bit_int_field": 34359738368 + 51
            })

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field": 100,
                "string_field": z_string_generator(),
                "date_field": datetime(2019, 5, 21, 19, 29, 14, 578000),
                "double_field": 101.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+100, 1),
                "uuid_field":uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282100'),
                "64_bit_int_field": 34359738368 + 100
            })
            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field": 101,
                "string_field": z_string_generator(),
                "date_field": datetime(2019, 5, 26, 19, 29, 14, 578000),
                "double_field": 102.001,
                "timestamp_field": bson.timestamp.Timestamp(1565897157+101, 1),
                "uuid_field":  uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282101'),
                "64_bit_int_field": 34359738368 + 101
            })

            for key_name in self.key_names():
                client["simple_db"]["simple_coll_{}".format(key_name)].insert_one({
                    "int_field": 50,
                    "string_field": z_string_generator(50),
                    "date_field": datetime(2018, 9, 13, 19, 29, 15, 578000),
                    "double_field": 51.001,
                    "timestamp_field": bson.timestamp.Timestamp(1565897157+50, 1),
                    "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282050'),
                    "64_bit_int_field": 34359738368 + 50
                })
                client["simple_db"]["simple_coll_{}".format(key_name)].insert_one({
                    "int_field": 51,
                    "string_field": z_string_generator(51),
                    "date_field": datetime(2018, 9, 18, 19, 29, 16, 578000),
                    "double_field": 52.001,
                    "timestamp_field": bson.timestamp.Timestamp(1565897157+51, 1),
                    "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec7282051'),
                    "64_bit_int_field": 34359738368 + 51
                })

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert']

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # Verify that we got 4 records for each stream: 2 new records, 1 updated record,
        # and 1 from the >= overlap (key-based incremental always re-syncs the record
        # sitting at the bookmark value)
        for k, v in record_count_by_stream.items():
            # Workaround for not including collections for uuid and string, TODO : look for a solution to implement string and uuid as replication_key
            if k not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                self.assertEqual(4, v)

        # Verify that the records sent correspond to the documents that changed,
        # using int_field values as a proxy for identity
        for stream_name in self.expected_sync_streams():
            # Workaround for not including collections for uuid and string, TODO : look for a solution to implement string and uuid as replication_key
            if stream_name not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                actual = set([x['data']['int_field'] for x in records_by_stream[stream_name]])
                self.assertEqual(self.expected_incremental_int_fields()[stream_name], actual)

        ##############################################################################
        # Verify that data is not replicated when non replication key is updated
        ##############################################################################

        # Update a non-replication-key field on a document we know exists from the data setup
        with get_test_connection() as client:
            no_rep_doc_coll_1 = client["simple_db"]["simple_coll_1"].find_one({"int_field": 20})
            client["simple_db"]["simple_coll_1"].find_one_and_update({"_id": no_rep_doc_coll_1["_id"]}, {"$set": {"string_field": 'No_replication'}})

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        messages_by_stream = runner.get_records_from_target_output()
        second_state = menagerie.get_state(conn_id)
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [x for x in messages_by_stream[stream_name]['messages'] if x.get('action') == 'upsert']

        doc_from_simple_coll_1 = records_by_stream['simple_coll_1']

        # Verify the document from simple_coll_1 does not correspond to the document which we updated_data
        self.assertNotEqual(doc_from_simple_coll_1[0]['data']['_id'], no_rep_doc_coll_1["_id"])

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # Verify that we got 1 record for each stream: just the >= overlap record
        # (key-based incremental always re-syncs the record sitting at the bookmark value)
        for k, v in record_count_by_stream.items():
            if k not in ('simple_coll_uuid_field', 'simple_coll_string_field'):
                self.assertEqual(1, v)

        # -----------------------------------
        # ------------ Third Sync -----------
        # -----------------------------------
        # Change the replication method for simple_coll_1
        # Change the replication key for simple_coll_2
        # Make sure both do full resync
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            additional_md = []
            if stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_1':
                additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}]
            elif stream_catalog['tap_stream_id'] == 'simple_db-simple_coll_2':
                additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL',
                                                                    'replication-key': 'timestamp_field'}}]
            else:
                additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'INCREMENTAL',
                                                                    'replication-key': stream_catalog['stream_name'].replace('simple_coll_', '')}}]

            selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id,
                                                                                   stream_catalog,
                                                                                   annotated_schema,
                                                                                   additional_md)
        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        self.assertDictEqual(record_count_by_stream, self.expected_last_sync_row_counts())
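
The document inserts above use a z_string_generator helper that is not shown here. A minimal sketch, under the assumption that it only needs to produce a deterministic string whose length tracks the given size (the real fixture may differ):

def z_string_generator(size=6):
    # build a simple, deterministic string of the requested length
    return 'z' * size
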
Example #15
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        all_excluded_fields = {}
        # select all catalogs
        for c in found_catalogs:
            if c['stream_name'] == 'ads':
                continue

            discovered_schema = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])['annotated-schema']
            all_excluded_fields[c['stream_name']] = list(
                set(discovered_schema.keys()) -
                self.expected_automatic_fields().get(c['stream_name'], set())
            )[:5]
            connections.select_catalog_and_fields_via_metadata(
                conn_id,
                c,
                discovered_schema,
                non_selected_fields=all_excluded_fields[c['stream_name']])

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        synced_records = runner.get_records_from_target_output()
        self.assertTrue('ads' not in synced_records.keys())
        for stream_name, data in synced_records.items():
            record_messages = [
                set(row['data'].keys()) for row in data['messages']
            ]
            for record_keys in record_messages:
                # The intersection should be empty
                self.assertFalse(
                    record_keys.intersection(all_excluded_fields[stream_name]))
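
The exclusion logic above subtracts expected_automatic_fields() from the discovered schema. A hypothetical sketch of its shape; the stream and field names below are placeholders for illustration, not the tap's actual catalog:

    def expected_automatic_fields(self):
        # map each stream to the set of fields that are always replicated
        # (primary keys and replication keys)
        return {
            'ads': {'id', 'updated_time'},       # placeholder field names
            'adsets': {'id', 'updated_time'},    # placeholder stream
            'campaigns': {'id', 'updated_time'}  # placeholder stream
        }
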
Example #16
    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id'] for c in found_catalogs})

        #  -------------------------------------------
        #  ----------- First full Table Sync ---------
        #  -------------------------------------------
        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id,
                                                                                    stream_catalog,
                                                                                    annotated_schema,
                                                                                    additional_md)

        # run full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # check exit status
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # streams that we synced are the ones that we expect to see
        records_by_stream = runner.get_records_from_target_output()
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # assert that we get the correct number of records for each stream
        self.assertEqual(self.expected_row_counts(),record_count_by_stream)

        # assert that an activate_version_message is first and last message sent for each stream
        for stream_name in self.expected_sync_streams():
            self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action'])
            self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action'])

        state = menagerie.get_state(conn_id)

        first_versions = {}

        for tap_stream_id in self.expected_check_streams():

            # state has an initial_full_table_complete == True
            self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete'])

            # there is a version bookmark in state
            first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version']
            self.assertIsNotNone(first_versions[tap_stream_id])

        #  -------------------------------------------
        #  ----------- Second full Table Sync ---------
        #  -------------------------------------------
        # add 2 rows and run full table again, make sure we get initial number + 2

        with get_test_connection() as client:

            client["simple_db"]["simple_coll_1"].insert_many(generate_simple_coll_docs(2))

            client["simple_db"]["simple_coll_2"].insert_many(generate_simple_coll_docs(2))

            client["admin"]["admin_coll_1"].insert_many(generate_simple_coll_docs(2))


        sync_job_name = runner.run_sync_mode(self, conn_id)

        # check exit status
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        state = menagerie.get_state(conn_id)

        # Verify  that menagerie state does not include a key for currently syncing
        self.assertIsNone(state['currently_syncing'])

        # Verify that menagerie state does not include a key for oplog based syncing
        self.assertNotIn('oplog', state)

        # assert that we have correct number of records (including the two new records)
        new_expected_row_counts = {k:v+2 for k,v in self.expected_row_counts().items() if k not in ['simple_db_simple_coll_3',
                                                                                                    'simple_db_simple_coll_4']}
        new_expected_row_counts['simple_db_simple_coll_3']=0
        new_expected_row_counts['simple_db_simple_coll_4']=5
        self.assertEqual(new_expected_row_counts, record_count_by_stream)

        # assert that we only have an ActivateVersionMessage as the last message and not the first
        for stream_name in self.expected_sync_streams():
            if len(records_by_stream[stream_name]['messages']) > 1:
                self.assertNotEqual('activate_version', records_by_stream[stream_name]['messages'][0]['action'], stream_name + " failed")
                self.assertEqual('upsert', records_by_stream[stream_name]['messages'][0]['action'], stream_name + " failed")
            self.assertEqual('activate_version', records_by_stream[stream_name]['messages'][-1]['action'], stream_name + " failed")

        second_versions = {}
        for tap_stream_id in self.expected_check_streams():
            found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0]

            # state has an initial_full_table_complete == True
            self.assertTrue(state['bookmarks'][tap_stream_id]['initial_full_table_complete'])

            # version bookmark
            second_versions[tap_stream_id] = state['bookmarks'][tap_stream_id]['version']
            self.assertIsNotNone(second_versions[tap_stream_id])

            # version in this state is different than that of the previous state
            self.assertNotEqual(first_versions[tap_stream_id], second_versions[tap_stream_id])

            # version which is larger than the previous target version
            self.assertTrue(second_versions[tap_stream_id]>first_versions[tap_stream_id])

            # verify that menagerie state does include the version which matches the target version
            self.assertEqual(records_by_stream[self.tap_stream_id_to_stream()[tap_stream_id]]['table_version'], second_versions[tap_stream_id])

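Both full-table syncs above seed data with generate_simple_coll_docs. A minimal sketch, assuming the fixture only needs to build num_docs simple documents like the ones inserted elsewhere in these examples (the real helper may populate more field types):

def generate_simple_coll_docs(num_docs):
    # one simple document per integer value, mirroring the fields used above
    docs = []
    for int_value in range(num_docs):
        docs.append({
            "int_field": int_value,
            "string_field": z_string_generator(int_value),
        })
    return docs
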
    def test_run(self):
        """
        Verify that for each stream you can get data when no fields are selected
        and only the automatic fields are replicated.
        """

        print("\n\nRUNNING {}\n\n".format(self.name()))

        # ensure data exists for sync streams and set expectations
        expected_records = {x: []
                            for x in self.expected_streams()}  # ids by stream
        for stream in self.testable_streams():
            existing_objects = self.client.get_all(stream)
            if existing_objects:
                print("Data exists for stream: {}".format(stream))
                for obj in existing_objects:
                    expected_records[stream].append({
                        field: obj.get(field)
                        for field in self.expected_automatic_fields().get(
                            stream)
                    })
            else:
                print("Data does not exist for stream: {}".format(stream))
                self.fail("more test functionality needed")

        # Instantiate connection with default start/end dates
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        for cat in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, cat['stream_id'])

            # Verify that pks, rep keys, foreign keys have inclusion of automatic (metadata and annotated schema).
            for k in self.expected_automatic_fields().get(cat['stream_name']):
                mdata = next(
                    (m for m in catalog_entry['metadata']
                     if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                    None)

                print("Validating inclusion on {}: {}".format(
                    cat['stream_name'], mdata))
                self.assertTrue(
                    mdata and mdata['metadata']['inclusion'] == 'automatic')

        # Deselect all available fields from all streams, keep automatic fields
        self.select_all_streams_and_fields(conn_id=conn_id,
                                           catalogs=found_catalogs,
                                           select_all_fields=False)

        catalogs = menagerie.get_catalogs(conn_id)

        # Ensure our selection worked
        for cat in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, cat['stream_id'])
            # Verify all streams are selected
            selected = catalog_entry.get('annotated-schema').get('selected')
            print("Validating selection on {}: {}".format(
                cat['stream_name'], selected))
            self.assertTrue(selected, msg="Stream not selected.")

            # Verify only automatic fields are selected
            for field, field_props in catalog_entry.get(
                    'annotated-schema').get('properties').items():
                field_selected = field_props.get('selected')
                print("\tValidating selection on {}.{}: {}".format(
                    cat['stream_name'], field, field_selected))
                if field in self.expected_automatic_fields().get(
                        cat['stream_name']):
                    # NOTE: AUTOMATIC FIELDS IGNORE THE SELECTED md {'selected': None}
                    print(
                        "NOTE: selection for {} is ignored by the Transformer "
                        .format(field) +
                        "so long as 'inclusion' = 'automatic'")
                else:
                    self.assertFalse(
                        field_selected,
                        msg="Field is selected but not automatic.")

        #clear state
        menagerie.set_state(conn_id, {})

        # run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # read target output
        first_record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      first_record_count_by_stream.values())
        synced_records = runner.get_records_from_target_output()

        # Verify target has records for all synced streams
        for stream, count in first_record_count_by_stream.items():
            self.assertIn(stream, self.expected_streams())
            self.assertGreater(
                count,
                0,
                msg="failed to replicate any data for: {}".format(stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Test by Stream
        for stream in self.testable_streams():
            with self.subTest(stream=stream):
                data = synced_records.get(stream)
                record_messages_keys = [
                    set(row['data'].keys()) for row in data['messages']
                ]
                expected_keys = self.expected_automatic_fields().get(stream)

                # Verify that only the automatic fields are sent to the target
                for actual_keys in record_messages_keys:
                    self.assertEqual(
                        actual_keys.symmetric_difference(expected_keys),
                        set(),
                        msg="Expected automatic fields and nothing else.")

                actual_records = [row['data'] for row in data['messages']]

                # Verify the number of records match expectations
                self.assertEqual(len(expected_records.get(stream)),
                                 len(actual_records),
                                 msg="Number of actual records does not match expectations. " +\
                                 "We probably have duplicate records.")

                # verify by values, that we replicated the expected records
                for actual_record in actual_records:
                    self.assertTrue(
                        actual_record in expected_records.get(stream),
                        msg="Actual record missing from expectations")
                for expected_record in expected_records.get(stream):
                    self.assertTrue(expected_record in actual_records,
                                    msg="Expected record missing from target.")
    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id'] for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [c for c in found_catalogs if c['tap_stream_id'] == tap_stream_id][0]

            # assert that the pks are correct
            self.assertEqual(self.expected_pks()[found_stream['stream_name']],
                             set(found_stream.get('metadata', {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(self.expected_row_counts()[found_stream['stream_name']],
                             found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Full Table Sync ---------
        #  -----------------------------------
        # select simple_coll_1 stream and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(conn_id, stream_catalog['stream_id'])
            additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(conn_id,
                                                                                    stream_catalog,
                                                                                    annotated_schema,
                                                                                    additional_md)
        # synthesize interrupted state
        interrupted_state = {
            'currently_syncing' : 'simple_db-simple_coll_1',
            'bookmarks' : {'simple_db-simple_coll_1': { 'max_id_value': 49,
                                                        'max_id_type': 'int',
                                                        'initial_full_table_complete': False,
                                                        'last_id_fetched': 25,
                                                        'last_id_fetched_type': 'int',
                                                        'version': int(time.time() * 1000)},
                           'simple_db-simple_coll_2': { 'max_id_value': base64.b64encode("test {}".format(49).encode()).decode(),
                                                        'max_id_type': 'bytes',
                                                        'initial_full_table_complete': False,
                                                        'last_id_fetched': base64.b64encode("test {}".format(25).encode()).decode(),
                                                        'last_id_fetched_type': 'bytes',
                                                        'version': int(time.time() * 1000)}}}

        menagerie.set_state(conn_id, interrupted_state)
        runner.run_sync_mode(self, conn_id)

        # streams that we synced are the ones that we expect to see
        records_by_stream = runner.get_records_from_target_output()
        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        # ActivateVersionMessage as the last message and not the first
        for stream_name in self.expected_sync_streams():
            self.assertNotEqual('activate_version',records_by_stream[stream_name]['messages'][0]['action'])
            self.assertEqual('activate_version',records_by_stream[stream_name]['messages'][-1]['action'])

        # _id of the first record sync'd for each stream is the bookmarked
        # last_id_fetched from the interrupted_state passed to the tap
        self.assertEqual(records_by_stream['simple_coll_1']['messages'][0]['data']['_id'],
                         int(interrupted_state['bookmarks']['simple_db-simple_coll_1']['last_id_fetched']))

        # _id of the last record sync'd for each stream is the bookmarked
        # max_id_value from the interrupted_state passed to the tap
        self.assertEqual(records_by_stream['simple_coll_1']['messages'][-2]['data']['_id'],
                         int(interrupted_state['bookmarks']['simple_db-simple_coll_1']['max_id_value']))

        # assert that final state has no last_id_fetched and max_id_value bookmarks
        final_state = menagerie.get_state(conn_id)
        for tap_stream_id in self.expected_check_streams():
            self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('last_id_fetched'))
            self.assertIsNone(final_state['bookmarks'][tap_stream_id].get('max_id_value'))
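
    # NOTE (editor's sketch): the expectation helpers called in this example are not part
    # of the snippet. Based on the stream ids and the '_id' key referenced above, plausible
    # (hypothetical) definitions would be:
    def expected_check_streams(self):
        return {'simple_db-simple_coll_1', 'simple_db-simple_coll_2'}

    def expected_sync_streams(self):
        return {'simple_coll_1', 'simple_coll_2'}

    def expected_pks(self):
        return {'simple_coll_1': {'_id'}, 'simple_coll_2': {'_id'}}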
Ejemplo n.º 19
    def test_run(self):
        conn_id = self.ensure_connection()

        # Run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        # Select only the expected streams tables
        expected_streams = self.expected_streams()
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]
        self.select_all_streams_and_fields(conn_id, catalog_entries)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Examine target file
        sync_records = runner.get_records_from_target_output()
        sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())

        # Test by stream
        for stream in self.expected_streams():
            with self.subTest(stream=stream):

                expected_count = self.minimum_record_count_by_stream().get(
                    stream)
                record_count = sync_record_count.get(stream, 0)

                sync_messages = sync_records.get(stream, {
                    'messages': []
                }).get('messages')

                primary_key = self.expected_primary_keys().get(stream).pop()

                # Verify the sync meets or exceeds the default record count
                self.assertLessEqual(expected_count, record_count)

                # Verify the number or records exceeds the max_results (api limit)
                pagination_threshold = int(
                    self.get_properties().get(page_size_key))
                self.assertGreater(
                    record_count,
                    pagination_threshold,
                    msg="Record count not large enough to gaurantee pagination."
                )

                # Verify we did not duplicate any records across pages
                records_pks_set = {
                    message.get('data').get(primary_key)
                    for message in sync_messages
                }
                records_pks_list = [
                    message.get('data').get(primary_key)
                    for message in sync_messages
                ]
                self.assertCountEqual(
                    records_pks_set,
                    records_pks_list,
                    msg="We have duplicate records for {}".format(stream))
Ejemplo n.º 20
    def test_run(self):
        # SYNC 1
        conn_id = self.ensure_connection()

        # Run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        # Select only the expected streams tables
        expected_streams = self.expected_streams()
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]
        self.select_all_streams_and_fields(conn_id, catalog_entries)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)
        first_sync_records = runner.get_records_from_target_output()
        first_sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # SYNC 2
        conn_id = self.ensure_connection(original=False)

        # Run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Select only the Accounts table
        found_catalogs = menagerie.get_catalogs(conn_id)
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]
        self.select_all_streams_and_fields(conn_id, catalog_entries)

        # Run in Sync mode
        sync_job_name = runner.run_sync_mode(self, conn_id)
        second_sync_records = runner.get_records_from_target_output()
        second_sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Test by stream
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                # record counts
                first_sync_count = first_sync_record_count.get(stream, 0)
                expected_first_sync_count = self.minimum_record_count_by_stream(
                ).get(stream)
                second_sync_count = second_sync_record_count.get(stream, 0)

                # record messages
                first_sync_messages = first_sync_records.get(
                    stream, {
                        'messages': []
                    }).get('messages')
                second_sync_messages = second_sync_records.get(
                    stream, {
                        'messages': []
                    }).get('messages')

                # start dates
                start_date_1 = self.get_properties()['start_date']
                start_date_2 = self.get_properties(
                    original=False)['start_date']

                # Verify by stream that our first sync meets or exceeds the default record count
                self.assertLessEqual(expected_first_sync_count,
                                     first_sync_count)

                # Verify by stream more records were replicated in the first sync, with an older start_date than the second
                self.assertGreaterEqual(first_sync_count, second_sync_count)

                # Verify by stream that all records have a rep key that is equal to or greater than that sync's start_date
                for message in first_sync_messages:
                    rep_key_value = message.get('data').get('MetaData').get(
                        'LastUpdatedTime')
                    self.assertGreaterEqual(
                        rep_key_value,
                        start_date_1,
                        msg=
                        "A record was replicated with a replication key value prior to the start date"
                    )
                for message in second_sync_messages:
                    rep_key_value = message.get('data').get('MetaData').get(
                        'LastUpdatedTime')
                    self.assertGreaterEqual(
                        rep_key_value,
                        start_date_2,
                        msg=
                        "A record was replicated with a replication key value prior to the start date"
                    )
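
    # NOTE (editor's sketch): get_properties(original=False) and ensure_connection(
    # original=False) above imply the test class can swap in a later start_date for the
    # second connection. A minimal, hypothetical version (both dates are placeholders,
    # not taken from the original) might look like:
    def get_properties(self, original=True):
        props = {'start_date': '2019-01-01T00:00:00Z'}  # placeholder start date
        if not original:
            # later start date for the second, narrower sync (placeholder)
            props['start_date'] = '2020-01-01T00:00:00Z'
        return props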
Ejemplo n.º 21
    def test_run(self):
        print("\n\nRUNNING {}\n\n".format(self.name()))

        # ensure data exists for sync streams and set expectations
        expected_records_1 = {x: [] for x in self.expected_sync_streams()} # records by stream
        for stream in self.expected_sync_streams().difference(self.untestable_streams()):
            if stream in self.expected_incremental_sync_streams():
                start_date = dt.strptime(self.get_properties().get('start_date'), self.START_DATE_FORMAT)
                since = start_date.strftime(self.TEST_TIME_FORMAT)
                _, existing_objects = utils.get_total_record_count_and_objects(stream, since=since)
            else:
                _, existing_objects = utils.get_total_record_count_and_objects(stream)

            if existing_objects:
                logging.info("Data exists for stream: {}".format(stream))
                for obj in existing_objects:  # add existing records to expectations
                    expected_records_1[stream].append(obj)
                continue
            # Create 1 record if none exist
            logging.info("Data does not exist for stream: {}".format(stream))
            new_object = utils.create_object(stream)
            logging.info("Data generated for stream: {}".format(stream))
            expected_records_1[stream].append(new_object)

        # Create comment actions
        start_date = dt.strptime(self.get_properties().get('start_date'), self.START_DATE_FORMAT)
        since = start_date.strftime(self.TEST_TIME_FORMAT)
        # count_before, before_records = utils.get_total_record_count_and_objects('actions', since=since)
        action_comments = []
        action_comments.append(utils.create_object('actions', action_type="comment"))
        action_comments.append(utils.create_object('actions', action_type="comment"))
        for action in action_comments:
            expected_records_1['actions'].append(action)
        # count_after, after_records = utils.get_total_record_count_and_objects('actions', since=since)


        # run in check mode
        conn_id = connections.ensure_connection(self)
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference( found_catalog_names )
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        #select all catalogs
        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])

            for k in self.expected_automatic_fields()[c['stream_name']]:
                mdata = next((m for m in catalog_entry['metadata']
                              if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k), None)
                print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

            connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)
            
        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify data was replicated
        record_count_by_stream_1 = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks()
        )
        replicated_row_count_1 =  reduce(lambda accum,c : accum + c, record_count_by_stream_1.values())
        self.assertGreater(replicated_row_count_1, 0, msg="failed to replicate any data: {}".format(record_count_by_stream_1))
        print("total replicated row count: {}".format(replicated_row_count_1))

        # get emitted with records
        synced_records_1 = runner.get_records_from_target_output()

        # Verify bookmarks were saved for all streams
        state_1 = menagerie.get_state(conn_id)
        for stream in self.expected_incremental_sync_streams():
            self.assertTrue(state_1.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
        print("Bookmarks meet expectations")

        # Generate data between syncs for bookmarking streams
        print("Generating more data prior to 2nd sync")
        expected_records_2 = {x: [] for x in self.expected_sync_streams()}
        for stream in self.expected_full_table_sync_streams().difference(self.untestable_streams()):
            for _ in range(1):
                new_object = utils.create_object(stream)
                expected_records_2[stream].append({field: new_object.get(field)
                                                   for field in self.expected_automatic_fields().get(stream)})

        # Update a single comment action before second sync
        print("Updating existing data prior to 2nd sync")
        updated_records = {x: [] for x in self.expected_sync_streams()}
        action_id_to_update = random.choice(action_comments).get('id')
        updated_action = utils.update_object_action(obj_id=action_id_to_update)
        updated_records['actions'].append(updated_action)

        # Get new actions from data manipulation between syncs
        print("Acquriing in-test actions prior to 2nd sync")
        for stream in self.expected_incremental_sync_streams().difference(self.untestable_streams()):
            state = dt.strptime(state_1.get('bookmarks').get(stream).get('window_start'), self.TEST_TIME_FORMAT)
            since = (state - timedelta(days=self.LOOKBACK_WINDOW)).strftime(self.TEST_TIME_FORMAT)
            # start_date = dt.strptime(self.get_properties().get('start_date'), self.START_DATE_FORMAT)
            # since = start_date.strftime(self.TEST_TIME_FORMAT)
            _, objects = utils.get_total_record_count_and_objects(stream, since=since)
            for obj in objects:
                expected_records_2[stream].append({field: obj.get(field)
                                                   for field in self.expected_automatic_fields().get(stream)})

        # Run another sync
        print("Running 2nd sync job")
        sync_job_name_2 = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status_2 = menagerie.get_exit_status(conn_id, sync_job_name_2)
        menagerie.verify_sync_exit_status(self, exit_status_2, sync_job_name_2)

        # verify data was replicated
        record_count_by_stream_2 = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks()
        )
        replicated_row_count_2 =  reduce(lambda accum,c : accum + c, record_count_by_stream_2.values())
        self.assertGreater(replicated_row_count_2, 0,
                           msg="failed to replicate any data: {}".format(record_count_by_stream_2))
        print("total replicated row count: {}".format(replicated_row_count_2))

        # get emitted with records
        synced_records_2 = runner.get_records_from_target_output()

        # Verify bookmarks were saved as expected inc streams
        state_2 = menagerie.get_state(conn_id)
        for stream in self.expected_incremental_sync_streams():
            self.assertTrue(state_2.get('bookmarks', {}).get(stream, {}).get('window_start', {}))
        print("Bookmarks meet expectations")

        # TESTING FULL TABLE STREAMS
        for stream in self.expected_full_table_sync_streams().difference(self.untestable_streams()):
            with self.subTest(stream=stream):
                record_count_1 = record_count_by_stream_1.get(stream, 0)
                record_count_2 = record_count_by_stream_2.get(stream, 0)

                # Assert we have data for both syncs for full table streams
                self.assertGreater(record_count_1, 0)
                self.assertGreater(record_count_2, 0)

                # Assert that we are capturing the expected number of records for full table streams
                self.assertGreater(record_count_2, record_count_1,
                                   msg="Full table streams should have more data in second sync.")
                self.assertEqual((record_count_2 - record_count_1),
                                 len(expected_records_2.get(stream, [])),
                                 msg="The differnce in record counts between syncs should " +\
                                 "equal the number of records we created between syncs.\n" +\
                                 "This is not the case for {}".format(stream))

                # Test that we are capturing the expected records for full table streams
                expected_ids_1 = set(record.get('id') for record in expected_records_1.get(stream))
                data_1 = synced_records_1.get(stream, [])
                record_messages_1 = [row.get('data') for row in data_1['messages']]
                record_ids_1 = set(row.get('data').get('id') for row in data_1['messages'])
                expected_ids_2 = set(record.get('id') for record in expected_records_2.get(stream))
                data_2 = synced_records_2.get(stream, [])
                record_messages_2 = [row.get('data') for row in data_2['messages']]
                record_ids_2 = set(row.get('data').get('id') for row in data_2['messages'])

                # verify all expected records are replicated for both syncs
                self.assertEqual(expected_ids_1, record_ids_1,
                                 msg="Data discrepancy. Expected records do not match actual in sync 1.")
                self.assertTrue(expected_ids_1.issubset(record_ids_2),
                                 msg="Data discrepancy. Expected records do not match actual in sync 2.")

                for expected_record in expected_records_1.get(stream):
                    actual_record = [message for message in record_messages_1
                                     if message.get('id') == expected_record.get('id')].pop()
                    self.assertEqual(set(expected_record.keys()), set(actual_record.keys()),
                                     msg="Field mismatch between expectations and replicated records in sync 1.")

                # verify the 2nd sync gets records created after the 1st sync
                self.assertEqual(set(record_ids_2).difference(set(record_ids_1)),
                                 expected_ids_2,
                                 msg="We did not get the new record(s)")

        print("Full table streams tested.")

        # TESTING INCREMENTAL STREAMS
        for stream in self.expected_incremental_sync_streams().difference(self.untestable_streams()):
            with self.subTest(stream=stream):
                record_count_1 = record_count_by_stream_1.get(stream, 0)
                record_count_2 = record_count_by_stream_2.get(stream, 0)

                # Assert we have data for both syncs for inc streams
                self.assertGreater(record_count_1, 0)
                self.assertGreater(record_count_2, 0)

                # Assert that we are capturing the expected number of records for inc streams
                self.assertEqual(record_count_1, len(expected_records_1.get(stream, [])),
                                 msg="Stream {} replicated an unexpedted number records on 1st sync.".format(stream))
                self.assertEqual(record_count_2, len(expected_records_2.get(stream, [])),
                                 msg="Stream {} replicated an unexpedted number records on 2nd sync.".format(stream))

                # Assert that we are capturing the expected records for inc streams
                data_1 = synced_records_1.get(stream, [])
                record_messages_1 = [row.get('data').get('id') for row in data_1['messages']]
                data_2 = synced_records_2.get(stream, [])
                record_messages_2 = [row.get('data').get('id') for row in data_2['messages']]
                for record in expected_records_1.get(stream):
                    self.assertTrue(record.get('id') in record_messages_1,
                                    msg="Missing an expected record from sync 1.")
                for record in expected_records_2.get(stream):
                    self.assertTrue(record.get('id') in record_messages_2,
                                    msg="Missing an expected record from sync 2.")

                record_data_1 = [row.get('data') for row in data_1['messages']]
                record_data_2 = [row.get('data') for row in data_2['messages']]

                # Testing action comments (the only action type that can be updated)
                for action in action_comments:

                    # Get text value for action comment from sync 1
                    original_action_text = ""
                    for record in record_data_1:
                        if record.get('id') == action.get('id'):
                            original_action_text = record.get('data').get('text')
                    assert original_action_text, "Record {} is missing from 1st sync.".format(action.get('id'))
                    # Get text value for action comment from sync 2
                    current_action_text = ""
                    for record in record_data_2:
                        if record.get('id') == action.get('id'):
                            current_action_text = record.get('data').get('text')
                    assert current_action_text, "Record {} is missing from 2nd sync.".format(action.get('id'))

                    # Verify the action comment text matches expectations
                    if action.get('id') == action_id_to_update:
                        self.assertNotEqual(original_action_text, current_action_text, msg="Update was not captured.")
                        self.assertIn("UPDATE", current_action_text, msg="Update was captured but not as expected.")
                    else:
                        self.assertEqual(original_action_text, current_action_text, msg="Text does not match expected.")

        print("Incremental streams tested.")

        # CLEANING UP
        stream_to_delete = 'boards'
        boards_remaining = 5
        print("Deleting all but {} records for stream {}.".format(boards_remaining, stream_to_delete))
        board_count = len(expected_records_1.get(stream_to_delete, [])) + len(expected_records_2.get(stream_to_delete, []))
        for obj_to_delete in expected_records_2.get(stream_to_delete, []): # Delete boards created between syncs
            if board_count > boards_remaining:
                utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
                board_count -= 1
            else:
                break
        for obj_to_delete in expected_records_1.get(stream_to_delete, []): # Delete pre-existing boards, keeping boards_remaining
            if board_count > boards_remaining:
                utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
                board_count -= 1
            else:
                break
        # Reset the parent objects that we have been tracking
        utils.reset_tracked_parent_objects()
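
    # NOTE (editor's sketch): the class constants used above (START_DATE_FORMAT,
    # TEST_TIME_FORMAT, LOOKBACK_WINDOW) are not defined in this snippet. Plausible
    # placeholder values, assuming ISO-8601-style timestamps, would be:
    START_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'    # placeholder, not from the original
    TEST_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'  # placeholder, not from the original
    LOOKBACK_WINDOW = 1                         # days of lookback; placeholder value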
Ejemplo n.º 22
    def discovery_test(self):
        """
        Verify that discover creates the appropriate catalog, schema, metadata, etc.

        • Verify number of actual streams discovered match expected
        • Verify the stream names discovered were what we expect
        • Verify stream names follow naming convention
          streams should only have lowercase alphas and underscores
        • verify there is only 1 top level breadcrumb
        • verify replication key(s)
        • verify primary key(s)
        • verify that if there is a replication key we are doing INCREMENTAL otherwise FULL
        • verify the actual replication matches our expected replication method
        • verify that primary, replication and foreign keys
          are given the inclusion of automatic (metadata and annotated schema).
        • verify that all other fields have inclusion of available (metadata and schema)
        """
        conn_id = connections.ensure_connection(self)
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify number of actual streams discovered match expected
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))
        self.assertEqual(
            len(found_catalogs),
            len(self.expected_streams()),
            msg="Expected {} streams, actual was {} for connection {}, "
            "actual {}".format(len(self.expected_streams()),
                               len(found_catalogs), found_catalogs, conn_id))

        # Verify the stream names discovered were what we expect
        found_catalog_names = {c['tap_stream_id'] for c in found_catalogs}
        self.assertEqual(set(self.expected_streams()),
                         set(found_catalog_names),
                         msg="Expected streams don't match actual streams")

        # Verify stream names follow naming convention
        # streams should only have lowercase alphas and underscores
        self.assertTrue(all(
            [re.fullmatch(r"[a-z_]+", name) for name in found_catalog_names]),
                        msg="One or more streams don't follow standard naming")

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                catalog = next(
                    iter([
                        catalog for catalog in found_catalogs
                        if catalog["stream_name"] == stream
                    ]))
                assert catalog  # based on previous tests this should always be found

                schema_and_metadata = menagerie.get_annotated_schema(
                    conn_id, catalog['stream_id'])
                metadata = schema_and_metadata["metadata"]
                schema = schema_and_metadata["annotated-schema"]

                # verify the stream level properties are as expected
                # verify there is only 1 top level breadcrumb
                stream_properties = [
                    item for item in metadata if item.get("breadcrumb") == []
                ]
                self.assertTrue(
                    len(stream_properties) == 1,
                    msg="There is more than one top level breadcrumb")

                # verify replication key(s)
                self.assertEqual(
                    set(stream_properties[0].get("metadata", {
                        self.REPLICATION_KEYS: []
                    }).get(self.REPLICATION_KEYS, [])),
                    self.expected_replication_keys()[stream],
                    msg="expected replication key {} but actual is {}".format(
                        self.expected_replication_keys()[stream],
                        set(stream_properties[0].get(
                            "metadata", {
                                self.REPLICATION_KEYS: None
                            }).get(self.REPLICATION_KEYS, []))))

                # verify primary key(s)
                self.assertEqual(
                    set(stream_properties[0].get("metadata", {
                        self.PRIMARY_KEYS: []
                    }).get(self.PRIMARY_KEYS, [])),
                    self.expected_primary_keys()[stream],
                    msg="expected primary key {} but actual is {}".format(
                        self.expected_primary_keys()[stream],
                        set(stream_properties[0].get("metadata", {
                            self.PRIMARY_KEYS: None
                        }).get(self.PRIMARY_KEYS, []))))

                # verify that if there is a replication key we are doing INCREMENTAL otherwise FULL
                actual_replication_method = stream_properties[0].get(
                    "metadata", {
                        self.REPLICATION_METHOD: None
                    }).get(self.REPLICATION_METHOD)
                if stream_properties[0].get("metadata", {
                        self.REPLICATION_KEYS: []
                }).get(self.REPLICATION_KEYS, []):

                    self.assertTrue(
                        actual_replication_method == self.INCREMENTAL,
                        msg="Expected INCREMENTAL replication "
                        "since there is a replication key")
                else:
                    self.assertTrue(actual_replication_method == self.FULL,
                                    msg="Expected FULL replication "
                                    "since there is no replication key")

                # verify the actual replication matches our expected replication method
                self.assertEqual(
                    self.expected_replication_method().get(stream, None),
                    actual_replication_method,
                    msg=
                    "The actual replication method {} doesn't match the expected {}"
                    .format(
                        actual_replication_method,
                        self.expected_replication_method().get(stream, None)))

                expected_primary_keys = self.expected_primary_keys()[stream]
                expected_replication_keys = self.expected_replication_keys(
                )[stream]
                expected_automatic_fields = expected_primary_keys | expected_replication_keys

                # verify that primary, replication and foreign keys
                # are given the inclusion of automatic in annotated schema.
                actual_automatic_fields = {
                    key
                    for key, value in schema["properties"].items()
                    if value.get("inclusion") == "automatic"
                }
                self.assertEqual(
                    expected_automatic_fields,
                    actual_automatic_fields,
                    msg="expected {} automatic fields but got {}".format(
                        expected_automatic_fields, actual_automatic_fields))

                # verify that all other fields have inclusion of available
                # This assumes there are no unsupported fields for SaaS sources
                self.assertTrue(
                    all({
                        value.get("inclusion") == "available"
                        for key, value in schema["properties"].items()
                        if key not in actual_automatic_fields
                    }),
                    msg=
                    "Not all non key properties are set to available in annotated schema"
                )

                # verify that primary, replication and foreign keys
                # are given the inclusion of automatic in metadata.
                actual_automatic_fields = {
                    item.get("breadcrumb", ["properties", None])[1]
                    for item in metadata
                    if item.get("metadata").get("inclusion") == "automatic"
                }
                self.assertEqual(
                    expected_automatic_fields,
                    actual_automatic_fields,
                    msg="expected {} automatic fields but got {}".format(
                        expected_automatic_fields, actual_automatic_fields))

                # verify that all other fields have inclusion of available
                # This assumes there are no unsupported fields for SaaS sources
                self.assertTrue(
                    all({
                        item.get("metadata").get("inclusion") == "available"
                        for item in metadata
                        if item.get("breadcrumb", []) != []
                        and item.get("breadcrumb", ["properties", None])[1]
                        not in actual_automatic_fields
                    }),
                    msg=
                    "Not all non key properties are set to available in metadata"
                )
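
    # NOTE (editor's sketch): the metadata-key constants referenced in this discovery test
    # are not defined in the snippet. In Singer catalog metadata they conventionally map to
    # the following values (an assumption, not confirmed by the original):
    PRIMARY_KEYS = "table-key-properties"
    REPLICATION_KEYS = "valid-replication-keys"
    REPLICATION_METHOD = "forced-replication-method"
    INCREMENTAL = "INCREMENTAL"
    FULL = "FULL_TABLE"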
Ejemplo n.º 23
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        # select all catalogs
        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            for k in self.expected_pks()[c['stream_name']]:
                mdata = next(
                    (m for m in catalog_entry['metadata']
                     if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                    None)
                print("Validating inclusion on {}: {}".format(
                    c['stream_name'], mdata))
                self.assertTrue(
                    mdata and mdata['metadata']['inclusion'] == 'automatic')
            connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        synced_records = runner.get_records_from_target_output()
        for stream_name, data in synced_records.items():
            record_messages = [
                set(row['data'].keys()) for row in data['messages']
            ]
            for record_keys in record_messages:
                # The symmetric difference should be empty
                self.assertEqual(
                    record_keys,
                    self.expected_automatic_fields().get(stream_name, set()))
Ejemplo n.º 24
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            2,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog_cows = list(
            filter(
                lambda c: c['stream_name'] ==
                'postgres_logical_replication_test_cows', found_catalogs))[0]
        self.assertEqual('postgres_logical_replication_test_cows',
                         test_catalog_cows['stream_name'])

        test_catalog_chickens = list(
            filter(
                lambda c: c['stream_name'
                            ] == 'postgres_logical_replication_test_chickens',
                found_catalogs))[0]
        self.assertEqual('postgres_logical_replication_test_chickens',
                         test_catalog_chickens['stream_name'])
        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog_cows,
            menagerie.get_annotated_schema(conn_id,
                                           test_catalog_cows['stream_id']),
            additional_md)
        connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog_chickens,
            menagerie.get_annotated_schema(conn_id,
                                           test_catalog_chickens['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        #run sync job
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(
            record_count_by_stream, {
                'public_postgres_logical_replication_test_cows': 1,
                'public_postgres_logical_replication_test_chickens': 1
            })
        records_by_stream = runner.get_records_from_target_output()

        table_version_cows = records_by_stream[
            'public_postgres_logical_replication_test_cows']['table_version']
        self.assertEqual(
            records_by_stream['public_postgres_logical_replication_test_cows']
            ['messages'][0]['action'], 'activate_version')
        self.assertEqual(
            records_by_stream['public_postgres_logical_replication_test_cows']
            ['messages'][1]['action'], 'upsert')
        self.assertEqual(
            records_by_stream['public_postgres_logical_replication_test_cows']
            ['messages'][2]['action'], 'activate_version')

        table_version_chickens = records_by_stream[
            'public_postgres_logical_replication_test_chickens'][
                'table_version']
        self.assertEqual(
            records_by_stream[
                'public_postgres_logical_replication_test_chickens']
            ['messages'][0]['action'], 'activate_version')
        self.assertEqual(
            records_by_stream[
                'public_postgres_logical_replication_test_chickens']
            ['messages'][1]['action'], 'upsert')
        self.assertEqual(
            records_by_stream[
                'public_postgres_logical_replication_test_chickens']
            ['messages'][2]['action'], 'activate_version')

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")

        bookmark_cows = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_cows']
        self.assertIsNotNone(bookmark_cows['lsn'],
                             msg="expected bookmark for stream to have an lsn")
        lsn_cows_1 = bookmark_cows['lsn']
        self.assertEqual(bookmark_cows['version'],
                         table_version_cows,
                         msg="expected bookmark for stream to match version")

        bookmark_chickens = state['bookmarks'][
            'postgres-public-postgres_logical_replication_test_chickens']
        self.assertIsNotNone(bookmark_chickens['lsn'],
                             msg="expected bookmark for stream to have an lsn")
        lsn_chickens_1 = bookmark_chickens['lsn']
        self.assertEqual(bookmark_chickens['version'],
                         table_version_chickens,
                         msg="expected bookmark for stream to match version")

        #----------------------------------------------------------------------
        # invoke the sync job again after adding records
        #----------------------------------------------------------------------
        print("inserting 1 more cows and 1 more chickens")

        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                #insert another cow
                self.cows_rec_2 = {'cow_name': "betty cow", 'cow_age': 21}
                insert_record(cur, test_table_name_cows, self.cows_rec_2)

        with db_utils.get_test_connection('postgres') as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                #insert another chicken
                self.chicken_rec_2 = {
                    'chicken_name': "burt chicken",
                    'chicken_age': 14
                }
                insert_record(cur, test_table_name_chickens,
                              self.chicken_rec_2)

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        self.assertEqual(
            record_count_by_stream, {
                'public_postgres_logical_replication_test_cows': 1,
                'public_postgres_logical_replication_test_chickens': 1
            })

        upserts = []
        for u in runner.get_upserts_from_target_output():
            self.assertIsNotNone(u.get('_sdc_lsn'))
            del u['_sdc_lsn']
            upserts.append(u)

        self.assertEqual([{
            '_sdc_deleted_at': None,
            'cow_age': 21,
            'id': 2,
            'cow_name': 'betty cow'
        }, {
            'chicken_name': 'burt chicken',
            '_sdc_deleted_at': None,
            'chicken_age': 14,
            'id': 2
        }], upserts)

        print("inserted record is correct")

        state = menagerie.get_state(conn_id)
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        cows_bookmark = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_cows']
        self.assertIsNotNone(
            cows_bookmark['lsn'],
            msg=
            "expected bookmark for stream postgres_logical_replication_test_cows to have an lsn"
        )
        lsn_cows_2 = cows_bookmark['lsn']
        self.assertGreaterEqual(lsn_cows_2, lsn_cows_1)

        chickens_bookmark = state['bookmarks'][
            'postgres-public-postgres_logical_replication_test_chickens']
        self.assertIsNotNone(
            chickens_bookmark['lsn'],
            msg=
            "expected bookmark for stream postgres_logical_replication_test_chickens to have an lsn"
        )
        lsn_chickens_2 = chickens_bookmark['lsn']
        self.assertGreaterEqual(lsn_chickens_2, lsn_chickens_1)

        # table_version does NOT change
        self.assertEqual(
            chickens_bookmark['version'],
            table_version_chickens,
            msg=
            "expected bookmark for stream postgres_logical_replication_test_chickens to match version"
        )

        # table_version does NOT change
        self.assertEqual(
            cows_bookmark['version'],
            table_version_cows,
            msg=
            "expected bookmark for stream postgres_logical_replication_test_cows to match version"
        )
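
# NOTE (editor's sketch): insert_record(cur, table_name, record), test_table_name_cows and
# test_table_name_chickens are referenced above but defined outside this snippet. A minimal,
# hypothetical module-level insert_record using a psycopg2 cursor could look like:
def insert_record(cursor, table_name, record):
    columns = sorted(record.keys())
    values = [record[column] for column in columns]
    column_sql = ', '.join('"{}"'.format(column) for column in columns)
    placeholder_sql = ', '.join(['%s'] * len(values))
    cursor.execute(
        'INSERT INTO {} ({}) VALUES ({})'.format(table_name, column_sql, placeholder_sql),
        values)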
Ejemplo n.º 25
    def test_run(self):

        conn_id = connections.ensure_connection(self, payload_hook=None)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        catalog_entries = menagerie.get_catalogs(conn_id)

        # Select all streams and all fields
        for entry in catalog_entries:

            if entry.get('tap_stream_id') in self.expected_sync_streams():
                schema = menagerie.select_catalog(conn_id, entry)

                catalog_entry = {
                    'key_properties': entry.get('key_properties'),
                    'schema': schema,
                    'tap_stream_id': entry.get('tap_stream_id'),
                    'replication_method': entry.get('replication_method'),
                    'replication_key': entry.get('replication_key')
                }

                connections.select_catalog_and_fields_via_metadata(
                    conn_id, catalog_entry, schema)

        # found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(catalog_entries),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        set_of_discovered_streams = {
            entry['tap_stream_id']
            for entry in catalog_entries
        }
        self.assertTrue(
            self.expected_check_streams().issubset(set_of_discovered_streams),
            msg="Expected check streams are not a subset of discovered streams"
        )

        menagerie.set_state(conn_id, {})

        # Verify that tap and target exit codes are 0
        first_record_count = self.run_sync_and_get_record_count(conn_id)

        # verify that we only sync selected streams
        self.assertEqual(set(first_record_count.keys()),
                         self.expected_sync_streams())

        first_state = menagerie.get_state(conn_id)

        first_sync_records = runner.get_records_from_target_output()
        first_max_bookmarks = self.max_bookmarks_by_stream(first_sync_records)
        first_min_bookmarks = self.min_bookmarks_by_stream(first_sync_records)

        # Run second sync
        second_record_count = self.run_sync_and_get_record_count(conn_id)
        second_state = menagerie.get_state(conn_id)

        second_sync_records = runner.get_records_from_target_output()
        second_max_bookmarks = self.max_bookmarks_by_stream(
            second_sync_records)
        second_min_bookmarks = self.min_bookmarks_by_stream(
            second_sync_records)

        for stream in self.expected_sync_streams():
            # Verify first sync returns more data or same amount of data
            self.assertGreaterEqual(
                first_record_count.get(stream, 0),
                second_record_count.get(stream, 0),
                msg="Second sync didn't always return less records for stream {}"
                .format(stream))

            self.assertGreaterEqual(second_state['bookmarks'][stream],
                                    first_state['bookmarks'][stream])
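
    # NOTE (editor's sketch): run_sync_and_get_record_count is not shown in this snippet.
    # A minimal version, assuming only the runner/menagerie calls that appear elsewhere in
    # these examples (and that the class defines expected_sync_streams/expected_pks), is:
    def run_sync_and_get_record_count(self, conn_id):
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes before counting records
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        return runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())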
Ejemplo n.º 26
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [fc for fc
                          in menagerie.get_catalogs(conn_id)
                          if fc['tap_stream_id'] in self.expected_check_streams()]

        self.assertEqual(len(found_catalogs),
                         1,
                         msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        chicken_catalog = found_catalogs[0]

        self.assertEqual('chicken_view', chicken_catalog['stream_name'])
        print("discovered streams are correct")

        print('checking discovered metadata for ROOT-CHICKEN_VIEW')
        md = menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id'])['metadata']

        self.assertEqual(
            {(): {'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': []},
             ('properties', 'fk_id'): {'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True},
             ('properties', 'name'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True},
             ('properties', 'age'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True},
             ('properties', 'size'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True},
             ('properties', 'id'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}},
            metadata.to_map(md))


        # 'ID' selected as view-key-properties
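        # (the discovered metadata above shows an empty table-key-properties for the
        # view, so a key is supplied manually via view-key-properties)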
        replication_md = [{"breadcrumb": [], "metadata": {'replication-key': None, "replication-method" : "LOG_BASED", 'view-key-properties': ["id"]}}]

        connections.select_catalog_and_fields_via_metadata(conn_id, chicken_catalog,
                                                           menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']),
                                                           replication_md)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)

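        # the tap is expected to fail here (exit status 1), presumably because
        # LOG_BASED replication cannot be performed against a view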
        self.assertEqual(exit_status['tap_exit_status'], 1)
        # menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self,
                                                                   conn_id,
                                                                   self.expected_sync_streams(),
                                                                   self.expected_pks())

        self.assertEqual(record_count_by_stream, {})
        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)
        self.assertEqual(state, {}, msg="expected state to be empty")
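Example 27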
    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id']
                          for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in found_catalogs
                if c['tap_stream_id'] == tap_stream_id
            ][0]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[found_stream['stream_name']],
                set(
                    found_stream.get('metadata',
                                     {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(
                self.expected_row_counts()[found_stream['stream_name']],
                found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Full Table Sync ---------
        #  -----------------------------------
        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'LOG_BASED'
                }
            }]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)
        # Synthesize interrupted state
        original_version = int(time.time() * 1000)
        interrupted_state = {
            'currently_syncing': 'simple_db-simple_coll_1',
            'bookmarks': {
                'simple_db-simple_coll_1': {
                    'version': original_version,
                    'initial_full_table_complete': True,
                    'oplog_ts_time': 1,
                    'oplog_ts_inc': 0
                }
            }
        }
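        # an oplog_ts_time of 1 (one second after the epoch) is far older than anything
        # the oplog retains, which is what forces the resync noted below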

        menagerie.set_state(conn_id, interrupted_state)

        # This should say the oplog has timed out and will execute a resync
        runner.run_sync_mode(self, conn_id)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # assert that an activate_version message is sent first and again after the full table data (message index 51)
        for stream_name in self.expected_sync_streams():
            self.assertEqual(
                'activate_version',
                records_by_stream[stream_name]['messages'][0]['action'])
            self.assertEqual(
                'activate_version',
                records_by_stream[stream_name]['messages'][51]['action'])

        # assert that the resync assigned a new table version
        final_state = menagerie.get_state(conn_id)
        self.assertNotEqual(
            original_version,
            final_state.get('bookmarks', {}).get('simple_db-simple_coll_1',
                                                 {}).get('version'))

        # assert that all rows in the collection were sync'd
        for stream_id, row_count in self.expected_row_counts().items():
            self.assertGreaterEqual(record_count_by_stream[stream_id],
                                    row_count)

        # assert that the stream has oplog bookmarks and an initial_full_table_complete=True bookmark
        self.assertIsNotNone(
            final_state.get('bookmarks', {}).get('simple_db-simple_coll_1',
                                                 {}).get('oplog_ts_time'))
        self.assertIsNotNone(
            final_state.get('bookmarks', {}).get('simple_db-simple_coll_1',
                                                 {}).get('oplog_ts_inc'))
        self.assertTrue(
            final_state.get('bookmarks',
                            {}).get('simple_db-simple_coll_1',
                                    {}).get('initial_full_table_complete'))
Example 28
    def run_single_projection(self, projection_mapping):
        self.setUpDatabase()
        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id']
                          for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in found_catalogs
                if c['tap_stream_id'] == tap_stream_id
            ][0]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[found_stream['stream_name']],
                set(
                    found_stream.get('metadata',
                                     {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(
                self.expected_row_counts()[found_stream['stream_name']],
                found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Initial Full Table ---------
        #  -----------------------------------
        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'LOG_BASED'
                }
            }]
            if projection_mapping['projection'] is not None:
                additional_md[0]['metadata'][
                    'tap_mongodb.projection'] = json.dumps(
                        projection_mapping['projection'])
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)
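        # projection_mapping is supplied by the caller of run_single_projection; a
        # purely illustrative (hypothetical) example of its shape, based on how the
        # fields are used in this method:
        #   {'projection': {'name': 1},
        #    'expected_keys': [{'_id', 'name'}]}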

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        for stream_name in self.expected_sync_streams():
            stream_records = [
                x for x in messages_by_stream[stream_name]['messages']
                if x.get('action') == 'upsert'
            ]
            #actual_keys = set()

            for record in stream_records:
                self.assertIn(record['data'].keys(),
                              projection_mapping['expected_keys'])
                #actual_keys = actual_keys.union(set(record['data'].keys()))

            #self.assertTrue(actual_keys.issubset(projection_mapping['expected_keys']))

        self.modify_database()

        #  -----------------------------------
        # ----------- Subsequent Oplog Sync ---------
        #  -----------------------------------

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        for stream_name in self.expected_sync_streams():
            stream_records = [
                x for x in messages_by_stream[stream_name]['messages']
                if x.get('action') == 'upsert'
            ]
            #actual_keys = set()
            for record in stream_records:
                self.assertIn(record['data'].keys(),
                              projection_mapping['expected_keys'])
Example 29
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        # TODO - change the replication key back to replication_key_column when rowversion is supported
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'temp_replication_key_column'
            }
        }]

        non_selected_properties = [
            "nvarchar_text", "varchar_text", "varbinary_data", "geospacial",
            "geospacial_map", "markup", "tree", "variant",
            "SpecialPurposeColumns", "started_at", "ended_at"
        ]
        BaseTapTest.select_all_streams_and_fields(
            conn_id,
            found_catalogs,
            non_selected_properties=non_selected_properties,
            additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify that on the first sync you get an
                # activate version message before and after all of the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [
                    {
                        "action": "upsert",
                        "data": {
                            column: value
                            for column, value in list(
                                zip(column_names, row_values))
                            if column not in non_selected_properties
                        }  # TODO - change to -1 for using rowversion for replication key
                    } for row_values in sorted(
                        stream_expected_data[self.VALUES],
                        key=lambda row: (row[1] is not None, row[1]))
                ]
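                # sorting by (row[1] is not None, row[1]) puts rows with a NULL
                # replication key first and orders the rest ascending, presumably
                # matching the order the tap emits rows in for incremental replication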

                # Verify all data is correct for incremental
                for expected_row, actual_row in zip(
                        expected_messages,
                        records_by_stream[stream]['messages'][1:-1]):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key TODO - change to -1 for using rowversion for replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    re.sub(
                        r'\d{3}Z', "Z",
                        max([
                            row[1] for row in stream_expected_data[self.VALUES]
                        ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ")))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "text_and_image_deprecated_soon"
        column_name = [
            "pk", "temp_replication_key_column", "nvarchar_text",
            "varchar_text", "varbinary_data", "replication_key_column"
        ]
        insert_value = [(3,
                         datetime(2018,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  993000,
                                  tzinfo=timezone.utc), "JKL", "MNO",
                         "PQR".encode('utf-8'))]
        update_value = [(0,
                         datetime(2018,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), "JKL", "MNO",
                         "PQR".encode('utf-8'))]
        query_list = (insert(database_name,
                             schema_name,
                             table_name,
                             insert_value,
                             column_names=column_name[:-1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
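        # insert/update_by_pk/delete_by_pk appear to build lists of SQL statements and
        # mssql_cursor_context_manager executes them (returning result rows for
        # SELECTs), judging by how they are used here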

        values = insert_value + [(
            1, datetime(2018, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc),
            "abc", "def", "ghi".encode('utf-8'))] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.text_and_image_deprecated_soon "
            "where pk in (0, 1,3) order by pk desc"
        ])
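        # replication_key_column is presumably a rowversion column returned as bytes;
        # render it as the 0x-prefixed uppercase hex string used in the expected values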
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA[
            'data_types_database_dbo_text_and_image_deprecated_soon'][
                'values'] = row_with_version

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "weirdos"
        column_name = [
            "pk", "temp_replication_key_column", "geospacial",
            "geospacial_map", "markup", "guid", "tree", "variant",
            "SpecialPurposeColumns", "replication_key_column"
        ]
        insert_value = [(3,
                         datetime(9999,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  993000,
                                  tzinfo=timezone.utc), None, None, None,
                         str(uuid.uuid1()).upper(), None, None, None)]
        update_value = [(1,
                         datetime(9999,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), None, None, None,
                         str(uuid.uuid1()).upper(), None, None, None)]
        delete_value = [(0, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value, column_name[:-1]))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)

        values = insert_value + [
            (2, datetime(9999, 12, 31, 23, 59, 59, 990000,
                         tzinfo=timezone.utc), None, None, None,
             "B792681C-AEF4-11E9-8002-0800276BC1DF", None, None, None)
        ] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.weirdos "
            "where pk in (1, 2, 3) order by pk desc"
        ])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA['data_types_database_dbo_weirdos'][
            'values'] = row_with_version

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "computed_columns"
        column_name = [
            "pk", "temp_replication_key_column", "started_at", "ended_at",
            "replication_key_column"
        ]
        insert_value = [(2,
                         datetime(9998,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  990000,
                                  tzinfo=timezone.utc),
                         datetime(1980, 5, 30, 16), datetime.now())]
        update_value = [(0,
                         datetime(9998,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), datetime(1942, 11, 30),
                         datetime(2017, 2, 12))]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value, column_name[:-1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        values = insert_value + [(
            1, datetime(9998, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc),
            datetime(1970, 1, 1, 0), datetime.now())] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.computed_columns "
            "where pk in (0, 1, 2) order by pk desc"
        ])
        rows = [tuple(row) for row in rows]
        row_with_duration = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA['data_types_database_dbo_computed_columns'][
            'values'] = row_with_duration

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify that on a subsequent sync you get an activate version message before and after all of the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:-1]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    re.sub(
                        r'\d{3}Z', "Z",
                        max([
                            row[1] for row in stream_expected_data[self.VALUES]
                        ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ")))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
Example 30
    def test_run(self):
        """
        Verify that a full sync captures all data and sends it in the correct format
        for integer and boolean (bit) data.
        Verify that the first sync sends an activate_version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                if records_by_stream[stream]['messages'][-1].get("data"):
                    last_row_data = True
                else:
                    last_row_data = False
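                # when the last message carries data, the log-based portion has already
                # emitted a record, so the two trailing activate_version messages sit at
                # indexes -2 and -3; otherwise the stream ends with them at -1 and -2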

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([
                        m for m in records_by_stream[stream]['messages'][1:]
                        if m["action"] == "activate_version"
                    ]),
                    2,
                    msg=
                    "Expect 2 more activate version messages for end of full table and beginning of log based"
                )

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]
                            ['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))

                # Verify all data is correct for the log replication part if sent
                if records_by_stream[stream]['messages'][-1].get("data"):
                    for column_name, expected_value in expected_messages[-1][
                            "data"].items():
                        self.assertEqual(
                            expected_value,
                            records_by_stream[stream]['messages'][-1]["data"]
                            [column_name],
                            msg="expected: {} != actual {}".format(
                                expected_row, actual_row))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                inital_log_version = bookmark['current_log_version']

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "integers"
        column_name = [
            "pk", "MyBigIntColumn", "MyIntColumn", "MySmallIntColumn"
        ]
        insert_value = [(14, 100, 100, 100)]
        update_value = [(1, 101, 101, 101)]
        delete_value = [(5, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [(14, 100, 100, 100, None)]
        update_value = [(1, 101, 101, 101, None)]
        delete_value = [(5, None, None, None, datetime.utcnow())]
        self.EXPECTED_METADATA["data_types_database_dbo_integers"]["values"] = \
            insert_value + delete_value + update_value
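        # under log-based replication the tap adds an automatic _sdc_deleted_at column,
        # which is only populated for deleted rows (see the assertions below), so it is
        # appended to the expected fields here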
        self.EXPECTED_METADATA["data_types_database_dbo_integers"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "tiny_integers_and_bools"
        column_name = ["pk", "MyTinyIntColumn", "my_boolean"]
        insert_value = [(14, 100, False)]
        update_value = [(1, 101, True)]
        delete_value = [(5, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        insert_value = [(14, 100, False, None)]
        update_value = [(1, 101, True, None)]
        delete_value = [(5, None, None, datetime.utcnow())]
        self.EXPECTED_METADATA["data_types_database_dbo_tiny_integers_and_bools"]["values"] = \
            [self.expected_metadata()["data_types_database_dbo_tiny_integers_and_bools"]["values"][-1]] + \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA[
            "data_types_database_dbo_tiny_integers_and_bools"][
                "fields"].append({
                    "_sdc_deleted_at": {
                        'sql-datatype': 'datetime',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                })
        mssql_cursor_context_manager(*query_list)

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify that on a subsequent sync you get an activate version message only at the start, before the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if column_name != "_sdc_deleted_at":
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(
                                    actual_value,
                                    expected_value - timedelta(seconds=15))
                                self.assertLessEqual(
                                    actual_value,
                                    expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted, so the column may be absent or None
                                self.assertIsNone(
                                    actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version,
                                   inital_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))