    def save_results(self, folder_path):

        my_file.create_folder(folder_path)

        # persist the tested indices plus the success and long-failure records
        my_file.save_pkl(self.test_idx_list,
                         os.path.join(folder_path, 'test_list.pkl'))
        my_file.save_pkl((self.success_idx_list, self.success_target_list,
                          self.success_x_list),
                         os.path.join(folder_path, 'success.pkl'))
        my_file.save_pkl((self.long_fail_idx_list, self.long_fail_target_list,
                          self.long_fail_x_list),
                         os.path.join(folder_path, 'long_fail.pkl'))

    def __init__(self, result_folder_in_repo, log_filename='log.txt'):

        # record all
        self.success_count = 0
        self.test_count = 0
        self.long_fail_count = 0
        self.query_num_list = []
        self.success_query_num_list = []
        self.real_success_modif_rate_list = []
        self.modif_rate_list = []

        if result_folder_in_repo is not None:
            my_file.create_folder(RESULT_FOLDER, result_folder_in_repo)
            self.log_file = open(
                my_file.real_path_of(RESULT_FOLDER, result_folder_in_repo,
                                     log_filename), 'w')
        else:
            # no result folder given: leave logging disabled
            self.log_file = None
    def __init__(self, result_folder_in_repo, log_file_path=None):

        self.success_count = 0
        self.test_count = 0
        self.long_fail_count = 0

        if result_folder_in_repo is not None:
            my_file.create_folder(PWWS_OUT_PATH, result_folder_in_repo)
            self.log_file = open(
                my_file.real_path_of(PWWS_OUT_PATH, result_folder_in_repo,
                                     'log.txt'), 'w')
        elif log_file_path is not None:
            self.log_file = open(log_file_path, 'w')
        else:
            # neither a folder nor an explicit log path given: logging disabled
            self.log_file = None

        self.query_num_list = []
        self.success_query_num_list = []
        self.all_success_change_ratio_list = []
        self.change_ratio_list = []
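# The recorders above lean on a project-local `my_file` module that is not
# shown on this page. Below is a minimal sketch of the interface the snippets
# assume (path joining, folder creation, pickle I/O); the bodies here are
# guesses for illustration, not the project's actual implementation:

import os
import pickle


def real_path_of(*parts):
    # Join path fragments into a single absolute path.
    return os.path.abspath(os.path.join(*parts))


def create_folder(*parts):
    # Create the folder (and any missing parents), tolerating existing dirs.
    os.makedirs(real_path_of(*parts), exist_ok=True)


def save_pkl(obj, *parts):
    # Serialize `obj` to the given location.
    with open(real_path_of(*parts), 'wb') as f:
        pickle.dump(obj, f)


def load_pkl(*parts):
    # Inverse of save_pkl.
    with open(real_path_of(*parts), 'rb') as f:
        return pickle.load(f)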
Example #4
import numpy as np

import my_file  # assumed import of the project-local I/O helper used below

SEED = 5555

dataset_path = '/home/workspace/nlp_attack/data/pso_raw/IMDB_used_data/aux_files/dataset_50000.pkl'
word_candidates_path = '/home/workspace/nlp_attack/data/pso_raw/IMDB_used_data/word_candidates_sense.pkl'
pos_tags_path = '/home/workspace/nlp_attack/data/pso_raw/IMDB_used_data/pos_tags_test.pkl'
model_path = '/home/workspace/nlp_attack/data/pso_raw/IMDB_used_data/BERTModel.pt'

# ===========================================

np.random.seed(SEED)

dataset_name = 'IMDB'
model_name = 'BERT'
tag = 'LS'
SAVE_FOLDER = f'out/pso_related/{dataset_name}_{model_name}_{tag}_search/{SEED}'
my_file.create_folder(SAVE_FOLDER)

# init log file
log_file = open(my_file.real_path_of(SAVE_FOLDER, 'log.txt'), 'w')

# save parameters
log_file.write(f'SEED: {SEED}\n')
log_file.write(f'Test Size: {TEST_SIZE}\n')
log_file.flush()

# CURRENT_PATH = 'data/pso_raw/IMDB_used_data'
VOCAB_SIZE = 50000

dataset = my_file.load_pkl(dataset_path)
# all three pickles live at absolute paths, so use the plain loader throughout
word_candidate = my_file.load_pkl(word_candidates_path)
test_pos_tags = my_file.load_pkl(pos_tags_path)
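# With the RNG seeded above, a reproducible test subset can be drawn before
# attacking. A small sketch, assuming the dataset object exposes `test_y` and
# `test_seqs2` the way the PSO IMDB pickle does, and that TEST_SIZE is set:

if TEST_SIZE is not None:
    test_idx = np.random.choice(len(dataset.test_y), TEST_SIZE, replace=False)
    # sanity check: report the shortest sampled sequence length
    test_len = [len(dataset.test_seqs2[i]) for i in test_idx]
    log_file.write('Shortest sampled sequence: %d tokens\n' % min(test_len))
    log_file.flush()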
Example #5
def train_text_classifier():
    dataset = args.dataset
    x_train = y_train = x_test = y_test = None
    if dataset == 'imdb':
        train_texts, train_labels, test_texts, test_labels = split_imdb_files()
        if args.level == 'word':
            x_train, y_train, x_test, y_test = word_process(train_texts, train_labels, test_texts, test_labels, dataset)
        elif args.level == 'char':
            x_train, y_train, x_test, y_test = char_process(train_texts, train_labels, test_texts, test_labels, dataset)
    elif dataset == 'agnews':
        train_texts, train_labels, test_texts, test_labels = split_agnews_files()
        if args.level == 'word':
            x_train, y_train, x_test, y_test = word_process(train_texts, train_labels, test_texts, test_labels, dataset)
        elif args.level == 'char':
            x_train, y_train, x_test, y_test = char_process(train_texts, train_labels, test_texts, test_labels, dataset)

    x_train, y_train = shuffle(x_train, y_train, random_state=0)

    # Take a look at the shapes
    print('dataset:', dataset, '; model:', args.model, '; level:', args.level)
    print('X_train:', x_train.shape)
    print('y_train:', y_train.shape)
    print('X_test:', x_test.shape)
    print('y_test:', y_test.shape)

    log_dir = './logs/{}/all_{}/'.format(dataset, args.model)
    tb_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0, write_graph=True)

    model_path = r'./runs/{}/{}.dat'.format(dataset, args.model)
    my_file.create_folder(r'./runs/{}'.format(dataset))
    model = batch_size = epochs = None
    # the model name prefix must match the processing level ('word' / 'char')
    assert args.model[:4] == args.level

    if args.model == "word_cnn":
        model = word_cnn(dataset)
        batch_size = config.wordCNN_batch_size[dataset]
        epochs = config.wordCNN_epochs[dataset]
    elif args.model == "word_bdlstm":
        model = bd_lstm(dataset)
        batch_size = config.bdLSTM_batch_size[dataset]
        epochs = config.bdLSTM_epochs[dataset]
    elif args.model == "char_cnn":
        model = char_cnn(dataset)
        batch_size = config.charCNN_batch_size[dataset]
        epochs = config.charCNN_epochs[dataset]
    elif args.model == "word_lstm":
        model = lstm(dataset)
        batch_size = config.LSTM_batch_size[dataset]
        epochs = config.LSTM_epochs[dataset]

    print('Train...')
    print('batch_size: ', batch_size, "; epochs: ", epochs)
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              shuffle=True,
              callbacks=[tb_callback])
    scores = model.evaluate(x_test, y_test)
    print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))
    print('Saving model weights...')
    model.save_weights(model_path)
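# Only the weights are saved above, so later evaluation has to rebuild the
# same architecture first. A minimal sketch for the word_cnn case, reusing the
# constructors and paths from train_text_classifier (assumed to be in scope):

def evaluate_saved_classifier(dataset, x_test, y_test):
    model = word_cnn(dataset)  # must match the architecture that was trained
    model.load_weights(r'./runs/{}/word_cnn.dat'.format(dataset))
    loss, acc = model.evaluate(x_test, y_test)
    print('restored model - test_loss: %f, accuracy: %f' % (loss, acc))
    return model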
Example #6
# TEST_SIZE = 20
# test_idx = np.random.choice(len(dataset.test_y), SAMPLE_SIZE, replace=False)
# test_len = []
# for i in range(SAMPLE_SIZE):
#     test_len.append(len(dataset.test_seqs2[test_idx[i]]))
# print('Shortest sentence in our test set is %d words' %np.min(test_len))

TEST_SIZE = None
test_size = len(dataset.test_y)
test_idx_list = np.arange(len(dataset.test_y))
# np.random.shuffle(test_idx_list)

test_list = []

cur_result_folder = f'{algo}_{dataset_name}/{SEED}'
my_file.create_folder(GA_OUT_PATH, cur_result_folder)
cur_log_file = open(
    my_file.real_path_of(GA_OUT_PATH, cur_result_folder, 'log.txt'), 'a')
cur_logger = GAIMDBLogger(cur_log_file)
cur_recorder = GARecorderIMDB()

st = time()

for test_idx in test_idx_list:
    x_orig = test_x[test_idx]
    orig_label = test_y[test_idx]
    orig_preds = model.predict(sess, x_orig[np.newaxis, :])[0]

    if np.argmax(orig_preds) != orig_label:
        print('skipping wrongly classified example ...')
        print('--------------------------')
        continue
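# The rest of the loop body (the attack itself) is truncated on this page.
# Once the loop finishes, the recorder's counters (see the __init__ methods
# above) are enough for an end-of-run summary. A hypothetical sketch, assuming
# cur_recorder exposes success_count and test_count as shown earlier:

elapsed = time() - st
tested = max(cur_recorder.test_count, 1)  # guard against division by zero
cur_log_file.write('tested: %d, succeeded: %d (%.2f%%), elapsed: %.1fs\n'
                   % (cur_recorder.test_count, cur_recorder.success_count,
                      100.0 * cur_recorder.success_count / tested, elapsed))
cur_log_file.flush()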