def save_results(self, folder_path):
    """Pickle the recorded result lists into ``folder_path``.

    Three files are written inside the folder (which is created first via
    ``my_file.create_folder``):

    * ``test_list.pkl``  -- ``self.test_idx_list``
    * ``success.pkl``    -- tuple of the success idx / target / x lists
    * ``long_fail.pkl``  -- tuple of the long-fail idx / target / x lists

    Args:
        folder_path: Destination folder for the pickle files.
    """

    def _dump(payload, relative_name):
        # All files share the same destination folder.
        my_file.save_pkl(payload, os.path.join(folder_path, relative_name))

    my_file.create_folder(folder_path)

    _dump(self.test_idx_list, './test_list.pkl')

    success_record = (
        self.success_idx_list,
        self.success_target_list,
        self.success_x_list,
    )
    _dump(success_record, './success.pkl')

    long_fail_record = (
        self.long_fail_idx_list,
        self.long_fail_target_list,
        self.long_fail_x_list,
    )
    _dump(long_fail_record, './long_fail.pkl')
def __init__(self, result_folder_in_repo, log_filename='log.txt'):
    """Reset all recorded statistics and optionally open a log file.

    Args:
        result_folder_in_repo: Folder that is created under
            ``RESULT_FOLDER`` (through ``my_file.create_folder``) and that
            receives the log file.  Pass ``None`` to skip creating a
            folder and opening a log file.
        log_filename: Name of the log file opened (mode ``'w'``) inside
            that folder.
    """
    # record all
    self.success_count = 0
    self.test_count = 0
    self.long_fail_count = 0

    self.query_num_list = []
    self.success_query_num_list = []
    self.real_success_modif_rate_list = []
    self.modif_rate_list = []

    # Always define the attribute: previously it was left unset when no
    # result folder was given, so any later access raised AttributeError.
    self.log_file = None
    if result_folder_in_repo is not None:
        my_file.create_folder(RESULT_FOLDER, result_folder_in_repo)
        self.log_file = open(
            my_file.real_path_of(RESULT_FOLDER, result_folder_in_repo,
                                 log_filename),
            'w')
def __init__(self, result_folder_in_repo, log_file_path=None):
    """Reset all recorded statistics and optionally open a log file.

    The log destination is chosen in this order:

    1. ``result_folder_in_repo`` is not ``None``: the folder is created
       under ``PWWS_OUT_PATH`` and ``log.txt`` inside it is opened.
    2. otherwise, ``log_file_path`` is not ``None``: that path is opened.
    3. otherwise no file is opened and ``self.log_file`` is ``None``.

    Args:
        result_folder_in_repo: Result folder relative to ``PWWS_OUT_PATH``,
            or ``None``.
        log_file_path: Explicit log file path, only consulted when
            ``result_folder_in_repo`` is ``None``.
    """
    self.success_count = 0
    self.test_count = 0
    self.long_fail_count = 0

    # Always define the attribute: previously it was left unset when both
    # arguments were None, so any later access raised AttributeError.
    self.log_file = None
    if result_folder_in_repo is not None:
        my_file.create_folder(PWWS_OUT_PATH, result_folder_in_repo)
        self.log_file = open(
            my_file.real_path_of(PWWS_OUT_PATH, result_folder_in_repo,
                                 'log.txt'),
            'w')
    elif log_file_path is not None:
        self.log_file = open(log_file_path, 'w')

    self.query_num_list = []
    self.success_query_num_list = []
    self.all_success_change_ratio_list = []
    self.change_ratio_list = []
# Script setup: fixes the random seed, prepares the output folder and log
# file for this run, and loads the pickled inputs.

# Seed for NumPy's global RNG; it is also the last component of SAVE_FOLDER.
SEED = 5555
# Absolute locations of the pickled inputs and of the model checkpoint.
dataset_path = '/home/workspace/nlp_attack/data/pso_raw/IMDB_used_data/aux_files/dataset_50000.pkl'
word_candidates_path = '/home/workspace/nlp_attack/data/pso_raw/IMDB_used_data/word_candidates_sense.pkl'
pos_tags_path = '/home/workspace/nlp_attack/data/pso_raw/IMDB_used_data/pos_tags_test.pkl'
model_path = '/home/workspace/nlp_attack/data/pso_raw/IMDB_used_data/BERTModel.pt'
# ===========================================
np.random.seed(SEED)
# The three labels below are only used here to build the output folder name.
dataset_name = 'IMDB'
model_name = 'BERT'
tag = 'LS'
SAVE_FOLDER = f'out/pso_related/{dataset_name}_{model_name}_{tag}_search/{SEED}'
my_file.create_folder(SAVE_FOLDER)
# init log file
# Opened in 'w' mode, so an existing log.txt in SAVE_FOLDER is overwritten.
# The handle is not closed in this part of the script.
log_file = open(my_file.real_path_of(SAVE_FOLDER, 'log.txt'), 'w')
# save parameters
log_file.write(f'SEED: {SEED}\n')
# NOTE(review): TEST_SIZE is not assigned in this part of the script; it must
# already be defined before this point or this line raises NameError - confirm.
log_file.write(f'Test Size: {TEST_SIZE}\n')
log_file.flush()
# CURRENT_PATH = 'data/pso_raw/IMDB_used_data'
VOCAB_SIZE = 50000
# NOTE(review): the dataset is read with load_pkl while the other two files go
# through load_pkl_in_repo, although all three paths are absolute - confirm
# that the *_in_repo helper accepts absolute paths.
dataset = my_file.load_pkl(dataset_path)
word_candidate = my_file.load_pkl_in_repo(word_candidates_path)
test_pos_tags = my_file.load_pkl_in_repo(pos_tags_path)
def _load_train_test_data(dataset, level):
    """Split the raw files of ``dataset`` and preprocess them at ``level``.

    Args:
        dataset: ``'imdb'`` or ``'agnews'``.
        level: ``'word'`` or ``'char'``.

    Returns:
        The ``(x_train, y_train, x_test, y_test)`` values produced by
        ``word_process`` / ``char_process``.

    Raises:
        ValueError: If ``dataset`` or ``level`` is not one of the supported
            values listed above.
    """
    # Validate both selectors before touching any file so that a typo fails
    # immediately instead of after the (potentially slow) split step.
    if dataset == 'imdb':
        split_files = split_imdb_files
    elif dataset == 'agnews':
        split_files = split_agnews_files
    else:
        raise ValueError(
            "unsupported dataset {!r}; expected 'imdb' or 'agnews'".format(dataset))

    if level == 'word':
        process = word_process
    elif level == 'char':
        process = char_process
    else:
        raise ValueError(
            "unsupported level {!r}; expected 'word' or 'char'".format(level))

    train_texts, train_labels, test_texts, test_labels = split_files()
    return process(train_texts, train_labels, test_texts, test_labels, dataset)


def _build_model(model_name, dataset):
    """Return ``(model, batch_size, epochs)`` for ``model_name`` on ``dataset``.

    The batch size and epoch count are read from the per-dataset tables in
    ``config``.

    Raises:
        ValueError: If ``model_name`` is not a supported model.
    """
    if model_name == "word_cnn":
        return (word_cnn(dataset),
                config.wordCNN_batch_size[dataset],
                config.wordCNN_epochs[dataset])
    if model_name == "word_bdlstm":
        return (bd_lstm(dataset),
                config.bdLSTM_batch_size[dataset],
                config.bdLSTM_epochs[dataset])
    if model_name == "char_cnn":
        return (char_cnn(dataset),
                config.charCNN_batch_size[dataset],
                config.charCNN_epochs[dataset])
    if model_name == "word_lstm":
        return (lstm(dataset),
                config.LSTM_batch_size[dataset],
                config.LSTM_epochs[dataset])
    raise ValueError(
        "unsupported model {!r}; expected one of 'word_cnn', 'word_bdlstm', "
        "'char_cnn', 'word_lstm'".format(model_name))


def train_text_classifier():
    """Train the classifier selected by the global ``args`` and save its weights.

    Reads ``args.dataset``, ``args.level`` and ``args.model``.  The training
    data is shuffled with ``random_state=0``, the model is fitted with a 0.2
    validation split, evaluated on the test split, and its weights are written
    to ``./runs/<dataset>/<model>.dat``.  TensorBoard logs go to
    ``./logs/<dataset>/all_<model>/``.

    Raises:
        ValueError: If the first four characters of ``args.model`` differ
            from ``args.level``, or if the dataset, level or model name is
            not supported.  (Previously the mismatch was an ``assert`` that
            is stripped under ``python -O``, and unsupported values
            surfaced later as errors on ``None``.)
    """
    dataset = args.dataset

    # Model names are prefixed with their input level ("word_..." / "char_...").
    # Check this before loading any data so a mismatch fails fast.
    if args.model[:4] != args.level:
        raise ValueError(
            "model {!r} does not match level {!r}".format(args.model, args.level))

    x_train, y_train, x_test, y_test = _load_train_test_data(dataset, args.level)
    x_train, y_train = shuffle(x_train, y_train, random_state=0)

    # Take a look at the shapes
    print('dataset:', dataset, '; model:', args.model, '; level:', args.level)
    print('X_train:', x_train.shape)
    print('y_train:', y_train.shape)
    print('X_test:', x_test.shape)
    print('y_test:', y_test.shape)

    log_dir = './logs/{}/all_{}/'.format(dataset, args.model)
    tb_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0, write_graph=True)

    model_path = r'./runs/{}/{}.dat'.format(dataset, args.model)
    my_file.create_folder(r'./runs/{}'.format(dataset))

    model, batch_size, epochs = _build_model(args.model, dataset)

    print('Train...')
    print('batch_size: ', batch_size, "; epochs: ", epochs)
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              shuffle=True,
              callbacks=[tb_callback])
    scores = model.evaluate(x_test, y_test)
    print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))
    print('Saving model weights...')
    model.save_weights(model_path)
# TEST_SIZE = 20 # test_idx = np.random.choice(len(dataset.test_y), SAMPLE_SIZE, replace=False) # test_len = [] # for i in range(SAMPLE_SIZE): # test_len.append(len(dataset.test_seqs2[test_idx[i]])) # print('Shortest sentence in our test set is %d words' %np.min(test_len)) TEST_SIZE = None test_size = len(dataset.test_y) test_idx_list = np.arange(len(dataset.test_y)) # np.random.shuffle(test_idx_list) test_list = [] cur_result_folder = f'{algo}_{dataset_name}/{SEED}' my_file.create_folder(GA_OUT_PATH, cur_result_folder) cur_log_file = open( my_file.real_path_of(GA_OUT_PATH, cur_result_folder, 'log.txt'), 'a') cur_logger = GAIMDBLogger(cur_log_file) cur_recorder = GARecorderIMDB() st = time() for test_idx in test_idx_list: x_orig = test_x[test_idx] orig_label = test_y[test_idx] orig_preds = model.predict(sess, x_orig[np.newaxis, :])[0] if np.argmax(orig_preds) != orig_label: print('skipping wrong classifed ..') print('--------------------------')