def gen_eda(train_orig, output_file, alpha, num_aug=9):
    df = pd.read_csv(train_orig)
    out = open(output_file, "a", newline="")
    csv_write = csv.writer(out)
    csv_write.writerow(['sentence_a', 'sentence_b', 'category'])
    for i, row in tqdm(df.iterrows()):
        try:
            sentence_a = str(row['sentence_a'])
            sentence_b = str(row['sentence_b'])
            category = str(row['category'])
            aug_sentences_a = eda(sentence_a, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
            aug_sentences_b = eda(sentence_b, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
            # pair each augmented variant of sentence_a with the matching variant of sentence_b
            for aug_sentence_a, aug_sentence_b in zip(aug_sentences_a, aug_sentences_b):
                csv_write.writerow([aug_sentence_a, aug_sentence_b, category])
        except IndexError:
            print("Index Error for sample " + str(i))
    out.close()
    print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug))
def gen_eda(train_orig, output_file, alpha, num_aug):
    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[-1]
        sentence = parts[1]
        # augment the rarer CPR classes more heavily; all other labels get no augmentation
        if label == 'CPR:3':
            num_aug = 1
        elif label == 'CPR:9':
            num_aug = 1
        elif label == 'CPR:6':
            num_aug = 3
        elif label == 'CPR:5':
            num_aug = 5
        else:
            num_aug = 0
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(parts[0] + "\t" + aug_sentence + "\t" + label + '\n')
    writer.close()
    print("generated augmented sentences with eda for " + train_orig + " to " + output_file)
def gen_eda(train_orig, output_file, alpha_sr, alpha_ri, alpha_rs, alpha_rd, num_aug=9):
    import csv
    writer_fp = open(output_file, "w", newline="")
    writer = csv.writer(writer_fp, delimiter="\t", quoting=csv.QUOTE_ALL)
    writer.writerow(["index", "label", "text"])
    line_counter = 0
    with open(train_orig, "r", newline="") as fp:
        reader = csv.reader(fp, delimiter="\t", quoting=csv.QUOTE_ALL)
        next(reader, None)  # skip the header row
        for line in reader:
            label = line[1]
            sentence = line[2]
            aug_sentences = eda(sentence, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=num_aug)
            for aug_sentence in aug_sentences:
                writer.writerow([line_counter, label, aug_sentence])
                line_counter += 1
    writer_fp.close()
    print("Generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug))
def gen_eda(x_train, y_train, alpha=0.1, num_aug=4):
    x_train_aug = x_train
    y_train_aug = y_train
    y_train_aug = [np.argmax(y, axis=None, out=None) for y in y_train_aug]
    n = len(y_train)  # number of training examples
    for i in range(n):
        label = np.argmax(y_train[i])
        # oversample the bottom five classes (labels 16-20) with EDA
        if label >= 16:
            x_train_aug = np.append(x_train_aug, eda(x_train[i], alpha_rs=alpha, num_aug=num_aug), axis=0)
            # eda returns num_aug augmented sentences plus the original, hence num_aug + 1 labels
            for k in range(num_aug + 1):
                y_train_aug = np.append(y_train_aug, np.array(label))
    y_train_aug = tf.keras.utils.to_categorical(y_train_aug, 21)
    print('Before Augmentation: ', Counter(np.argmax(y_train, axis=1)))
    print('After Augmentation: ', Counter(np.argmax(y_train_aug, axis=1)))
    return x_train_aug, y_train_aug
def gen_eda(train_orig, output_file, alpha_sr, alpha_ri, alpha_rs, alpha_rd, num_aug=9):
    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = eda(sentence, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')
    writer.close()
    print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug))
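# A minimal usage sketch for the variant above, assuming `eda` comes from the EDA
# reference implementation (github.com/jasonwei20/eda_nlp) and that train.txt is a
# hypothetical tab-separated file of "label<TAB>sentence" lines.
gen_eda('train.txt', 'train_augmented.txt',
        alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, alpha_rd=0.1,
        num_aug=9)  # writes the augmented variants (the reference eda also returns the original) per line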
def gen_eda(train_orig, output_file, alpha, num_aug=9):
    df = pd.read_csv(train_orig)  # e.g. movie_reviews_review_level.csv
    aug_data = []
    for i in range(len(df.index)):
        print(i)  # progress indicator
        cur_row = df.loc[i]
        label = cur_row['sentiment']
        sentence = cur_row['review']
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            aug_data.append({'review': aug_sentence, 'sentiment': label})
    df_aug = pd.DataFrame(aug_data)
    # fall back to a name that encodes the hyper-parameters if no output path is given
    file_name = output_file or ("IMDB_num_aug=" + str(num_aug) + ",alpha=" + str(alpha) + ".csv")
    df_aug.to_csv(file_name)
def gen_eda(train_orig, output_file, alpha, num_aug=8):
    n = 0
    writer = open(output_file, 'w', encoding='utf-8')
    print("Generating augmented sentences with EDA...")
    with open(train_orig, encoding='utf-8') as fileTrainRaw:
        for line in fileTrainRaw:  # read the file line by line
            try:
                parts = line[:-1].split('\t')
                label = parts[0]
                sentence = parts[1]
                # call eda; in principle the default hyper-parameters should not be relied on
                aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
                for aug_sentence in aug_sentences:
                    writer.write(label + "\t" + aug_sentence + '\n')
                if n % 5 == 0:
                    print(n)  # lightweight progress indicator
            except Exception:
                print("error sentence")
            n = n + 1
    writer.close()
    print("Done!")
    print(output_file)
def gen_eda(train_orig, output_file, alpha, num_aug):
    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()
    default_num_aug = num_aug
    for i, line in enumerate(lines):
        if i == 0:  # skip the header row
            continue
        parts = line.strip().split(',')
        task2 = parts[3]
        # augment offensive and hateful examples more heavily
        if task2 == 'OFFN':
            num_aug = 3
        elif task2 == 'HATE':
            num_aug = 6
        else:
            num_aug = default_num_aug
        aug_sentences = eda(parts[1], alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(parts[0] + "\t" + aug_sentence + "\t" + task2 + "\t" + parts[-1] + '\n')
    writer.close()
    print("generated augmented sentences with eda for " + train_orig + " to " + output_file)
def gen_eda(train_orig, train_orig_label, output_file, output_file_label, alpha, num_aug=9):
    writer = open(output_file, 'w')
    writer_label = open(output_file_label, 'w')
    lines = open(train_orig, 'r').readlines()
    train_orig_label_lines = open(train_orig_label, 'r').readlines()
    for i, line in enumerate(lines):
        part_label = train_orig_label_lines[i][:-1].split('\t')[0]
        part = line[:-1].split('\t')[0]
        dic_part = json.loads(part)
        sentence = dic_part['goal']
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            # copy the dict so each augmented sentence gets its own record
            new_dic_part = dict(dic_part)
            new_dic_part['goal'] = aug_sentence
            try:
                writer.write(json.dumps(new_dic_part) + '\n')
                writer_label.write(part_label + '\n')
            except (TypeError, UnicodeEncodeError):
                pass
    writer.close()
    writer_label.close()
    print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug))
def get_eda(sentence, alpha=0.1, num_aug=4):
    aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
    # drop the spaces eda inserts between tokens (for unsegmented text such as Chinese)
    aug_sentences = [x.replace(' ', '') for x in aug_sentences]
    return aug_sentences
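# A hypothetical call of the helper above. The space-stripping suggests unsegmented
# text such as Chinese, assuming `eda` joins its output tokens with single spaces;
# the input sentence here is illustrative only.
segmented = "今天 天气 很 好"  # pre-tokenized input, tokens separated by spaces
for variant in get_eda(segmented, alpha=0.1, num_aug=4):
    print(variant)  # printed variants have the separator spaces removed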
def gen_eda(train_orig, output_file, alpha, num_aug=9, query_eda=False):
    writer = open(output_file, 'w', encoding='UTF8')
    lines = open(train_orig, 'r', encoding='UTF8').readlines()
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        id = parts[1]
        query = parts[2]
        title = parts[3]
        description = parts[4]
        if query_eda:
            aug_querys = eda(query, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        else:
            # keep the query fixed; repeat it so it zips with the augmented titles and descriptions
            aug_querys = [query] * (num_aug + 1)
        aug_titles = eda(title, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        aug_descriptions = eda(description, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_query, aug_title, aug_description in zip(aug_querys, aug_titles, aug_descriptions):
            writer.write(label + "\t" + id + "\t" + aug_query + "\t" + aug_title + "\t" + aug_description + '\n')
    writer.close()
    print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug))
import argparse
from os.path import dirname, join


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", default="sample.txt", type=str, help="path to the original input data")
    ap.add_argument("--output", default="sample_augmented.txt", required=False, type=str, help="output file for the augmented data")
    ap.add_argument("--num_aug", default=4, required=False, type=int, help="number of augmented sentences per original sentence")
    ap.add_argument("--alpha_sr", default=0.1, required=False, type=float, help="proportion of words per sentence replaced by synonyms")
    ap.add_argument("--alpha_ri", default=0.1, required=False, type=float, help="proportion of words per sentence randomly inserted")
    ap.add_argument("--alpha_rs", default=0.1, required=False, type=float, help="proportion of words per sentence randomly swapped")
    ap.add_argument("--alpha_rd", default=0.1, required=False, type=float, help="proportion of words per sentence randomly deleted")
    args = ap.parse_args()
    with open(args.input, encoding="utf-8") as fi, \
            open(join(dirname(args.input), args.output), "w", encoding="utf-8") as fo:
        for line in fi:
            # split on the first tab only, so tabs inside the text do not break unpacking
            label, text = line.strip().split("\t", 1)
            aug_texts = eda(text, alpha_sr=args.alpha_sr, alpha_ri=args.alpha_ri, alpha_rs=args.alpha_rs, p_rd=args.alpha_rd, num_aug=args.num_aug)
            for aug_text in aug_texts:
                fo.write(f"{label}\t{aug_text}\n")
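# A hypothetical invocation of the script above, assuming it is saved as augment.py
# next to an importable `eda` module; sample.txt holds one "label<TAB>text" pair per
# line, and the augmented file is written alongside the input:
#
#   python augment.py --input sample.txt --output sample_augmented.txt \
#       --num_aug 4 --alpha_sr 0.1 --alpha_ri 0.1 --alpha_rs 0.1 --alpha_rd 0.1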
def gen_eda(train_orig, output_file, alpha, num_aug=9):
    writer = open(output_file, 'w', encoding='utf8')
    lines = open(train_orig, 'r', encoding='utf8').readlines()
    print("Generating augmented sentences with EDA...")
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')  # [:-1] strips the trailing newline
        label = parts[0]
        sentence = parts[1]
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')
    writer.close()
    print("Augmented sentences generated!")
    print(output_file)
def gen_eda(train_orig, output_file, alpha, num_aug=9):
    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')
    writer.close()
def gen_eda(train_orig, output_file, alpha, num_aug=9):
    df_original = pd.read_csv(train_orig)
    labels_original = df_original['label'].tolist()
    sentences_original = df_original['content'].tolist()
    print("Generating augmented sentences with EDA...")
    labels = []
    da_sentences = []
    for index, sentence in enumerate(sentences_original):
        label = labels_original[index]
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        # repeat the label once per augmented sentence so the columns stay aligned
        labels.extend([label] * len(aug_sentences))
        da_sentences.extend(aug_sentences)
    df = pd.DataFrame({'label': labels, 'content': da_sentences})
    df.to_csv(output_file, index=False)
    print("Augmented sentences generated!")
    print(output_file)
def gen_eda(data, text_col, label_col):
    sentences, labels = [], []
    for idx, row in tqdm(data.iterrows()):
        label = row[label_col]
        sentence = row[text_col]
        aug_sentences = eda(sentence, alpha_sr=configs.alpha_sr, alpha_ri=configs.alpha_ri, alpha_rs=configs.alpha_rs, p_rd=configs.p_rd, num_aug=configs.num_aug)
        sentences.append(aug_sentences)
        labels.append([label] * len(aug_sentences))
    # flatten the per-row lists into single columns
    sentences = [j for sub in sentences for j in sub]
    labels = [j for sub in labels for j in sub]
    # strip whitespace & remove duplicates
    aug_data = pd.DataFrame({text_col: sentences, label_col: labels})
    aug_data[text_col] = aug_data[text_col].str.strip()
    aug_data = aug_data.drop_duplicates()
    return aug_data
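# A minimal usage sketch for the DataFrame variant above. `configs` is assumed to be
# a module exposing the EDA hyper-parameters; a SimpleNamespace stands in for it here,
# and the two-row frame is hypothetical toy data.
from types import SimpleNamespace
import pandas as pd

configs = SimpleNamespace(alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=4)
toy = pd.DataFrame({'text': ['a quick brown fox', 'the film was dull'], 'label': [0, 1]})
augmented = gen_eda(toy, text_col='text', label_col='label')
print(len(augmented))  # up to (num_aug + 1) rows per input row, minus stripped duplicates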
def eda(self, sentence, aspect, adjusted=False):
    # mask the aspect with a placeholder so EDA does not alter it
    sent_adjusted = sentence.replace(aspect, '$t$')
    assert sent_adjusted != sentence, 'Something went wrong, the aspect "{}" cannot be found in "{}"'.format(aspect, sentence)
    augmented_sent = eda(sent_adjusted, aspect, alpha_ri=FLAGS.EDA_insertion, alpha_rs=FLAGS.EDA_swap,
                         alpha_sr=FLAGS.EDA_replacement, p_rd=FLAGS.EDA_deletion, percentage=FLAGS.EDA_pct,
                         adjusted=adjusted, counter=self.counter)
    augmented_with_aspect = []
    for sent in augmented_sent:
        augmented_with_aspect.append(sent.replace('$t$', aspect))
        assert sent != sent.replace('$t$', aspect), 'Something went wrong, the placeholder "{}" cannot be found in "{}"'.format('$t$', sent)
    return augmented_with_aspect, aspect
def gen_eda(train_orig, output_file, alpha, path_to_synonyms, num_aug=9):
    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug, path_to_synonyms=path_to_synonyms)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')
    writer.close()
    print(f"generated augmented sentences with eda for {train_orig} to {output_file} with num_aug={num_aug}")
def augment(path):
    # pick TOTAL files at random, rename each to *_0.txt, then create copies *_1.txt ... *_NUM_AUG.txt
    filenames = random.sample(glob.glob(path + "/*txt"), TOTAL)
    for filename in filenames:
        new_filename = filename[:-4] + "_" + str(0) + filename[-4:]
        os.rename(filename, new_filename)
        for i in range(1, NUM_AUG + 1):
            copy_filename = new_filename[:-5] + str(i) + new_filename[-4:]
            shutil.copyfile(new_filename, copy_filename)
        with open(new_filename, 'r', encoding='utf-8') as fpr:
            data_raw = json.load(fpr)
        article = data_raw['article']
        aug_article = eda(article, alpha_sr=ALPHA, alpha_ri=ALPHA, alpha_rs=ALPHA, p_rd=ALPHA, num_aug=NUM_AUG)
        # write one augmented article into each copy
        for i in range(NUM_AUG + 1):
            write_file = new_filename[:-5] + str(i) + new_filename[-4:]
            with open(write_file, 'r', encoding='utf-8') as fpr:
                data_raw = json.load(fpr)
            data_raw['article'] = aug_article[i]
            with open(write_file, 'w', encoding='utf-8') as fpr:
                fpr.write(json.dumps(data_raw))
    lines.append(line)  # (tail of the loop above that collects the raw rows)

ori_df = pd.DataFrame(columns=["sentence1", "sentence2", "label"])
for (i, line) in enumerate(lines):
    if i == 0:  # skip the header row
        continue
    text_a = line[1]
    text_b = line[2]
    label = line[3]
    ori_df = ori_df.append({'sentence1': text_a, 'sentence2': text_b, 'label': label}, ignore_index=True)
print(ori_df.head())

# collect one augmented sentence per original, tagged with the augmentation method used
aug_df = pd.DataFrame(columns=['sentence1', 'label'])
for i in ori_df.sentence1:
    ori_sentence = i
    method_label = np.random.randint(0, 4, 1)[0]
    method = augment_single_with_label(method_label)
    aug_sentences = eda(ori_sentence, alpha=0.15, num_aug=1, method=method)
    for aug_sentence in aug_sentences:
        aug_df = aug_df.append({'sentence1': aug_sentence, 'label': method}, ignore_index=True)
print("generated augmented sentences finished.")
print(aug_df['label'].value_counts(normalize=True) * 100)
aug_df.to_csv('augment_train.tsv', sep='\t', index=False)
def sentimentGenerateTestSuite(r, threshold_CC, threshold_MC, symbols_SQ, seq, TestCaseNum, minimalTest, TargMetri, CoverageStop):
    r.resetTime()
    random.seed(1)
    # set oracle radius
    oracleRadius = 0.2
    # load model
    sm = Sentiment()
    sm.load_model()
    # test layer
    layer = 1
    termin = 0
    # containers for minimal test dataset generation
    if minimalTest != '0':
        ncdata, ccdata, mcdata, sqpdata, sqndata = [], [], [], [], []
    # predict sentiment for a seed review
    review = "i really dislike the movie"
    tmp = sm.fromTextToID(review)
    test = np.squeeze(sm.pre_processing_x(tmp))
    h_t, c_t, f_t = sm.cal_hidden_state(test)
    # input seeds
    X_train = sm.X_train[random.sample(range(20000), 5000)]
    # test objective NC
    nctoe = NCTestObjectiveEvaluation(r)
    nctoe.model = sm.model
    nctoe.testObjective.layer = layer
    nctoe.testCase = test
    activations_nc = nctoe.get_activations()
    nctoe.testObjective.feature = (np.argwhere(activations_nc >= np.min(activations_nc))).tolist()
    nctoe.testObjective.setOriginalNumOfFeature()
    # test objective CC
    cctoe = CCTestObjectiveEvaluation(r)
    cctoe.model = sm.model
    cctoe.testObjective.layer = layer
    cctoe.hidden = h_t
    cctoe.threshold = float(threshold_CC)
    activations_cc = cctoe.get_activations()
    total_features_cc = (np.argwhere(activations_cc >= np.min(activations_cc))).tolist()
    cctoe.testObjective.feature = total_features_cc
    cctoe.testObjective.setOriginalNumOfFeature()
    cctoe.testObjective.setfeaturecount()
    # test objective MC
    mctoe = MCTestObjectiveEvaluation(r)
    mctoe.model = sm.model
    mctoe.testObjective.layer = layer
    mctoe.hidden = f_t
    mctoe.threshold = float(threshold_MC)
    activations_mc = mctoe.get_activations()
    total_features_mc = (np.argwhere(activations_mc >= np.min(activations_mc))).tolist()
    mctoe.testObjective.feature = total_features_mc
    mctoe.testObjective.setOriginalNumOfFeature()
    mctoe.testObjective.setfeaturecount()
    # test objective SQ
    sqtoe = SQTestObjectiveEvaluation(r)
    sqtoe.model = sm.model
    sqtoe.testObjective.layer = layer
    sqtoe.symbols = int(symbols_SQ)
    # generate all the features: choose time steps to cover
    t1 = int(seq[0])
    t2 = int(seq[1])
    indices = slice(t1, t2 + 1)  # e.g. slice(480, 485)
    # characters to represent the time series symbolically
    alpha_list = [chr(i) for i in range(97, 97 + int(symbols_SQ))]
    symb = ''.join(alpha_list)
    sqtoe.testObjective.feature_p = list(iter.product(symb, repeat=t2 - t1 + 1))
    sqtoe.testObjective.feature_n = list(iter.product(symb, repeat=t2 - t1 + 1))
    sqtoe.testObjective.setOriginalNumOfFeature()

    def append_case(path, *fields):
        # append one tab-separated record to a results file
        with open(path, 'a') as f:
            f.write('\t'.join(str(x) for x in fields) + '\n')

    for test in X_train:
        for i in range(4):
            text = sm.fromIDToText(test)
            (label1, conf1) = sm.displayInfo(test)
            # get the next input test2 by perturbing the test case with EDA
            alpha = random.uniform(0.001, oracleRadius)
            aug_text = eda(text, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=1)
            tmp = sm.fromTextToID(str(aug_text[0]))
            test2 = np.squeeze(sm.pre_processing_x(tmp))
            if test2 is not None:
                (label2, conf2) = sm.displayInfo(test2)
                h_t, c_t, f_t = sm.cal_hidden_state(test2)
                sm.updateSample(label2, label1, alpha, True)
                # update NC coverage
                nctoe.testCase = test2
                nctoe.update_features()
                # update CC coverage
                cctoe.hidden = h_t
                cctoe.update_features()
                # update MC coverage
                mctoe.hidden = f_t
                mctoe.update_features()
                # update SQ coverage
                sqtoe.hidden = h_t
                sqtoe.update_features(indices)
                # write information to file
                writeInfo(r, sm.numSamples, sm.numAdv, sm.perturbations, nctoe.coverage, cctoe.coverage, mctoe.coverage, sqtoe.coverage_p, sqtoe.coverage_n)
                # termination condition
                if TargMetri == 'CC':
                    termin = cctoe.coverage
                elif TargMetri == 'GC':
                    termin = mctoe.coverage
                elif TargMetri == 'SQN':
                    termin = sqtoe.coverage_n
                elif TargMetri == 'SQP':
                    termin = sqtoe.coverage_p
                # output test cases and adversarial examples
                if minimalTest == '0':
                    append_case('output/test_set.txt', label1, aug_text[0])
                    if label2 != label1:
                        append_case('adv_output/adv_test_set.txt', label1, label2, aug_text[0])
                else:
                    if nctoe.minimal == 1:
                        ncdata.append(test2)
                        append_case('minimal_nc/test_set.txt', label1, aug_text[0])
                    if cctoe.minimal == 1:
                        ccdata.append(test2)
                        append_case('minimal_cc/test_set.txt', label1, aug_text[0])
                    if mctoe.minimal == 1:
                        mcdata.append(test2)
                        append_case('minimal_mc/test_set.txt', label1, aug_text[0])
                    if sqtoe.minimalp == 1:
                        sqpdata.append(test2)
                        append_case('minimal_sqp/test_set.txt', label1, aug_text[0])
                    if sqtoe.minimaln == 1:
                        sqndata.append(test2)
                        append_case('minimal_sqn/test_set.txt', label1, aug_text[0])
            # check termination condition
            if sm.numSamples < int(TestCaseNum) and termin < float(CoverageStop):
                continue
            else:
                io.savemat('log_folder/feature_count_CC.mat', {'feature_count_CC': cctoe.testObjective.feature_count})
                io.savemat('log_folder/feature_count_GC.mat', {'feature_count_GC': mctoe.testObjective.feature_count})
                break
        if sm.numSamples < int(TestCaseNum) and termin < float(CoverageStop):
            continue
        else:
            break
    print("statistics: \n")
    nctoe.displayCoverage()
    cctoe.displayCoverage()
    mctoe.displayCoverage()
    sqtoe.displayCoverage1()
    sqtoe.displayCoverage2()
    sm.displaySamples()
    sm.displaySuccessRate()
def generate_aug_eda(X_train, y_train, eda_args, first):
    alpha_sr = eda_args.alpha_sr
    alpha_ri = eda_args.alpha_ri
    alpha_rs = eda_args.alpha_rs
    alpha_rd = eda_args.alpha_rd
    num_aug = eda_args.num_aug
    # create dataframes for storing the augmented data
    X_train_aug = pd.DataFrame(columns=['X'])
    y_train_aug = pd.DataFrame(columns=['y'])
    # iterate over the current dataset
    num_sample = len(X_train)
    for index in tqdm(range(num_sample)):
        # on the first pass the inputs are plain Series; on later passes they are
        # the single-column frames returned by a previous call
        if first:
            X_train_sample = X_train.iloc[index]
            y_train_sample = y_train.iloc[index]
        else:
            X_train_sample = X_train['X'].iloc[index]
            y_train_sample = y_train['y'].iloc[index]
        if y_train_sample:
            # perform augmentation with eda using the current hyper-parameters
            X_train_sample = cleanText(X_train_sample)
            aug_sentences = eda(X_train_sample, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=num_aug)
            # append every augmented sentence to the dataset
            for aug_sentence in aug_sentences:
                X_train_aug = X_train_aug.append({'X': aug_sentence}, ignore_index=True)
                y_train_aug = y_train_aug.append({'y': y_train_sample}, ignore_index=True)
        else:
            X_train_aug = X_train_aug.append({'X': X_train_sample}, ignore_index=True)
            y_train_aug = y_train_aug.append({'y': y_train_sample}, ignore_index=True)
    return X_train_aug, y_train_aug
def sentimentGenerateTestSuite(r, threshold_SC, threshold_BC, symbols_TC, seq, TestCaseNum, Mutation, CoverageStop):
    r.resetTime()
    seeds = 3
    random.seed(seeds)
    # set oracle radius
    oracleRadius = 0.2
    # load model
    sm = Sentiment()
    sm.load_model()
    # test layer
    layer = 1
    # choose time steps to cover
    t1 = int(seq[0])
    t2 = int(seq[1])
    indices = slice(t1, t2 + 1)
    # calculate mean and std for z-norm
    h_train = sm.cal_hidden_keras(sm.X_train, layer)
    mean_TC, std_TC, max_SC, min_SC, max_BC, min_BC = aggregate_inf(h_train, indices)
    # get the seeds pool: 100 training examples per class
    X_seeds = []
    for label_idx in range(2):
        x_class = sm.X_train[sm.y_train == label_idx]
        for i in range(100, 200):
            X_seeds.append(x_class[i])
    # predict sentiment for a sample review
    review = "really good film to watch and highly recommended"
    tmp = sm.fromTextToID(review)
    test = np.squeeze(sm.pre_processing_x([tmp]))
    [h_t, c_t, f_t] = sm.cal_hidden_state(test, layer)
    # test objective NC
    nctoe = NCTestObjectiveEvaluation(r)
    threshold_nc = 0
    nctoe.testObjective.setParamters(sm.model, layer, threshold_nc, test)
    # test objective KMNC
    kmnctoe = KMNCTestObjectiveEvaluation(r)
    k_sec = 10
    kmnctoe.testObjective.setParamters(sm.model, layer, k_sec, test)
    # test objective NBC
    nbctoe = NBCTestObjectiveEvaluation(r)
    ub = 0.7
    lb = -0.7
    nbctoe.testObjective.setParamters(sm.model, layer, ub, lb, test)
    # test objective SNAC
    snactoe = NCTestObjectiveEvaluation(r)
    threshold_snac = 0.7
    snactoe.testObjective.setParamters(sm.model, layer, threshold_snac, test)
    # test objective SC
    SCtoe = SCTestObjectiveEvaluation(r)
    SC_test_obj = 'h'
    act_SC = SCtoe.get_activations(np.array([h_t]))
    SCtoe.testObjective.setParamters(sm.model, SC_test_obj, layer, float(threshold_SC), indices, max_SC, min_SC, np.squeeze(act_SC))
    # test objective BC
    BCtoe = BCTestObjectiveEvaluation(r)
    BC_test_obj = 'h'
    act_BC = BCtoe.get_activations(np.array([h_t]))
    BCtoe.testObjective.setParamters(sm.model, BC_test_obj, layer, float(threshold_BC), indices, max_BC, min_BC, np.squeeze(act_BC))
    # test objective TC
    TCtoe = TCTestObjectiveEvaluation(r)
    seq_len = 5
    TC_test_obj = 'h'
    act_TC = TCtoe.get_activations(np.array([h_t]))
    TCtoe.testObjective.setParamters(sm.model, TC_test_obj, layer, int(symbols_TC), seq_len, indices, mean_TC, std_TC)

    text_seeds = [sm.fromIDToText(item) for item in X_seeds]
    y_seeds = sm.getOutputResult(X_seeds)
    X_test = []
    r_t = 400 // len(X_seeds)
    while sm.numSamples < int(TestCaseNum):
        # generate test cases by perturbing every seed with EDA
        unique_test = np.repeat(np.arange(len(X_seeds)), r_t, axis=0)
        y_test1 = np.repeat(y_seeds, r_t, axis=0)
        alpha = random.uniform(0.01, oracleRadius)
        aug_text = []
        for text in text_seeds:
            out = eda(text, sm.numSamples, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=r_t)
            aug_text = aug_text + out
        tmp = [sm.fromTextToID(text) for text in aug_text]
        test2 = sm.pre_processing_x(tmp)
        # after the first round, genetic mutation also carries over the GA-boosted cases
        if sm.numSamples > 0 and Mutation == 'genetic':
            y_test1 = np.concatenate((y_test1, np.array([sc_test_1]), np.array([bc_test_1]), np.array([tc_test_1])))
            test2 = np.concatenate((test2, np.array([sc_test_2]), np.array([bc_test_2]), np.array([tc_test_2])))
            unique_test = np.concatenate((unique_test, np.array([seed_id_sc]), np.array([seed_id_bc]), np.array([seed_id_tc])))
        y_test2 = sm.getOutputResult(test2)
        # display statistics of adversarial examples
        sm.displayInfo(y_test1, y_test2, alpha, unique_test)
        # calculate the hidden states
        h_test = sm.cal_hidden_keras(test2, layer)
        # update the coverage metrics
        nctoe.update_features(test2)
        kmnctoe.update_features(test2)
        nbctoe.update_features(test2)
        snactoe.update_features(test2)
        SCtoe.update_features(h_test, len(X_test))
        BCtoe.update_features(h_test, len(X_test))
        TCtoe.update_features(h_test, len(X_test))
        X_test = X_test + test2.tolist()
        if Mutation == 'genetic':
            num_generation = 10
            sc_test_record = SCtoe.testObjective.test_record
            bc_test_record = BCtoe.testObjective.test_record
            tc_test_record = TCtoe.testObjective.test_record
            if len(sc_test_record) != 0:
                print('boost coverage for SC')
                sc_feature, sc_cov_fit = random.choice(list(sc_test_record.items()))
                seed_id_sc = sc_cov_fit[0] % len(X_seeds)
                sc_test_1 = y_seeds[seed_id_sc]
                # boost coverage with GA
                sc_test_2 = getNextInputByGA(sm, SCtoe, sc_feature, np.array(X_test[sc_cov_fit[0]]), num_generation, sm.numSamples)
                print('\n')
            if len(bc_test_record) != 0:
                print('boost coverage for BC')
                bc_feature, bc_cov_fit = random.choice(list(bc_test_record.items()))
                seed_id_bc = bc_cov_fit[0] % len(X_seeds)
                bc_test_1 = y_seeds[seed_id_bc]
                # boost coverage with GA
                bc_test_2 = getNextInputByGA(sm, BCtoe, bc_feature, np.array(X_test[bc_cov_fit[0]]), num_generation, sm.numSamples)
                print('\n')
            if len(tc_test_record) != 0:
                print('boost coverage for TC')
                tc_feature, tc_cov_fit = random.choice(list(tc_test_record.items()))
                seed_id_tc = tc_cov_fit[1] % len(X_seeds)
                tc_test_1 = y_seeds[seed_id_tc]
                # boost coverage with GA
                tc_test_2 = getNextInputByGA(sm, TCtoe, tc_feature, np.array(X_test[tc_cov_fit[1]]), num_generation, sm.numSamples)
        # write information to file
        writeInfo(r, sm.numSamples, sm.numAdv, sm.perturbations, nctoe.coverage, kmnctoe.coverage, nbctoe.coverage, snactoe.coverage, SCtoe.coverage, BCtoe.coverage, TCtoe.coverage, len(sm.unique_adv))

    print("statistics: \n")
    nctoe.displayCoverage()
    kmnctoe.displayCoverage()
    nbctoe.displayCoverage()
    snactoe.displayCoverage()
    SCtoe.displayCoverage()
    BCtoe.displayCoverage()
    TCtoe.displayCoverage()
    print('unique adv.', len(sm.unique_adv))
    sm.displaySuccessRate()
    # (tail of the distractor-sampling loop over training examples)
    np.random.seed(example_id)
    Context.append(knowledgebase.Context.values[example_id])
    Utterance.append(np.random.choice(tmp_distractors, 1)[0])
    Labels.append(0)

train_df = pd.DataFrame({'Context': Context, 'Utterance': Utterance, 'Label': Labels})

# augment each context with EDA: num_aug=15 plus the original yields 16 rows per
# example, each paired with the same utterance and label
for i in range(len(train_df)):
    try:
        tmp_df = pd.DataFrame({
            'Context': eda(train_df.Context.values[i], .2, .2, .2, .2, 15),
            'Utterance': list(np.repeat(train_df.Utterance.values[i], 16)),
            'Label': list(np.repeat(train_df.Label.values[i], 16))
        })
        train_df = train_df.append(tmp_df)
    except Exception:
        pass
train_df.reset_index(level=None, drop=True, inplace=True)

Context, WOzAnswers, Labels = [], [], []
KBanswers = list(np.unique(knowledgebase.Utterance.values))
for example_id in range(len(validation_dialogues)):
    exampleWOzAnswers = list(validation_dialogues.iloc[example_id, 7:].values)
    # exclude the true answer; every other knowledge-base utterance is a candidate distractor
    id_to_exclude = KBanswers.index(knowledgebase.Utterance.values[example_id])
    tmp_distractors = [KBanswers[i]
                       for i in np.array(range(len(KBanswers)))[np.isin(range(len(KBanswers)), id_to_exclude, invert=True)]]
    np.random.seed(example_id)
    Context.append(knowledgebase.Context.values[example_id])
    Utterance.append(np.random.choice(tmp_distractors, 1)[0])
    Labels.append(0)

train_df = pd.DataFrame({'Context': Context, 'Utterance': Utterance, 'Label': Labels})

for i in range(len(train_df)):
    try:
        tmp_df = pd.DataFrame({
            'Context': eda(train_df.Context.values[i], .2, .2, .2, .2, 15),
            'Utterance': list(np.repeat(train_df.Utterance.values[i], 16)),
            'Label': list(np.repeat(train_df.Label.values[i], 16))
        })
        train_df = train_df.append(tmp_df)
    except Exception:
        pass
train_df.reset_index(level=None, drop=True, inplace=True)

# split roughly 2/3 train and 1/3 test
ar_msk = np.random.rand(len(train_df)) < 2 / 3
train_df[ar_msk].to_csv('data/data.csv', encoding='utf-8', index=False)
train_df[~ar_msk].to_csv('data/data_test.csv', encoding='utf-8', index=False)

# clear space
del train_df, ar_msk