Example 1
def gen_eda(train_orig, output_file, alpha, num_aug=9):

    lines = pd.read_csv(train_orig)
    out = open(output_file, "a", newline="")
    csv_write = csv.writer(out)
    csv_write.writerow(['sentence_a', 'sentence_b', 'category'])
    for i, line in tqdm(lines.iterrows()):
        try:
            sentence_a = str(line['sentence_a'])
            sentence_b = str(line['sentence_b'])
            category = str(line['category'])
            aug_sentences_a = eda(sentence_a,
                                  alpha_sr=alpha,
                                  alpha_ri=alpha,
                                  alpha_rs=alpha,
                                  p_rd=alpha,
                                  num_aug=num_aug)
            aug_sentences_b = eda(sentence_b,
                                  alpha_sr=alpha,
                                  alpha_ri=alpha,
                                  alpha_rs=alpha,
                                  p_rd=alpha,
                                  num_aug=num_aug)
            for aug_sentence_a, aug_sentence_b in zip(aug_sentences_a,
                                                      aug_sentences_b):
                csv_write.writerow([aug_sentence_a, aug_sentence_b, category])
        except IndexError:
            print("Index Error for sample " + str(i))

    out.close()
    print("generated augmented sentences with eda for " + train_orig + " to " +
          output_file + " with num_aug=" + str(num_aug))
Example 2
def gen_eda(train_orig, output_file, alpha, num_aug):

    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()

    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[-1]
        sentence = parts[1]
        # per-class augmentation counts (these override the num_aug argument)
        if label == 'CPR:3':
            num_aug = 1
        elif label == 'CPR:9':
            num_aug = 1
        elif label == 'CPR:6':
            num_aug = 3
        elif label == 'CPR:5':
            num_aug = 5
        else:
            num_aug = 0

        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(parts[0] + "\t" + aug_sentence +"\t"+ label +'\n')

    writer.close()
    print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug))
Example 3
def gen_eda(train_orig,
            output_file,
            alpha_sr,
            alpha_ri,
            alpha_rs,
            alpha_rd,
            num_aug=9):

    import csv
    writer_fp = open(output_file, "w", newline="")
    writer = csv.writer(writer_fp, delimiter="\t", quoting=csv.QUOTE_ALL)
    writer.writerow(["index", "label", "text"])
    line_counter = 0
    with open(train_orig, "r", newline="") as fp:
        reader = csv.reader(fp, delimiter="\t", quoting=csv.QUOTE_ALL)
        next(reader, None)
        for line in reader:
            label = line[1]
            sentence = line[2]
            aug_sentences = eda(sentence,
                                alpha_sr=alpha_sr,
                                alpha_ri=alpha_ri,
                                alpha_rs=alpha_rs,
                                p_rd=alpha_rd,
                                num_aug=num_aug)
            for aug_sentence in aug_sentences:
                writer.writerow([line_counter, label, aug_sentence])
                line_counter += 1

    writer_fp.close()
    print("Generated augmented sentences with eda for " + train_orig + " to " +
          output_file + " with num_aug=" + str(num_aug))
def gen_eda(x_train, y_train, alpha=0.1, num_aug=4):

    x_train_aug = x_train
    y_train_aug = y_train
    y_train_aug = [np.argmax(y, axis=None, out=None) for y in y_train_aug]

    n = len(y_train)  # number of training samples

    for i in range(n):
        label = np.argmax(y_train[i])

        # bottom 5 classes (labels 16-20) get augmented
        if label >= 16:
            x_train_aug = np.append(x_train_aug,
                                    eda(x_train[i],
                                        alpha_rs=alpha,
                                        num_aug=num_aug),
                                    axis=0)
            # eda returns num_aug + 1 sentences (augmented + original)
            for _ in range(num_aug + 1):
                y_train_aug = np.append(y_train_aug, np.array(label))

    y_train_aug = tf.keras.utils.to_categorical(y_train_aug, 21)

    print('Before Augmentation: ', Counter(np.argmax(y_train, axis=1)))
    print('After Augmentation: ', Counter(np.argmax(y_train_aug, axis=1)))

    return x_train_aug, y_train_aug
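Example 5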
def gen_eda(train_orig,
            output_file,
            alpha_sr,
            alpha_ri,
            alpha_rs,
            alpha_rd,
            num_aug=9):

    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()

    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = eda(sentence,
                            alpha_sr=alpha_sr,
                            alpha_ri=alpha_ri,
                            alpha_rs=alpha_rs,
                            p_rd=alpha_rd,
                            num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')

    writer.close()
    print("generated augmented sentences with eda for " + train_orig + " to " +
          output_file + " with num_aug=" + str(num_aug))
def gen_eda(train_orig, output_file, alpha, num_aug=9):

    # note: train_orig and output_file are ignored; this variant hard-codes
    # the input CSV and derives the output file name from the parameters
    df = pd.read_csv('movie_reviews_review_level.csv')
    aug_data = []

    for i in range(len(df.index)):
        print(i)
        cur_row = df.loc[i]
        label = cur_row['sentiment']
        sentence = cur_row['review']
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            aug_data.append({'review': aug_sentence, 'sentiment': label})

    df_aug = pd.DataFrame(aug_data)
    file_name = "IMDB_num_aug=" + str(num_aug) + ",alpha=" + str(alpha) + ".csv"
    df_aug.to_csv(file_name)
Example 7
def gen_eda(train_orig, output_file, alpha, num_aug=8):
    n = 0
    writer = open(output_file, 'w', encoding='utf-8')
    print("Generating augmented sentences with EDA...")
    with open(train_orig, encoding='utf-8') as fileTrainRaw:
        for line in fileTrainRaw:  # read the file line by line
            try:
                parts = line[:-1].split('\t')
                label = parts[0]
                sentence = parts[1]
                aug_sentences = eda(sentence,
                                    alpha_sr=alpha,
                                    alpha_ri=alpha,
                                    alpha_rs=alpha,
                                    p_rd=alpha,
                                    num_aug=num_aug)  # call eda; in principle the defaults should not be relied on
                for aug_sentence in aug_sentences:
                    writer.write(label + "\t" + aug_sentence + '\n')
                if n % 5 == 0:
                    print(n)
            except Exception:
                print("error sentence")
            n = n + 1
    writer.close()
    print("Done!")
    print(output_file)
Example 8
def gen_eda(train_orig, output_file, alpha, num_aug):

    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()

    for i, line in enumerate(lines):
        if i == 0: continue  # skip the header row
        parts = line.rstrip('\n').split(',')
        task2 = parts[3]

        # augment the rarer classes more heavily; all other labels fall back
        # to the num_aug argument
        if task2 == 'OFFN':
            n_aug = 3
        elif task2 == 'HATE':
            n_aug = 6
        else:
            n_aug = num_aug

        aug_sentences = eda(parts[1],
                            alpha_sr=alpha,
                            alpha_ri=alpha,
                            alpha_rs=alpha,
                            p_rd=alpha,
                            num_aug=n_aug)
        for aug_sentence in aug_sentences:
            writer.write(parts[0] + "\t" + aug_sentence + "\t" + task2 + "\t" +
                         parts[-1] + '\n')

    writer.close()
    print("generated augmented sentences with eda for " + train_orig + " to " +
          output_file + " with num_aug=" + str(num_aug))
Example 9
def gen_eda(train_orig, train_orig_label, output_file, output_file_label, alpha, num_aug=9):
    writer = open(output_file, 'w')
    writer_label = open(output_file_label, 'w')
    lines = open(train_orig, 'r').readlines()
    train_orig_label_lines = open(train_orig_label, 'r').readlines()

    for i, line in enumerate(lines):
        part_label = train_orig_label_lines[i][:-1].split('\t')[0]
        part = line[:-1].split('\t')[0]
        dic_part = json.loads(part)
        sentence = dic_part['goal']
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            new_dic_part = dict(dic_part)  # copy so each variant gets its own dict
            new_dic_part['goal'] = aug_sentence
            try:
                # json.dumps keeps the line readable with json.loads
                # (str(dict) would emit single quotes)
                writer.write(json.dumps(new_dic_part) + '\n')
                writer_label.write(part_label + '\n')
            except Exception:
                pass  # skip samples that fail to encode

    writer.close()
    writer_label.close()
    print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(
        num_aug))
Example 10
def get_eda(sentence, alpha=0.1, num_aug=4):
    aug_sentences = eda(sentence,
                        alpha_sr=alpha,
                        alpha_ri=alpha,
                        alpha_rs=alpha,
                        p_rd=alpha,
                        num_aug=num_aug)
    aug_sentences = [x.replace(' ', '') for x in aug_sentences]  # remove the token-separating spaces from the output
    return aug_sentences
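Example 11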
def gen_eda(train_orig, output_file, alpha, num_aug=9, query_eda=False):

    writer = open(output_file, 'w', encoding='UTF8')
    lines = open(train_orig, 'r', encoding='UTF8').readlines()

    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sample_id = parts[1]  # renamed from `id` to avoid shadowing the builtin
        query = parts[2]
        title = parts[3]
        description = parts[4]
        if query_eda:
            aug_queries = eda(query,
                              alpha_sr=alpha,
                              alpha_ri=alpha,
                              alpha_rs=alpha,
                              p_rd=alpha,
                              num_aug=num_aug)
        else:
            # keep the query fixed; repeat it to align with the eda outputs
            aug_queries = [query] * (num_aug + 1)
        aug_titles = eda(title,
                         alpha_sr=alpha,
                         alpha_ri=alpha,
                         alpha_rs=alpha,
                         p_rd=alpha,
                         num_aug=num_aug)
        aug_descriptions = eda(description,
                               alpha_sr=alpha,
                               alpha_ri=alpha,
                               alpha_rs=alpha,
                               p_rd=alpha,
                               num_aug=num_aug)
        for aug_query, aug_title, aug_description in zip(
                aug_queries, aug_titles, aug_descriptions):
            writer.write(label + "\t" + sample_id + "\t" + aug_query + "\t" +
                         aug_title + "\t" + aug_description + '\n')

    writer.close()
    print("generated augmented sentences with eda for " + train_orig + " to " +
          output_file + " with num_aug=" + str(num_aug))
Example 12
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--input",
                    default="sample.txt",
                    type=str,
                    help="input file containing the original data")
    ap.add_argument("--output",
                    default="sample_augmented.txt",
                    required=False,
                    type=str,
                    help="output file for the augmented data")
    ap.add_argument("--num_aug",
                    default=4,
                    required=False,
                    type=int,
                    help="number of augmented sentences generated per original sentence")
    ap.add_argument("--alpha_sr",
                    default=0.1,
                    required=False,
                    type=float,
                    help="fraction of words per sentence to replace with synonyms")
    ap.add_argument("--alpha_ri",
                    default=0.1,
                    required=False,
                    type=float,
                    help="fraction of words per sentence to randomly insert")
    ap.add_argument("--alpha_rs",
                    default=0.1,
                    required=False,
                    type=float,
                    help="fraction of words per sentence to randomly swap")
    ap.add_argument("--alpha_rd",
                    default=0.1,
                    required=False,
                    type=float,
                    help="fraction of words per sentence to randomly delete")

    args = ap.parse_args()
    with open(args.input, encoding="utf-8") as fi, \
            open(join(dirname(args.input), args.output), "w", encoding="utf-8") as fo:
        for line in fi:
            label, text = line.strip().split("\t", 1)
            aug_texts = eda(text,
                            alpha_sr=args.alpha_sr,
                            alpha_ri=args.alpha_ri,
                            alpha_rs=args.alpha_rs,
                            p_rd=args.alpha_rd,
                            num_aug=args.num_aug)
            for aug_text in aug_texts:
                fo.write(f"{label}\t{aug_text}\n")
Example 13
def gen_eda(train_orig, output_file, alpha, num_aug=9):

    writer = open(output_file, 'w', encoding='utf8')
    lines = open(train_orig, 'r', encoding='utf8').readlines()

    print("正在使用EDA生成增强语句...")
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')    #使用[:-1]是把\n去掉了
        label = parts[0]
        sentence = parts[1]
        aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')

    writer.close()
    print("已生成增强语句!")
    print(output_file)
Example 14
def gen_eda(train_orig, output_file, alpha, num_aug=9):

    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()

    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = eda(sentence,
                            alpha_sr=alpha,
                            alpha_ri=alpha,
                            alpha_rs=alpha,
                            p_rd=alpha,
                            num_aug=num_aug)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')

    writer.close()
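
Like most variants above, this one expects one label<TAB>sentence pair per input line. A minimal round trip, with illustrative file names and assuming the eda helper is in scope, looks like:

# create a toy input file, then augment it (file names are illustrative)
with open('train.txt', 'w') as f:
    f.write('1\tthe movie was great\n')
    f.write('0\tthe movie was awful\n')

gen_eda('train.txt', 'train_aug.txt', alpha=0.1, num_aug=4)
# train_aug.txt now holds num_aug + 1 lines per original sentence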
Example 15
def gen_eda(train_orig, output_file, alpha, num_aug=9):

    df_original = pd.read_csv(train_orig)
    labels_original = df_original['label'].tolist()
    sentences_original = df_original['content'].tolist()

    print("Generating augmented sentences with EDA...")
    labels = []
    da_sentences = []
    for index, sentence in enumerate(sentences_original):
        label = labels_original[index]
        aug_sentences = eda(sentence,
                            alpha_sr=alpha,
                            alpha_ri=alpha,
                            alpha_rs=alpha,
                            p_rd=alpha,
                            num_aug=num_aug)
        labels.extend([label] * len(aug_sentences))
        da_sentences.extend(aug_sentences)

    df = pd.DataFrame({'label': labels, 'content': da_sentences})
    df.to_csv(output_file, index=False)
    print("Augmented sentences generated!")
    print(output_file)
Example 16
def gen_eda(data, text_col, label_col):
    sentences, labels = [], []
    for idx, row in tqdm(data.iterrows()):
        label = row[label_col]
        sentence = row[text_col]
        aug_sentences = eda(sentence,
                            alpha_sr=configs.alpha_sr,
                            alpha_ri=configs.alpha_ri,
                            alpha_rs=configs.alpha_rs,
                            p_rd=configs.p_rd,
                            num_aug=configs.num_aug)
        sentences.append(aug_sentences)
        labels.append([label] * len(aug_sentences))

    sentences = [j for sub in sentences for j in sub]
    labels = [j for sub in labels for j in sub]
    # strip & remove duplicates
    aug_data = pd.DataFrame({text_col: sentences, label_col: labels})
    aug_data[text_col] = aug_data[text_col].str.strip()
    aug_data = aug_data.drop_duplicates()
    return aug_data
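
A quick usage sketch for this DataFrame-based variant, assuming a configs module that defines the five EDA hyperparameters referenced above (the toy data is invented):

import pandas as pd

data = pd.DataFrame({'text': ['a good movie', 'a bad movie'],
                     'label': [1, 0]})
aug_data = gen_eda(data, text_col='text', label_col='label')
print(len(aug_data))  # at most (configs.num_aug + 1) * len(data) rows survive deduplication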
Example 17
 def eda(self, sentence, aspect, adjusted=False):
     sent_adjusted = sentence.replace(aspect, '$t$')
     assert sent_adjusted != sentence, 'Something went wrong, the aspect "{}" cannot be found in "{}"'.format(
         aspect, sentence)
     augmented_sent = eda(sent_adjusted,
                          aspect,
                          alpha_ri=FLAGS.EDA_insertion,
                          alpha_rs=FLAGS.EDA_swap,
                          alpha_sr=FLAGS.EDA_replacement,
                          p_rd=FLAGS.EDA_deletion,
                          percentage=FLAGS.EDA_pct,
                          adjusted=adjusted,
                          counter=self.counter)
     augmented_with_aspect = []
     for sent in augmented_sent:
         assert '$t$' in sent, 'Something went wrong, the placeholder "{}" cannot be found in "{}"'.format(
             '$t$', sent)
         augmented_with_aspect.append(sent.replace('$t$', aspect))
     return augmented_with_aspect, aspect
Example 18
def gen_eda(train_orig, output_file, alpha, path_to_synonyms, num_aug=9):

    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()

    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = eda(sentence,
                            alpha_sr=alpha,
                            alpha_ri=alpha,
                            alpha_rs=alpha,
                            p_rd=alpha,
                            num_aug=num_aug,
                            path_to_synonyms=path_to_synonyms)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')

    writer.close()
    print(
        f"generated augmented sentences with eda for {train_orig} to {output_file} with num_aug={num_aug}"
    )
Example 19
def augment(path):
    # rename each sampled file to *_0.txt, create NUM_AUG copies (*_1 ... *_NUM_AUG),
    # then overwrite each copy's 'article' field with one EDA variant
    filenames = random.sample(glob.glob(path + "/*txt"), TOTAL)
    for filename in filenames:
        new_filename = filename[:-4] + "_" + str(0) + filename[-4:]
        os.rename(filename, new_filename)
        for i in range(1, NUM_AUG + 1):
            copy_filename = new_filename[:-5] + str(i) + new_filename[-4:]
            shutil.copyfile(new_filename, copy_filename)
        with open(new_filename, 'r', encoding='utf-8') as fpr:
            data_raw = json.load(fpr)
            article = data_raw['article']
        aug_article = eda(article,
                          alpha_sr=ALPHA,
                          alpha_ri=ALPHA,
                          alpha_rs=ALPHA,
                          p_rd=ALPHA,
                          num_aug=NUM_AUG)
        for i in range(NUM_AUG + 1):
            write_file = new_filename[:-5] + str(i) + new_filename[-4:]
            with open(write_file, 'r', encoding='utf-8') as fpr:
                data_raw = json.load(fpr)
                data_raw['article'] = aug_article[i]
            with open(write_file, 'w', encoding='utf-8') as fpw:
                fpw.write(json.dumps(data_raw))
Example 20
        lines.append(line)  # fragment: the enclosing file-reading loop is not included in this snippet


ori_df = pd.DataFrame(columns=["sentence1", "sentence2", "label"])
for (i, line) in enumerate(lines):
    if i == 0:
        continue
    text_a = line[1]
    text_b = line[2]
    label = line[3]
    ori_df = ori_df.append({'sentence1': text_a,'sentence2': text_b, 'label': label}, ignore_index=True)

print(ori_df.head())

aug_df = pd.DataFrame(columns=['sentence1', 'label'])  # assumed initialization; the snippet uses aug_df without defining it
for i in ori_df.sentence1:
    ori_sentence = i

    # pick one of four augmentation methods at random
    method_label = np.random.randint(0, 4, 1)[0]
    method = augment_single_with_label(method_label)

    aug_sentences = eda(ori_sentence, alpha=0.15, num_aug=1, method=method)
    for aug_sentence in aug_sentences:
        aug_df = aug_df.append({'sentence1': aug_sentence, 'label': method}, ignore_index=True)

print("generated augmented sentences finished.")

print(aug_df['label'].value_counts(normalize=True) * 100)

aug_df.to_csv('augment_train.tsv', sep='\t', index=False)

Example 21
def sentimentGenerateTestSuite(r, threshold_CC, threshold_MC, symbols_SQ, seq,
                               TestCaseNum, minimalTest, TargMetri, CoverageStop):
    r.resetTime()
    random.seed(1)
    # set oracle radius
    oracleRadius = 0.2
    # load model
    sm = Sentiment()
    sm.load_model()
    # test layer
    layer = 1
    termin = 0
    # minimal test dataset generation
    if minimalTest != '0':
        ncdata = []
        ccdata = []
        mcdata = []
        sqpdata = []
        sqndata = []

    # predict sentiment from reviews
    review = "i really dislike the movie"
    tmp = sm.fromTextToID(review)
    test = np.squeeze(sm.pre_processing_x(tmp))
    h_t, c_t, f_t = sm.cal_hidden_state(test)

    # input seeds
    X_train = sm.X_train[random.sample(range(20000),5000)]

    # test objective NC
    nctoe = NCTestObjectiveEvaluation(r)
    nctoe.model = sm.model
    nctoe.testObjective.layer = layer
    nctoe.testCase = test
    activations_nc = nctoe.get_activations()
    nctoe.testObjective.feature = (np.argwhere(activations_nc >= np.min(activations_nc))).tolist()
    nctoe.testObjective.setOriginalNumOfFeature()

    # test objective CC
    cctoe = CCTestObjectiveEvaluation(r)
    cctoe.model = sm.model
    cctoe.testObjective.layer = layer
    cctoe.hidden = h_t
    cctoe.threshold = float(threshold_CC)
    activations_cc = cctoe.get_activations()
    total_features_cc = (np.argwhere(activations_cc >= np.min(activations_cc))).tolist()
    cctoe.testObjective.feature = total_features_cc
    cctoe.testObjective.setOriginalNumOfFeature()
    cctoe.testObjective.setfeaturecount()

    # test objective MC
    mctoe = MCTestObjectiveEvaluation(r)
    mctoe.model = sm.model
    mctoe.testObjective.layer = layer
    mctoe.hidden = f_t
    mctoe.threshold = float(threshold_MC)
    activations_mc = mctoe.get_activations()
    total_features_mc = (np.argwhere(activations_mc >= np.min(activations_mc))).tolist()
    mctoe.testObjective.feature = total_features_mc
    mctoe.testObjective.setOriginalNumOfFeature()
    mctoe.testObjective.setfeaturecount()

    # test objective SQ
    sqtoe = SQTestObjectiveEvaluation(r)
    sqtoe.model = sm.model
    sqtoe.testObjective.layer = layer
    sqtoe.symbols = int(symbols_SQ)
    # generate all the features
    # choose time steps to cover
    t1 = int(seq[0])
    t2 = int(seq[1])
    indices = slice(t1, t2 + 1)
    # slice(480, 485)
    # characters to represent time series
    alpha_list = [chr(i) for i in range(97, 97 + int(symbols_SQ))]
    symb = ''.join(alpha_list)
    sqtoe.testObjective.feature_p = list(iter.product(symb, repeat=t2 - t1 + 1))  # assumes itertools was imported as `iter`
    sqtoe.testObjective.feature_n = list(iter.product(symb, repeat=t2 - t1 + 1))
    sqtoe.testObjective.setOriginalNumOfFeature()


    for test in X_train:
        for i in range(4):
            text = sm.fromIDToText(test)
            (label1, conf1) = sm.displayInfo(test)
            # get next input test2
            # test case pertubations
            alpha = random.uniform(0.001, oracleRadius)
            aug_text = eda(text, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=1)
            tmp = sm.fromTextToID(str(aug_text[0]))
            test2 = np.squeeze(sm.pre_processing_x(tmp))

            if test2 is not None:
                (label2, conf2) = sm.displayInfo(test2)
                h_t, c_t, f_t = sm.cal_hidden_state(test2)
                cctoe.hidden = h_t
                sm.updateSample(label2, label1, alpha, True)
                # update NC coverage
                nctoe.testCase = test2
                nctoe.update_features()
                # update CC coverage
                cctoe.hidden = h_t
                cctoe.update_features()
                # update MC coverage
                mctoe.hidden = f_t
                mctoe.update_features()
                # update SQ coverage
                sqtoe.hidden = h_t
                sqtoe.update_features(indices)
                # write information to file
                writeInfo(r, sm.numSamples, sm.numAdv, sm.perturbations, nctoe.coverage,
                          cctoe.coverage, mctoe.coverage, sqtoe.coverage_p, sqtoe.coverage_n)

                # terminate condition
                if TargMetri == 'CC':
                    termin = cctoe.coverage
                elif TargMetri == 'GC':
                    termin = mctoe.coverage
                elif TargMetri == 'SQN':
                    termin = sqtoe.coverage_n
                elif TargMetri == 'SQP':
                    termin = sqtoe.coverage_p


                # output test cases and adversarial examples
                def write_case(path, *labels):
                    # append one line: labels separated by tabs, then the text
                    f = open(path, 'a')
                    for lab in labels:
                        f.write(str(lab))
                        f.write('\t')
                    f.writelines(str(aug_text[0]))
                    f.write('\n')
                    f.close()

                if minimalTest == '0':
                    write_case('output/test_set.txt', label1)
                    if label2 != label1:
                        write_case('adv_output/adv_test_set.txt', label1, label2)
                else:
                    if nctoe.minimal == 1:
                        ncdata.append(test2)
                        write_case('minimal_nc/test_set.txt', label1)
                    if cctoe.minimal == 1:
                        ccdata.append(test2)
                        write_case('minimal_cc/test_set.txt', label1)
                    if mctoe.minimal == 1:
                        mcdata.append(test2)
                        write_case('minimal_mc/test_set.txt', label1)
                    if sqtoe.minimalp == 1:
                        sqpdata.append(test2)
                        write_case('minimal_sqp/test_set.txt', label1)
                    if sqtoe.minimaln == 1:
                        sqndata.append(test2)
                        write_case('minimal_sqn/test_set.txt', label1)

            # check termination condition
            if sm.numSamples < int(TestCaseNum) and termin < float(CoverageStop):
                continue
            else:
                io.savemat('log_folder/feature_count_CC.mat', {'feature_count_CC': cctoe.testObjective.feature_count})
                io.savemat('log_folder/feature_count_GC.mat', {'feature_count_GC': mctoe.testObjective.feature_count})
                # if minimalTest != '0':
                #     np.save('minimal_nc/ncdata', ncdata)
                #     np.save('minimal_cc/ccdata', ccdata)
                #     np.save('minimal_mc/mcdata', mcdata)
                #     np.save('minimal_sqp/sqpdata', sqpdata)
                #     np.save('minimal_sqn/sqndata', sqndata)
                break
        if sm.numSamples < int(TestCaseNum) and termin < float(CoverageStop):
            continue
        else:
            break

    print("statistics: \n")
    nctoe.displayCoverage()
    cctoe.displayCoverage()
    mctoe.displayCoverage()
    sqtoe.displayCoverage1()
    sqtoe.displayCoverage2()
    sm.displaySamples()
    sm.displaySuccessRate()
Example 22
def generate_aug_eda(X_train, y_train, eda_args, first):

    alpha_sr = eda_args.alpha_sr
    alpha_ri = eda_args.alpha_ri
    alpha_rs = eda_args.alpha_rs
    alpha_rd = eda_args.alpha_rd
    num_aug = eda_args.num_aug

    # create dataframes for storing augmented data
    X_train_aug = pd.DataFrame(columns=['X'])
    y_train_aug = pd.DataFrame(columns=['y'])

    # iterate over the current dataset
    num_sample = len(X_train)
    for index in tqdm(range(num_sample)):
        # on the first pass the inputs are plain Series; on later passes they
        # are the single-column dataframes produced by this function
        if first:
            X_train_sample = X_train.iloc[index]
            y_train_sample = y_train.iloc[index]
        else:
            X_train_sample = X_train['X'].iloc[index]
            y_train_sample = y_train['y'].iloc[index]

        if y_train_sample:
            # augment samples with a truthy label by using eda with the
            # current hyper-parameters
            X_train_sample = cleanText(X_train_sample)
            aug_sentences = eda(X_train_sample,
                                alpha_sr=alpha_sr,
                                alpha_ri=alpha_ri,
                                alpha_rs=alpha_rs,
                                p_rd=alpha_rd,
                                num_aug=num_aug)

            # append every augmented sentence to the dataset
            for aug_sentence in aug_sentences:
                X_train_aug = X_train_aug.append({'X': aug_sentence},
                                                 ignore_index=True)
                y_train_aug = y_train_aug.append({'y': y_train_sample},
                                                 ignore_index=True)
        else:
            # keep the remaining samples unchanged
            X_train_aug = X_train_aug.append({'X': X_train_sample},
                                             ignore_index=True)
            y_train_aug = y_train_aug.append({'y': y_train_sample},
                                             ignore_index=True)

    return X_train_aug, y_train_aug
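
Note that DataFrame.append was removed in pandas 2.0, so on current pandas the row-wise appends above no longer run; a self-contained sketch of the replacement pattern:

import pandas as pd

# pandas >= 2.0: collect rows as dicts, then concatenate once
X_train_aug = pd.DataFrame(columns=['X'])
new_rows = [{'X': s} for s in ['aug one', 'aug two']]  # stand-in augmented sentences
X_train_aug = pd.concat([X_train_aug, pd.DataFrame(new_rows)], ignore_index=True)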
Example 23
def sentimentGenerateTestSuite(r, threshold_SC, threshold_BC, symbols_TC, seq,
                               TestCaseNum, Mutation, CoverageStop):
    r.resetTime()
    seeds = 3
    random.seed(seeds)
    # set oracle radius
    oracleRadius = 0.2
    # load model
    sm = Sentiment()
    sm.load_model()
    # test layer
    layer = 1

    #choose time step to cover
    t1 = int(seq[0])
    t2 = int(seq[1])
    indices = slice(t1, t2 + 1)

    # calculate mean and std for z-norm
    h_train = sm.cal_hidden_keras(sm.X_train, layer)
    mean_TC, std_TC, max_SC, min_SC, max_BC, min_BC = aggregate_inf(
        h_train, indices)

    # get the seeds pool
    X_seeds = []
    # input seeds
    for label_idx in range(2):
        x_class = sm.X_train[sm.y_train == label_idx]
        for i in range(100, 200):
            X_seeds.append(x_class[i])
    # X_seeds = sm.X_train[sm.y_train == 0]
    # X_seeds = X_seeds[:100]

    # predict sentiment from reviews
    review = "really good film to watch and highly recommended"
    # review = "movie is horrible and watching experience is terrible"
    tmp = sm.fromTextToID(review)
    test = np.squeeze(sm.pre_processing_x([tmp]))
    [h_t, c_t, f_t] = sm.cal_hidden_state(test, layer)

    # test objective NC
    nctoe = NCTestObjectiveEvaluation(r)
    threshold_nc = 0
    nctoe.testObjective.setParamters(sm.model, layer, threshold_nc, test)

    # test objective KMNC
    kmnctoe = KMNCTestObjectiveEvaluation(r)
    k_sec = 10
    kmnctoe.testObjective.setParamters(sm.model, layer, k_sec, test)

    # test objective NBC
    nbctoe = NBCTestObjectiveEvaluation(r)
    ub = 0.7
    lb = -0.7
    nbctoe.testObjective.setParamters(sm.model, layer, ub, lb, test)

    # test objective SNAC
    snactoe = NCTestObjectiveEvaluation(r)
    threshold_snac = 0.7
    snactoe.testObjective.setParamters(sm.model, layer, threshold_snac, test)

    # test objective SC
    SCtoe = SCTestObjectiveEvaluation(r)
    SC_test_obj = 'h'
    act_SC = SCtoe.get_activations(np.array([h_t]))
    SCtoe.testObjective.setParamters(sm.model, SC_test_obj, layer,
                                     float(threshold_SC), indices, max_SC,
                                     min_SC, np.squeeze(act_SC))

    # test objective BC
    BCtoe = BCTestObjectiveEvaluation(r)
    BC_test_obj = 'h'
    act_BC = BCtoe.get_activations(np.array([h_t]))
    BCtoe.testObjective.setParamters(sm.model, BC_test_obj, layer,
                                     float(threshold_BC), indices, max_BC,
                                     min_BC, np.squeeze(act_BC))

    # test objective TC
    TCtoe = TCTestObjectiveEvaluation(r)
    seq_len = 5
    TC_test_obj = 'h'
    act_TC = TCtoe.get_activations(np.array([h_t]))
    TCtoe.testObjective.setParamters(sm.model, TC_test_obj, layer,
                                     int(symbols_TC), seq_len, indices,
                                     mean_TC, std_TC)

    # visualize internal structure information
    # act_TC = np.squeeze(act_TC)[-8:]
    # act_SC = np.squeeze(act_SC)[-8:]
    # act_TC = Z_ScoreNormalization(act_TC, mean_TC, std_TC)
    # act_BC = np.sum(f_t, axis=1) / float(f_t.shape[1])
    # act_BC = act_BC[-8:]
    # act_SC = (act_SC - min_SC) / (max_SC - min_SC)
    #
    # plt.figure(1)
    # plot_x = np.arange(len(act_TC))
    # plt.plot(plot_x, act_TC)
    # plt.ylabel('$\\xi_t^{h}$', fontsize=14)
    # plt.xticks(fontsize=14)
    # plt.yticks(fontsize=14)
    # plt.figure(2)
    # plot_x = np.arange(len(act_BC))
    # plt.bar(plot_x, act_BC)
    # plt.ylabel('$\\xi_t^{f, avg}$', fontsize=14)
    # plt.xticks(fontsize=14)
    # plt.yticks(fontsize=14)
    # plt.figure(3)
    # plot_x = np.arange(len(act_SC))
    # plt.bar(plot_x, act_SC)
    # plt.xlabel('Input', fontsize=14)
    # plt.ylabel('$\Delta\\xi_t^{h}$', fontsize=14)
    # plt.xticks(fontsize=14)
    # plt.yticks(fontsize=14)
    # plt.show()

    text_seeds = [sm.fromIDToText(item) for item in X_seeds]
    y_seeds = sm.getOutputResult(X_seeds)
    X_test = []
    r_t = 400 // len(X_seeds)

    while sm.numSamples < int(TestCaseNum):

        # generate test cases
        unique_test = np.repeat(np.arange(len(X_seeds)), r_t, axis=0)
        y_test1 = np.repeat(y_seeds, r_t, axis=0)

        alpha = random.uniform(0.01, oracleRadius)
        aug_text = []
        for text in text_seeds:
            out = eda(text,
                      sm.numSamples,
                      alpha_sr=alpha,
                      alpha_ri=alpha,
                      alpha_rs=alpha,
                      p_rd=alpha,
                      num_aug=r_t)
            aug_text = aug_text + out

        tmp = [sm.fromTextToID(text) for text in aug_text]
        test2 = sm.pre_processing_x(tmp)

        if sm.numSamples > 0 and Mutation == 'genetic':
            y_test1 = np.concatenate(
                (y_test1, np.array([sc_test_1]), np.array([bc_test_1]),
                 np.array([tc_test_1])))
            test2 = np.concatenate(
                (test2, np.array([sc_test_2]), np.array([bc_test_2]),
                 np.array([tc_test_2])))
            unique_test = np.concatenate(
                (unique_test, np.array([seed_id_sc]), np.array([seed_id_bc]),
                 np.array([seed_id_tc])))

        y_test2 = sm.getOutputResult(test2)
        # display statistics of adv.
        sm.displayInfo(y_test1, y_test2, alpha, unique_test)

        # calculate the hidden state
        h_test = sm.cal_hidden_keras(test2, layer)

        # update the coverage
        # update NC coverage
        nctoe.update_features(test2)
        # update KMNC coverage
        kmnctoe.update_features(test2)
        # update NBC coverage
        nbctoe.update_features(test2)
        # update SNAC coverage
        snactoe.update_features(test2)
        # update SC coverage
        SCtoe.update_features(h_test, len(X_test))
        # update BC coverage
        BCtoe.update_features(h_test, len(X_test))
        # update TC coverage
        TCtoe.update_features(h_test, len(X_test))

        X_test = X_test + test2.tolist()

        if Mutation == 'genetic':
            num_generation = 10
            sc_test_record = SCtoe.testObjective.test_record
            bc_test_record = BCtoe.testObjective.test_record
            tc_test_record = TCtoe.testObjective.test_record

            if len(sc_test_record) != 0:
                print('boost coverage for SC')
                sc_feature, sc_cov_fit = random.choice(
                    list(sc_test_record.items()))
                seed_id_sc = sc_cov_fit[0] % len(X_seeds)
                sc_test_1 = y_seeds[seed_id_sc]
                # boost coverage with GA
                sc_test_2 = getNextInputByGA(sm, SCtoe, sc_feature,
                                             np.array(X_test[sc_cov_fit[0]]),
                                             num_generation, sm.numSamples)
                print('\n')

            if len(bc_test_record) != 0:
                print('boost coverage for BC')
                bc_feature, bc_cov_fit = random.choice(
                    list(bc_test_record.items()))
                seed_id_bc = bc_cov_fit[0] % len(X_seeds)
                bc_test_1 = y_seeds[seed_id_bc]
                # boost coverage with GA
                bc_test_2 = getNextInputByGA(sm, BCtoe, bc_feature,
                                             np.array(X_test[bc_cov_fit[0]]),
                                             num_generation, sm.numSamples)
                print('\n')

            if len(tc_test_record) != 0:
                print('boost coverage for TC')
                tc_feature, tc_cov_fit = random.choice(
                    list(tc_test_record.items()))
                seed_id_tc = tc_cov_fit[1] % len(X_seeds)
                tc_test_1 = y_seeds[seed_id_tc]
                # boost coverage with GA
                tc_test_2 = getNextInputByGA(sm, TCtoe, tc_feature,
                                             np.array(X_test[tc_cov_fit[1]]),
                                             num_generation, sm.numSamples)

        # write information to file
        writeInfo(r, sm.numSamples, sm.numAdv, sm.perturbations,
                  nctoe.coverage, kmnctoe.coverage, nbctoe.coverage,
                  snactoe.coverage, SCtoe.coverage, BCtoe.coverage,
                  TCtoe.coverage, len(sm.unique_adv))

    print("statistics: \n")
    nctoe.displayCoverage()
    kmnctoe.displayCoverage()
    nbctoe.displayCoverage()
    snactoe.displayCoverage()
    SCtoe.displayCoverage()
    BCtoe.displayCoverage()
    TCtoe.displayCoverage()
    print('unique adv.', len(sm.unique_adv))
    sm.displaySuccessRate()
Example 24
    np.random.seed(example_id)  # fragment: tail of a loop whose start is not included in this snippet
    Context.append(knowledgebase.Context.values[example_id])
    Utterance.append(np.random.choice(tmp_distractors, 1)[0])
    Labels.append(0)

train_df = pd.DataFrame({
    'Context': Context,
    'Utterance': Utterance,
    'Label': Labels
})

for i in range(len(train_df)):
    try:
        tmp_df = pd.DataFrame({
            'Context':
            eda(train_df.Context.values[i], .2, .2, .2, .2, 15),
            'Utterance':
            list(np.repeat(train_df.Utterance.values[i], 16)),
            'Label':
            list(np.repeat(train_df.Label.values[i], 16))
        })
        train_df = train_df.append(tmp_df)
    except Exception:
        pass

train_df.reset_index(level=None, drop=True, inplace=True)

Context, Utterance, Labels = [], [], []  # the loop below appends to Utterance
KBanswers = list(np.unique(knowledgebase.Utterance.values))
for example_id in range(len(validation_dialogues)):
    exampleWOzAnswers = list(validation_dialogues.iloc[example_id, 7:].values)
    id_to_exclude = KBanswers.index(knowledgebase.Utterance.values[example_id])
    tmp_distractors = [
        KBanswers[i]
        for i in np.arange(len(KBanswers))[np.isin(range(len(KBanswers)), id_to_exclude, invert=True)]
    ]
    np.random.seed(example_id)
    Context.append(knowledgebase.Context.values[example_id])
    Utterance.append(np.random.choice(tmp_distractors, 1)[0])
    Labels.append(0)

train_df = pd.DataFrame({'Context':Context, 'Utterance':Utterance, 'Label':Labels})

for i in range(len(train_df)):
    try:
        tmp_df = pd.DataFrame({
            'Context': eda(train_df.Context.values[i], .2, .2, .2, .2, 15),
            'Utterance': list(np.repeat(train_df.Utterance.values[i], 16)),
            'Label': list(np.repeat(train_df.Label.values[i], 16))
            })
        train_df = train_df.append(tmp_df)
    except Exception:
        pass

train_df.reset_index(level=None, drop=True, inplace=True)
###################
ar_msk = np.random.rand(len(train_df)) < 2/3
train_df[ar_msk].to_csv('data/data.csv', encoding='utf-8', index=False)
train_df[~ar_msk].to_csv('data/data_test.csv', encoding='utf-8', index=False)
###-clear space-###
del train_df, ar_msk
###################