Example 1
def experiment_RQ5(mode="WPDP", datatype="ast", im=False):
    dataset, dataset_list = dataset_generation(mode=mode, datatype="tokens")
    res = {}
    count = 0
    vocab = WordVocab.load_vocab("../model_files/vocab.txt")
    tokenEmb, posEmb = load_bert_weight(max_feature)
    for project_name in dataset_list:
        res_in = {}
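        # sweep candidate hidden sizes; each value is passed to bert_lstm as hidden_size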
        for num in [8, 16, 32, 48, 64, 128, 256]:
            if mode == "WPDP":
                pre_project_name = dataset[project_name][0]
                train_seq_feat, train_stat_feat, train_y = dataset[project_name][1][0], dataset[project_name][1][1], \
                                                           dataset[project_name][1][2]
                target_seq_feat, target_stat_feat, target_y = dataset[project_name][2][0], dataset[project_name][2][1], \
                                                              dataset[project_name][2][2]
            else:
                train_seq_feat, train_stat_feat, train_y = dataset[project_name][0][0], dataset[project_name][0][1], \
                                                           dataset[project_name][0][2]
                target_seq_feat, target_stat_feat, target_y = dataset[project_name][1][0], dataset[project_name][1][1], \
                                                              dataset[project_name][1][2]

            train_seq_feat, train_stat_feat, train_y = data_oversampling(
                train_seq_feat, train_stat_feat, train_y)
            max_len = 512
            print("processing begin..")
            train_seq_feat = train_seq_feat.apply(
                lambda x: tokenize(x, max_len, max_feature, vocab))
            target_seq_feat = target_seq_feat.apply(
                lambda x: tokenize(x, max_len, max_feature, vocab))
            print("processing finished")
            train_seq_feat = np.array(list(train_seq_feat))
            target_seq_feat = np.array(list(target_seq_feat))
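            # replicate the position-embedding matrix once per sample so it lines up with the token sequences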
            train_posEmb = np.expand_dims(posEmb,
                                          0).repeat(train_seq_feat.shape[0],
                                                    axis=0)
            target_posEmbd = np.expand_dims(posEmb,
                                            0).repeat(target_seq_feat.shape[0],
                                                      axis=0)
            pred = bert_lstm(train_seq_feat,
                             train_y,
                             target_seq_feat,
                             target_y,
                             tokenEmb,
                             train_posEmb,
                             target_posEmbd,
                             max_feature,
                             hidden_size=num)
            f1 = f1_score(target_y, pred)
            precision = precision_score(target_y, pred)
            recall = recall_score(target_y, pred)
            print([f1, precision, recall], num)
            project_prefix = project_name.split("-")[0]
            res_in[project_prefix + str(num)] = [f1, precision, recall]
        print(count)
        count += 1
        res[project_name] = res_in
    with open("../data/experiment_results/RQ5/hidden.pkl", "wb") as f:
        pickle.dump(res, f)
Example 2
    def train(self):

        print("Loading vocab", self.vocab_path)
        vocab = WordVocab.load_vocab(self.vocab_path)
        print("vocab Size: ", len(vocab))

        print("\nLoading Train Dataset")
        logkey_train, logkey_valid, time_train, time_valid = generate_train_valid(
            self.output_path + "train",
            window_size=self.window_size,
            adaptive_window=self.adaptive_window,
            valid_size=self.valid_ratio,
            sample_ratio=self.sample_ratio,
            scale=self.scale,
            scale_path=self.scale_path,
            seq_len=self.seq_len,
            min_len=self.min_len)

        train_dataset = LogDataset(logkey_train,time_train, vocab, seq_len=self.seq_len,
                                    corpus_lines=self.corpus_lines, on_memory=self.on_memory, mask_ratio=self.mask_ratio)

        print("\nLoading valid Dataset")
        # valid_dataset = generate_train_valid(self.output_path + "train", window_size=self.window_size,
        #                              adaptive_window=self.adaptive_window,
        #                              sample_ratio=self.valid_ratio)

        valid_dataset = LogDataset(logkey_valid, time_valid, vocab, seq_len=self.seq_len, on_memory=self.on_memory, mask_ratio=self.mask_ratio)

        print("Creating Dataloader")
        self.train_data_loader = DataLoader(train_dataset, batch_size=self.batch_size, num_workers=self.num_workers,
                                      collate_fn=train_dataset.collate_fn, drop_last=True)
        self.valid_data_loader = DataLoader(valid_dataset, batch_size=self.batch_size, num_workers=self.num_workers,
                                       collate_fn=train_dataset.collate_fn, drop_last=True)
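        # release the local references to the datasets and raw sequences before training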
        del train_dataset
        del valid_dataset
        del logkey_train
        del logkey_valid
        del time_train
        del time_valid
        gc.collect()

        print("Building BERT model")
        bert = BERT(len(vocab), max_len=self.max_len, hidden=self.hidden, n_layers=self.layers, attn_heads=self.attn_heads,
                    is_logkey=self.is_logkey, is_time=self.is_time)

        print("Creating BERT Trainer")
        self.trainer = BERTTrainer(bert, len(vocab), train_dataloader=self.train_data_loader, valid_dataloader=self.valid_data_loader,
                              lr=self.lr, betas=(self.adam_beta1, self.adam_beta2), weight_decay=self.adam_weight_decay,
                              with_cuda=self.with_cuda, cuda_devices=self.cuda_devices, log_freq=self.log_freq,
                              is_logkey=self.is_logkey, is_time=self.is_time,
                              hypersphere_loss=self.hypersphere_loss)

        self.start_iteration(surfix_log="log2")

        self.plot_train_valid_loss("_log2")
Example 3
def test_custom_dataset():
    vocab = WordVocab.load_vocab(args.vocab_path)

    cd = CustomBERTDataset(corpus_path=args.corpus_path,
                           vocab=vocab,
                           seq_len=args.seq_len,
                           encoding=args.encoding,
                           corpus_lines=args.corpus_lines,
                           on_memory=args.on_memory)

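    # quick smoke test: draw one line and apply the dataset's random-word masking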
    t1 = cd.get_random_line()
    t1_random, t1_label = cd.random_word(t1)
    print(t1_random)
    print(t1_label)

    print(cd[0])
Example 4
    def __init__(self, device=None, jit=False):
        self.device = device
        self.jit = jit
        args = parse_args(args=[
            '--train_dataset', 'data/corpus.small',
            '--test_dataset', 'data/corpus.small',
            '--vocab_path', 'data/vocab.small',
            '--output_path', 'bert.model',
        ]) # Avoid reading sys.argv here
        args.with_cuda = self.device == 'cuda'
        args.script = self.jit
        print("Loading Vocab", args.vocab_path)
        vocab = WordVocab.load_vocab(args.vocab_path)
        print("Vocab Size: ", len(vocab))

        train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
                                    corpus_lines=args.corpus_lines, on_memory=args.on_memory)
        test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \
            if args.test_dataset is not None else None

        print("Creating Dataloader")
        train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
        test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
            if test_dataset is not None else None

        print("Building BERT model")
        bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads)

        if args.script:
            print("Scripting BERT model")
            bert = torch.jit.script(bert)

        self.trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                                   lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
                                   with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq, debug=args.debug)

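        # keep one batch of bert_input/segment_label tensors on the target device as example inputs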
        example_batch = next(iter(train_data_loader))
        self.example_inputs = example_batch['bert_input'].to(self.device), example_batch['segment_label'].to(self.device)
Example 5
parser.add_argument("-e", "--epochs", type=int, default=10)
parser.add_argument("-w", "--num_workers", type=int, default=5)
parser.add_argument("--corpus_lines", type=int, default=None)

parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--adam_weight_decay", type=float, default=0.01)
parser.add_argument("--adam_beta1", type=float, default=0.9)
parser.add_argument("--adam_beta2", type=float, default=0.999)
parser.add_argument("--log_freq", type=int, default=10)

parser.add_argument("-c", "--cuda", type=bool, default=True)

args = parser.parse_args()

print("Loading Vocab", args.vocab_path)
vocab = WordVocab.load_vocab(args.vocab_path)
print("Vocab Size: ", len(vocab))

print("Loading Train Dataset", args.train_dataset)
train_dataset = BERTDataset(args.train_dataset,
                            vocab,
                            seq_len=args.seq_len,
                            corpus_lines=args.corpus_lines)

print("Loading Test Dataset", args.test_dataset)
test_dataset = BERTDataset(
    args.test_dataset, vocab,
    seq_len=args.seq_len) if args.test_dataset is not None else None

print("Creating Dataloader")
train_data_loader = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               num_workers=args.num_workers)
Example 6
    def predict(self):
        model = torch.load(self.model_path)
        model.to(self.device)
        model.eval()
        print('model_path: {}'.format(self.model_path))

        start_time = time.time()
        vocab = WordVocab.load_vocab(self.vocab_path)

        scale = None
        error_dict = None
        if self.is_time:
            with open(self.scale_path, "rb") as f:
                scale = pickle.load(f)

            with open(self.model_dir + "error_dict.pkl", 'rb') as f:
                error_dict = pickle.load(f)

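        # centre and radius saved during training when the hypersphere loss is enabled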
        if self.hypersphere_loss:
            center_dict = torch.load(self.model_dir + "best_center.pt")
            self.center = center_dict["center"]
            self.radius = center_dict["radius"]
            # self.center = self.center.view(1,-1)

        print("test normal predicting")
        test_normal_results, test_normal_errors = self.helper(
            model, self.output_dir, "test_normal", vocab, scale, error_dict)

        print("test abnormal predicting")
        test_abnormal_results, test_abnormal_errors = self.helper(
            model, self.output_dir, "test_abnormal", vocab, scale, error_dict)

        print("Saving test normal results")
        with open(self.model_dir + "test_normal_results", "wb") as f:
            pickle.dump(test_normal_results, f)

        print("Saving test abnormal results")
        with open(self.model_dir + "test_abnormal_results", "wb") as f:
            pickle.dump(test_abnormal_results, f)

        print("Saving test normal errors")
        with open(self.model_dir + "test_normal_errors.pkl", "wb") as f:
            pickle.dump(test_normal_errors, f)

        print("Saving test abnormal errors")
        with open(self.model_dir + "test_abnormal_errors.pkl", "wb") as f:
            pickle.dump(test_abnormal_errors, f)

        params = {
            "is_logkey": self.is_logkey,
            "is_time": self.is_time,
            "hypersphere_loss": self.hypersphere_loss,
            "hypersphere_loss_test": self.hypersphere_loss_test
        }
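        # search candidate per-logkey and per-sequence thresholds and report the resulting precision/recall/F1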
        best_th, best_seq_th, FP, TP, TN, FN, P, R, F1 = find_best_threshold(
            test_normal_results,
            test_abnormal_results,
            params=params,
            th_range=np.arange(10),
            seq_range=np.arange(0, 1, 0.1))

        print("best threshold: {}, best threshold ratio: {}".format(
            best_th, best_seq_th))
        print("TP: {}, TN: {}, FP: {}, FN: {}".format(TP, TN, FP, FN))
        print(
            'Precision: {:.2f}%, Recall: {:.2f}%, F1-measure: {:.2f}%'.format(
                P, R, F1))
        elapsed_time = time.time() - start_time
        print('elapsed_time: {}'.format(elapsed_time))
Example 7
    train_parser.set_defaults(mode='train')

    predict_parser = subparsers.add_parser('predict')
    predict_parser.set_defaults(mode='predict')
    predict_parser.add_argument("-m", "--mean", type=float, default=0)
    predict_parser.add_argument("-s", "--std", type=float, default=1)

    vocab_parser = subparsers.add_parser('vocab')
    vocab_parser.set_defaults(mode='vocab')
    vocab_parser.add_argument("-s", "--vocab_size", type=int, default=None)
    vocab_parser.add_argument("-e", "--encoding", type=str, default="utf-8")
    vocab_parser.add_argument("-m", "--min_freq", type=int, default=1)

    args = parser.parse_args()
    print("arguments", args)
    # Trainer(options).train()
    # Predictor(options).predict()

    if args.mode == 'train':
        Trainer(options).train()

    elif args.mode == 'predict':
        Predictor(options).predict()

    elif args.mode == 'vocab':
        with open(options["train_vocab"], 'r') as f:
            logs = f.readlines()
        vocab = WordVocab(logs)
        print("vocab_size", len(vocab))
        vocab.save_vocab(options["vocab_path"])
Example 8
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--train_dataset",
                        type=str,
                        default='../data/data.bert',
                        help="train dataset for train bert")
    parser.add_argument("-t",
                        "--test_dataset",
                        type=str,
                        default=None,
                        help="test set to evaluate the trained model")
    parser.add_argument("-v",
                        "--vocab_path",
                        type=str,
                        default='../data/bert.vb',
                        help="built vocab model path with bert-vocab")
    parser.add_argument("-o",
                        "--output_path",
                        type=str,
                        default='../data/bert.model',
                        help="ex)output/bert.model")

    parser.add_argument("-hs",
                        "--hidden",
                        type=int,
                        default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l",
                        "--layers",
                        type=int,
                        default=8,
                        help="number of layers")
    parser.add_argument("-a",
                        "--attn_heads",
                        type=int,
                        default=8,
                        help="number of attention heads")
    parser.add_argument("-s",
                        "--seq_len",
                        type=int,
                        default=20,
                        help="maximum sequence len")

    parser.add_argument("-b",
                        "--batch_size",
                        type=int,
                        default=64,
                        help="batch size")
    parser.add_argument("-e",
                        "--epochs",
                        type=int,
                        default=10,
                        help="number of epochs")
    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=5,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda",
                        type=bool,
                        default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--log_freq",
                        type=int,
                        default=10,
                        help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines",
                        type=int,
                        default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory",
                        type=bool,
                        default=True,
                        help="Loading on memory: true or false")

    parser.add_argument("--lr",
                        type=float,
                        default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam second beta value")

    args = parser.parse_args()

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    # tmp_ = train_dataset.__getitem__(20)
    # print(tmp_['bert_input'])
    #
    # x = tmp_['bert_input'].unsqueeze(0)

    # mask_ = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
    #
    # print(mask_.size())
    # print(mask_)

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None
    print("Building BERT model")
    bert = BERT(len(vocab),
                hidden=args.hidden,
                n_layers=args.layers,
                attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert,
                          len(vocab),
                          train_dataloader=train_data_loader,
                          test_dataloader=test_data_loader,
                          lr=args.lr,
                          betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq)
    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            trainer.test(epoch)
Example 9
import argparse

from bert_pytorch.dataset import WordVocab

parser = argparse.ArgumentParser()
parser.add_argument("-c", "--corpus_path", required=True, type=str)
parser.add_argument("-o", "--output_path", required=True, type=str)
parser.add_argument("-s", "--vocab_size", type=int, default=None)
parser.add_argument("-e", "--encoding", type=str, default="utf-8")
parser.add_argument("-m", "--min_freq", type=int, default=1)
args = parser.parse_args()

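# build the vocabulary from the raw corpus and save it for later WordVocab.load_vocab calls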
with open(args.corpus_path, "r", encoding=args.encoding) as f:
    vocab = WordVocab(f, max_size=args.vocab_size, min_freq=args.min_freq)

print("VOCAB SIZE:", len(vocab))
vocab.save_vocab(args.output_path)
Example 10
                        required=True)
    parser.add_argument("-d", "--sop_dataset_path", type=str, required=True)
    parser.add_argument("-t", "--train_and_validate", type=str, default="True")
    parser.add_argument("-e", "--epochs", type=int, default=10)
    args = parser.parse_args()

    # file paths
    vocab_path = args.vocab_path
    albert_model_path = args.bert_model_path
    clf_model_path = args.sop_classifier_model_path
    sop_dataset_path = args.sop_dataset_path
    train_and_validate = args.train_and_validate.lower() == 'true'

    # load vocabulary
    vocab = WordVocab.load_vocab(vocab_path)

    # load the pretrained ALBERT model
    bert = ALBERT(vocab_size=len(vocab),
                  embed_size=128,
                  hidden=256,
                  n_layers=8,
                  attn_heads=8,
                  seq_len=64)
    bert = torch.load(albert_model_path)

    # parameters
    num_class = 37
    batch_size = 64
    seq_len = 64
    epochs = args.epochs  # honor the --epochs command-line argument
Example 11
    predict_parser = subparsers.add_parser('predict')
    predict_parser.set_defaults(mode='predict')
    predict_parser.add_argument("-m", "--mean", type=float, default=0)
    predict_parser.add_argument("-s", "--std", type=float, default=1)

    vocab_parser = subparsers.add_parser('vocab')
    vocab_parser.set_defaults(mode='vocab')
    vocab_parser.add_argument("-s", "--vocab_size", type=int, default=None)
    vocab_parser.add_argument("-e", "--encoding", type=str, default="utf-8")
    vocab_parser.add_argument("-m", "--min_freq", type=int, default=1)

    args = parser.parse_args()
    print("arguments", args)

    if args.mode == 'train':
        Trainer(options).train()

    elif args.mode == 'predict':
        Predictor(options).predict()

    elif args.mode == 'vocab':
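        # build a WordVocab from the raw training file and save it to options["vocab_path"]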
        with open(options["train_vocab"], "r") as f:
            texts = f.readlines()
        vocab = WordVocab(texts,
                          max_size=args.vocab_size,
                          min_freq=args.min_freq)
        print("VOCAB SIZE:", len(vocab))
        print("save vocab in", options["vocab_path"])
        vocab.save_vocab(options["vocab_path"])
Example 12
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--train_dataset",
                        required=True,
                        type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t",
                        "--valid_dataset",
                        type=str,
                        default=None,
                        help="validation set to evaluate the trained model")
    parser.add_argument("-v",
                        "--vocab_path",
                        required=True,
                        type=str,
                        help="built vocab model path with bert-vocab")
    parser.add_argument("-o",
                        "--output_path",
                        required=True,
                        type=str,
                        help="ex)output/bert.model")

    parser.add_argument("-hs",
                        "--hidden",
                        type=int,
                        default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l",
                        "--layers",
                        type=int,
                        default=8,
                        help="number of layers")
    parser.add_argument("-a",
                        "--attn_heads",
                        type=int,
                        default=8,
                        help="number of attention heads")
    parser.add_argument("-s",
                        "--seq_len",
                        type=int,
                        default=20,
                        help="maximum sequence len")

    parser.add_argument("-b",
                        "--batch_size",
                        type=int,
                        default=64,
                        help="batch size")
    parser.add_argument("-e",
                        "--epochs",
                        type=int,
                        default=10,
                        help="number of epochs")
    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=5,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda",
                        type=bool,
                        default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--log_freq",
                        type=int,
                        default=10,
                        help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines",
                        type=int,
                        default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory",
                        type=bool,
                        default=True,
                        help="Loading on memory: true or false")

    parser.add_argument("--lr",
                        type=float,
                        default=1e-3,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam second beta value")

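    # arguments are hard-coded here (e.g. for notebook runs) instead of being read from sys.argv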
    args = parser.parse_args(
        '-c ../data/corpus.small -t ../data/valid.small -v ../data/vocab.small -o ../output/bert.model'
        .split())

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading valid Dataset", args.valid_dataset)
    valid_dataset = BERTDataset(args.valid_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \
        if args.valid_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    valid_data_loader = DataLoader(valid_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if valid_dataset is not None else None
    # valid_data_loader = train_data_loader[:5]

    print("Building BERT model")
    bert = BERT(len(vocab),
                hidden=args.hidden,
                n_layers=args.layers,
                attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert,
                          len(vocab),
                          train_dataloader=train_data_loader,
                          valid_dataloader=valid_data_loader,
                          lr=args.lr,
                          betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq)

    print("Training Start")
    best_loss = float('inf')
    for epoch in range(args.epochs):
        trainer.train(epoch)
        avg_loss = trainer.valid(epoch)
        if avg_loss < best_loss:
            best_loss = avg_loss
            trainer.save(epoch, args.output_path)
Example 13
def main():
    opt.use_bert = False
    opt.build_own_vocab = True
    opt.useAreadyVocab = True
    #opt.src_seq_length_trunc = 510

    if opt.use_bert:
        bert_model = 'bert-base-uncased'
        opt.tokenizer = BertTokenizer.from_pretrained(bert_model)

    if opt.dataset_name == 'kp20k':
        src_fields = ['title', 'abstract']
        trg_fields = ['keyword']
    elif opt.dataset_name == 'stackexchange':
        src_fields = ['title', 'question']
        trg_fields = ['tags']
    else:
        raise Exception('Unsupported dataset name=%s' % opt.dataset_name)

    print("Loading training/validation/test data...")
    tokenized_train_pairs = pykp.io.load_src_trgs_pairs(
        source_json_path=opt.source_train_file,
        dataset_name=opt.dataset_name,
        src_fields=src_fields,
        trg_fields=trg_fields,
        opt=opt,
        valid_check=True)

    tokenized_valid_pairs = pykp.io.load_src_trgs_pairs(
        source_json_path=opt.source_valid_file,
        dataset_name=opt.dataset_name,
        src_fields=src_fields,
        trg_fields=trg_fields,
        opt=opt,
        valid_check=False)

    tokenized_test_pairs = pykp.io.load_src_trgs_pairs(
        source_json_path=opt.source_test_file,
        dataset_name=opt.dataset_name,
        src_fields=src_fields,
        trg_fields=trg_fields,
        opt=opt,
        valid_check=False)

    if opt.use_bert and not opt.build_own_vocab:
        print("Loading BERT Vocab...")
        word2id = opt.tokenizer.vocab
        id2word = opt.tokenizer.ids_to_tokens
        vocab = None
        print('Vocab size = %d' % len(word2id))
    elif opt.useAreadyVocab:
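        # reuse a vocabulary that was built earlier instead of rebuilding it from the corpus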
        vocab = WordVocab.load_vocab("data4/vocab.30")
        word2id = vocab.stoi
        id2word = vocab.itos
        vocab = vocab.freqs
    else:
        print("Building Vocab...")
        word2id, id2word, vocab = pykp.io.build_vocab(tokenized_train_pairs,
                                                      opt)
        print('Vocab size = %d' % len(vocab))

    print("Dumping dict to disk")
    opt.vocab_path = os.path.join(opt.subset_output_path,
                                  opt.dataset_name + '.vocab.pt')
    torch.save([word2id, id2word, vocab], opt.vocab_path)
    opt.vocab_path = os.path.join(opt.output_path,
                                  opt.dataset_name + '.vocab.pt')
    torch.save([word2id, id2word, vocab], opt.vocab_path)

    print("Exporting a small dataset to %s (for debugging), "
          "size of train/valid/test is 20000" % opt.subset_output_path)
    pykp.io.process_and_export_dataset(tokenized_train_pairs[:20000],
                                       word2id,
                                       id2word,
                                       opt,
                                       opt.subset_output_path,
                                       dataset_name=opt.dataset_name,
                                       data_type='train')

    pykp.io.process_and_export_dataset(tokenized_valid_pairs,
                                       word2id,
                                       id2word,
                                       opt,
                                       opt.subset_output_path,
                                       dataset_name=opt.dataset_name,
                                       data_type='valid')

    pykp.io.process_and_export_dataset(tokenized_test_pairs,
                                       word2id,
                                       id2word,
                                       opt,
                                       opt.subset_output_path,
                                       dataset_name=opt.dataset_name,
                                       data_type='test')

    print("Exporting complete dataset to %s" % opt.output_path)
    pykp.io.process_and_export_dataset(tokenized_train_pairs,
                                       word2id,
                                       id2word,
                                       opt,
                                       opt.output_path,
                                       dataset_name=opt.dataset_name,
                                       data_type='train')

    pykp.io.process_and_export_dataset(tokenized_valid_pairs,
                                       word2id,
                                       id2word,
                                       opt,
                                       opt.output_path,
                                       dataset_name=opt.dataset_name,
                                       data_type='valid')

    pykp.io.process_and_export_dataset(tokenized_test_pairs,
                                       word2id,
                                       id2word,
                                       opt,
                                       opt.output_path,
                                       dataset_name=opt.dataset_name,
                                       data_type='test')
Example 14
def experiment_RQ3(mode="WPDP",
                   datatype="tokens",
                   embedding="word2vec",
                   model="textcnn"):
    dataset, dataset_list = dataset_generation(mode=mode, datatype=datatype)
    res = {}
    vocab = WordVocab.load_vocab("./pretrained_models/bert/vocab.txt")
    tokenEmb, posEmb = load_bert_weight(max_feature)
    for project_name in dataset_list:
        if mode == "WPDP":
            pre_project_name = dataset[project_name][0]
            train_seq_feat, train_stat_feat, train_y = dataset[project_name][1][0], dataset[project_name][1][1], \
                                                       dataset[project_name][1][2]
            target_seq_feat, target_stat_feat, target_y = dataset[project_name][2][0], dataset[project_name][2][1], \
                                                          dataset[project_name][2][2]
        else:
            train_seq_feat, train_stat_feat, train_y = dataset[project_name][0][0], dataset[project_name][0][1], \
                                                       dataset[project_name][0][2]
            target_seq_feat, target_stat_feat, target_y = dataset[project_name][1][0], dataset[project_name][1][1], \
                                                          dataset[project_name][1][2]
        train_seq_feat, train_stat_feat, train_y = data_oversampling(
            train_seq_feat, train_stat_feat, train_y)
        if mode == "WPDP":
            new_data, train_stat_feat = gen_eda(train_seq_feat.tolist(),
                                                train_stat_feat,
                                                train_y.tolist(), 0.1, 3)
            # new_data, statics_feat = gen_eda(seq_feat.tolist(), statics_feat, y.tolist(), 0.1, i)
            train_seq_feat = new_data["seq"]
            train_y = new_data["bug"]

        if embedding == "word2vec":

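            # word2vec baseline: fit a Keras tokenizer and build an embedding matrix from the pretrained index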
            tokenizer = Tokenizer(num_words=max_feature, lower=False)
            tokenizer.fit_on_texts(
                list(train_seq_feat) + list(target_seq_feat))
            word_index = tokenizer.word_index

            train_seq_feat = tokenizer.texts_to_sequences(list(train_seq_feat))
            train_seq_feat = pad_sequences(train_seq_feat, maxlen=maxlen)

            target_seq_feat = tokenizer.texts_to_sequences(
                list(target_seq_feat))
            target_seq_feat = pad_sequences(target_seq_feat, maxlen=maxlen)

            with open("./data/embedding_index.pkl", "rb") as f:
                embedding_index = pickle.load(f)
            embedding_matrix = build_matrix(word_index, embedding_index)
            if model == "textcnn":
                # baseline: textCNN for classification
                f1, precision, recall = textcnn(train_x=train_seq_feat,
                                                train_y=train_y,
                                                vocab=tokenizer.index_word,
                                                val_x=target_seq_feat,
                                                val_y=target_y,
                                                embedding=embedding_matrix,
                                                maxlen=maxlen,
                                                mode=embedding,
                                                trainable=False)
            else:

                f1, precision, recall = bilstm_att_model(
                    embedding,
                    train_seq_feat,
                    train_y,
                    target_seq_feat,
                    target_y,
                    tokenizer.word_index,
                    64,
                    2,
                    embedding=embedding_matrix,
                    trainable=False)
        else:

            #del new_data
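            # BERT features: tokenize with the pretrained vocab and reuse the BERT token/position embeddings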
            print("processing begin...")
            train_seq_feat = train_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            target_seq_feat = target_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            print("processing finished")
            train_seq_feat = np.array(list(train_seq_feat))
            target_seq_feat = np.array(list(target_seq_feat))
            train_posEmb = np.expand_dims(posEmb,
                                          0).repeat(train_seq_feat.shape[0],
                                                    axis=0)
            target_posEmbd = np.expand_dims(posEmb,
                                            0).repeat(target_seq_feat.shape[0],
                                                      axis=0)
            if model == "textcnn":
                f1, precision, recall = textcnn(train_x=train_seq_feat,
                                                train_y=train_y,
                                                vocab=max_feature,
                                                val_x=target_seq_feat,
                                                val_y=target_y,
                                                embedding=None,
                                                maxlen=maxlen,
                                                tokenEmb=tokenEmb,
                                                train_posEmb=train_posEmb,
                                                target_posEmb=target_posEmbd,
                                                mode="bert",
                                                trainable=False)
            else:
                f1, precision, recall = bilstm_att_model("bert",
                                                         train_seq_feat,
                                                         train_y,
                                                         target_seq_feat,
                                                         target_y,
                                                         max_feature,
                                                         64,
                                                         2,
                                                         None,
                                                         tokenEmb,
                                                         train_posEmb,
                                                         target_posEmbd,
                                                         trainable=False)
            print([f1, precision, recall])
        res[project_name] = [
            round(f1, 2), round(precision, 2),
            round(recall, 2)
        ]
    df = pd.DataFrame(res)
    df.to_csv("./data/experiment_results/RQ3/" + embedding + "_" + mode + "_" +
              model + ".csv",
              index=False)
Example 15
def experiment_RQ2(mode="WPDP",
                   feature="semantics",
                   classifier="lr",
                   datatype="tokens"):
    """
    Args:
        mode: "WPDP" or "CPDP"
        feature: "semantics", "tokens" or "statistical"
        classifier: "lr" (logistic regression); other values select the
            word2vec baselines ("textcnn" or the BiLSTM-attention model)
        datatype: representation of the source data, e.g. "tokens"
    """
    dataset, dataset_list = dataset_generation(mode, datatype)
    vocab = WordVocab.load_vocab("./pretrained_models/bert/vocab.txt")
    res = {}
    tokenEmb, posEmb = load_bert_weight(max_feature)
    for project_name in dataset_list:
        if mode == "WPDP":

            pre_project_name = dataset[project_name][0][0]
            train_seq_feat, train_stat_feat, train_y = dataset[project_name][1][0], dataset[project_name][1][1], \
                                                       dataset[project_name][1][2]
            target_seq_feat, target_stat_feat, target_y = dataset[project_name][2][0], dataset[project_name][2][1], \
                                                          dataset[project_name][2][2]
        else:
            train_seq_feat, train_stat_feat, train_y = dataset[project_name][0][0], dataset[project_name][0][1], \
                                                       dataset[project_name][0][2]
            target_seq_feat, target_stat_feat, target_y = dataset[project_name][1][0], dataset[project_name][1][1], \
                                                          dataset[project_name][1][2]

        train_seq_feat, train_stat_feat, train_y = data_oversampling(
            train_seq_feat, train_stat_feat, train_y)
        if mode == "WPDP":
            new_data, train_stat_feat = gen_eda(train_seq_feat.tolist(),
                                                train_stat_feat,
                                                train_y.tolist(), 0.1, 3)
            # new_data, statics_feat = gen_eda(seq_feat.tolist(), statics_feat, y.tolist(), 0.1, i)
            train_seq_feat = new_data["seq"]
            train_y = new_data["bug"]
            del new_data
        if classifier == "lr":
            print("processing begin..")
            train_seq_feat = train_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            target_seq_feat = target_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            print("processing finished")
            train_seq_feat = np.array(list(train_seq_feat))
            target_seq_feat = np.array(list(target_seq_feat))
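            # semantic features go through the BERT+LSTM model; token/statistical features use logistic regression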
            if feature == "semantics":
                train_posEmb = np.expand_dims(posEmb, 0).repeat(
                    train_seq_feat.shape[0], axis=0)
                target_posEmbd = np.expand_dims(posEmb, 0).repeat(
                    target_seq_feat.shape[0], axis=0)
                pred = bert_lstm(train_seq_feat, train_y, target_seq_feat,
                                 target_y, tokenEmb, train_posEmb,
                                 target_posEmbd, max_feature)
            else:
                classification = LogisticRegression()
                if feature == "tokens":
                    classification.fit(train_seq_feat, train_y)
                    pred = classification.predict(target_seq_feat)
                else:
                    classification.fit(train_stat_feat, train_y)
                    pred = classification.predict(target_stat_feat)
            f1 = f1_score(target_y, pred)
            precision = precision_score(target_y, pred)
            recall = recall_score(target_y, pred)
        else:

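            # word2vec baselines: TextCNN or the BiLSTM-attention model over padded sequences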
            tokenizer = Tokenizer(num_words=max_feature, lower=False)
            tokenizer.fit_on_texts(
                list(train_seq_feat) + list(target_seq_feat))
            word_index = tokenizer.word_index
            train_seq_feat = tokenizer.texts_to_sequences(list(train_seq_feat))
            train_seq_feat = pad_sequences(train_seq_feat, maxlen=maxlen)

            target_seq_feat = tokenizer.texts_to_sequences(
                list(target_seq_feat))
            target_seq_feat = pad_sequences(target_seq_feat, maxlen=maxlen)
            # load the embedding index
            with open("./data/embedding_index.pkl", "rb") as f:
                embedding_index = pickle.load(f)
            embedding_matrix = build_matrix(word_index, embedding_index)

            if classifier == "textcnn":
                f1, precision, recall = textcnn("word2vec",
                                                train_seq_feat,
                                                train_y,
                                                word_index,
                                                target_seq_feat,
                                                target_y,
                                                embedding=embedding_matrix,
                                                maxlen=maxlen)
            else:
                f1, precision, recall = bilstm_att_model(
                    "word2vec",
                    train_seq_feat,
                    train_y,
                    target_seq_feat,
                    target_y,
                    word_index,
                    64,
                    2,
                    embedding=embedding_matrix)

        if mode == "WPDP":
            res[pre_project_name] = [
                round(f1, 2),
                round(precision, 2),
                round(recall, 2)
            ]
        else:
            res[project_name] = [
                round(f1, 2),
                round(precision, 2),
                round(recall, 2)
            ]
    df = pd.DataFrame(res)

    if classifier != "lr":
        df.to_csv("./data/experiment_results/RQ2/" + mode + "_" + classifier +
                  ".csv",
                  index=False)
    else:
        df.to_csv("./data/experiment_results/RQ2/" + mode + "_" + feature +
                  "_" + classifier + ".csv",
                  index=False)
Example 16
def experiment_RQ1(mode="WPDP", datatype="tokens"):
    res = {}
    dataset, dataset_list = dataset_generation(mode, datatype)
    vocab = WordVocab.load_vocab("./pretrained_models/bert/vocab.txt")

    tokenEmb, posEmb = load_bert_weight(max_feature)
    count = 0
    for project_name in dataset_list:
        print(project_name)
        res_in = {}
        if len(dataset[project_name][1][0]) >= 1000:
            continue
        if count == 4:
            break
        count += 1
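        # sweep the EDA augmentation count i passed to gen_eda below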
        for i in [0, 2, 4, 8, 16, 32]:
            if mode == "WPDP":
                pre_project_name = dataset[project_name][0][0]
                train_seq_feat, train_stat_feat, train_y = dataset[project_name][1][0], dataset[project_name][1][1], \
                                                           dataset[project_name][1][2]
                target_seq_feat, target_stat_feat, target_y = dataset[project_name][2][0], dataset[project_name][2][1], \
                                                              dataset[project_name][2][2]
            else:
                train_seq_feat, train_stat_feat, train_y = dataset[project_name][0][0], dataset[project_name][0][1], \
                                                           dataset[project_name][0][2]
                target_seq_feat, target_stat_feat, target_y = dataset[project_name][1][0], dataset[project_name][1][1], \
                                                              dataset[project_name][1][2]

            #oversampling
            train_seq_feat, train_stat_feat, train_y = data_oversampling(
                train_seq_feat, train_stat_feat, train_y)
            # data generation: generated times: i
            new_data, train_stat_feat = gen_eda(train_seq_feat.tolist(),
                                                train_stat_feat,
                                                train_y.tolist(), 0.1, i)
            #new_data, statics_feat = gen_eda(seq_feat.tolist(), statics_feat, y.tolist(), 0.1, i)
            train_seq_feat = new_data["seq"]
            train_y = new_data["bug"]
            maxlen = 512
            del new_data
            print("processing begin..")
            train_seq_feat = train_seq_feat.apply(
                lambda x: tokenize(x, maxlen, max_feature, vocab))
            target_seq_feat = target_seq_feat.apply(
                lambda x: tokenize(x, maxlen, max_feature, vocab))
            print("processing finished")
            train_seq_feat = np.array(list(train_seq_feat))
            target_seq_feat = np.array(list(target_seq_feat))
            train_posEmb = np.expand_dims(posEmb,
                                          0).repeat(train_seq_feat.shape[0],
                                                    axis=0)
            target_posEmbd = np.expand_dims(posEmb,
                                            0).repeat(target_seq_feat.shape[0],
                                                      axis=0)
            pred = bert_lstm(train_seq_feat, train_y, target_seq_feat,
                             target_y, tokenEmb, train_posEmb, target_posEmbd,
                             max_feature)
            f1 = f1_score(target_y, pred)
            precision = precision_score(target_y, pred)
            recall = recall_score(target_y, pred)

            num_project_name = project_name + str(i)
            res_in[num_project_name] = [
                round(f1, 2),
                round(precision, 2),
                round(recall, 2)
            ]
        res[project_name] = res_in
    with open("./data/experiment_results/RQ1/" + mode + ".pkl", "wb") as f:
        pickle.dump(res, f)