def experiment_RQ5(mode="WPDP", datatype="ast", im=False):
    # RQ5: sweep the LSTM hidden size and record f1/precision/recall per project.
    dataset, dataset_list = dataset_generation(mode=mode, datatype="tokens")
    res = {}
    count = 0
    vocab = WordVocab.load_vocab("../model_files/vocab.txt")
    tokenEmb, posEmb = load_bert_weight(max_feature)
    for project_name in dataset_list:
        res_in = {}
        for num in [8, 16, 32, 48, 64, 128, 256]:
            if mode == "WPDP":
                pre_project_name = dataset[project_name][0]
                train_seq_feat, train_stat_feat, train_y = \
                    dataset[project_name][1][0], dataset[project_name][1][1], \
                    dataset[project_name][1][2]
                target_seq_feat, target_stat_feat, target_y = \
                    dataset[project_name][2][0], dataset[project_name][2][1], \
                    dataset[project_name][2][2]
            else:
                train_seq_feat, train_stat_feat, train_y = \
                    dataset[project_name][0][0], dataset[project_name][0][1], \
                    dataset[project_name][0][2]
                target_seq_feat, target_stat_feat, target_y = \
                    dataset[project_name][1][0], dataset[project_name][1][1], \
                    dataset[project_name][1][2]
            train_seq_feat, train_stat_feat, train_y = data_oversampling(
                train_seq_feat, train_stat_feat, train_y)
            max_len = 512
            print("processing begin...")
            train_seq_feat = train_seq_feat.apply(
                lambda x: tokenize(x, max_len, max_feature, vocab))
            target_seq_feat = target_seq_feat.apply(
                lambda x: tokenize(x, max_len, max_feature, vocab))
            print("processing finished")
            train_seq_feat = np.array(list(train_seq_feat))
            target_seq_feat = np.array(list(target_seq_feat))
            # tile the position embeddings: one copy per sample
            train_posEmb = np.expand_dims(posEmb, 0).repeat(train_seq_feat.shape[0], axis=0)
            target_posEmb = np.expand_dims(posEmb, 0).repeat(target_seq_feat.shape[0], axis=0)
            pred = bert_lstm(train_seq_feat, train_y, target_seq_feat, target_y,
                             tokenEmb, train_posEmb, target_posEmb, max_feature,
                             hidden_size=num)
            f1 = f1_score(target_y, pred)
            precision = precision_score(target_y, pred)
            recall = recall_score(target_y, pred)
            print([f1, precision, recall], num)
            project_prefix = project_name.split("-")[0]
            res_in[project_prefix + str(num)] = [f1, precision, recall]
            print(count)
            count += 1
        res[project_name] = res_in
    with open("../data/experiment_results/RQ5/hidden.pkl", "wb") as f:
        pickle.dump(res, f)
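# A minimal sketch (not part of the pipeline above) of the position-embedding
# broadcast used in experiment_RQ5: a (seq_len, emb_dim) table is tiled into
# one copy per sample via expand_dims + repeat. Shapes here are toy values.
import numpy as np

pos_emb = np.arange(12, dtype=np.float32).reshape(4, 3)   # (seq_len=4, emb_dim=3)
batch = np.expand_dims(pos_emb, 0).repeat(5, axis=0)      # (n_samples=5, 4, 3)
assert batch.shape == (5, 4, 3)
assert (batch[0] == batch[4]).all()                       # every sample sees the same table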
def train(self):
    print("Loading vocab", self.vocab_path)
    vocab = WordVocab.load_vocab(self.vocab_path)
    print("vocab Size: ", len(vocab))

    print("\nLoading Train Dataset")
    logkey_train, logkey_valid, time_train, time_valid = generate_train_valid(
        self.output_path + "train",
        window_size=self.window_size,
        adaptive_window=self.adaptive_window,
        valid_size=self.valid_ratio,
        sample_ratio=self.sample_ratio,
        scale=self.scale,
        scale_path=self.scale_path,
        seq_len=self.seq_len,
        min_len=self.min_len)
    train_dataset = LogDataset(logkey_train, time_train, vocab,
                               seq_len=self.seq_len,
                               corpus_lines=self.corpus_lines,
                               on_memory=self.on_memory,
                               mask_ratio=self.mask_ratio)

    print("\nLoading valid Dataset")
    valid_dataset = LogDataset(logkey_valid, time_valid, vocab,
                               seq_len=self.seq_len,
                               on_memory=self.on_memory,
                               mask_ratio=self.mask_ratio)

    print("Creating Dataloader")
    self.train_data_loader = DataLoader(train_dataset,
                                        batch_size=self.batch_size,
                                        num_workers=self.num_workers,
                                        collate_fn=train_dataset.collate_fn,
                                        drop_last=True)
    self.valid_data_loader = DataLoader(valid_dataset,
                                        batch_size=self.batch_size,
                                        num_workers=self.num_workers,
                                        collate_fn=train_dataset.collate_fn,
                                        drop_last=True)

    # free the raw splits before building the model
    del train_dataset, valid_dataset
    del logkey_train, logkey_valid, time_train, time_valid
    gc.collect()

    print("Building BERT model")
    bert = BERT(len(vocab), max_len=self.max_len, hidden=self.hidden,
                n_layers=self.layers, attn_heads=self.attn_heads,
                is_logkey=self.is_logkey, is_time=self.is_time)

    print("Creating BERT Trainer")
    self.trainer = BERTTrainer(bert, len(vocab),
                               train_dataloader=self.train_data_loader,
                               valid_dataloader=self.valid_data_loader,
                               lr=self.lr,
                               betas=(self.adam_beta1, self.adam_beta2),
                               weight_decay=self.adam_weight_decay,
                               with_cuda=self.with_cuda,
                               cuda_devices=self.cuda_devices,
                               log_freq=self.log_freq,
                               is_logkey=self.is_logkey,
                               is_time=self.is_time,
                               hypersphere_loss=self.hypersphere_loss)

    self.start_iteration(surfix_log="log2")
    self.plot_train_valid_loss("_log2")
def test_custom_dataset():
    vocab = WordVocab.load_vocab(args.vocab_path)
    cd = CustomBERTDataset(corpus_path=args.corpus_path,
                           vocab=vocab,
                           seq_len=args.seq_len,
                           encoding=args.encoding,
                           corpus_lines=args.corpus_lines,
                           on_memory=args.on_memory)
    # sample a line, apply random-word masking, and inspect the results
    t1 = cd.get_random_line()
    t1_random, t1_label = cd.random_word(t1)
    print(t1_random)
    print(t1_label)
    print(cd[0])
def __init__(self, device=None, jit=False):
    self.device = device
    self.jit = jit
    # Parse a fixed argument list to avoid reading sys.argv here.
    args = parse_args(args=[
        '--train_dataset', 'data/corpus.small',
        '--test_dataset', 'data/corpus.small',
        '--vocab_path', 'data/vocab.small',
        '--output_path', 'bert.model',
    ])
    args.with_cuda = self.device == 'cuda'
    args.script = self.jit

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len,
                               on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab), hidden=args.hidden,
                n_layers=args.layers, attn_heads=args.attn_heads)
    if args.script:
        print("Scripting BERT model")
        bert = torch.jit.script(bert)

    self.trainer = BERTTrainer(bert, len(vocab),
                               train_dataloader=train_data_loader,
                               test_dataloader=test_data_loader,
                               lr=args.lr,
                               betas=(args.adam_beta1, args.adam_beta2),
                               weight_decay=args.adam_weight_decay,
                               with_cuda=args.with_cuda,
                               cuda_devices=args.cuda_devices,
                               log_freq=args.log_freq,
                               debug=args.debug)

    # capture one batch as the benchmark's example inputs
    example_batch = next(iter(train_data_loader))
    self.example_inputs = (example_batch['bert_input'].to(self.device),
                           example_batch['segment_label'].to(self.device))
parser.add_argument("-e", "--epochs", type=int, default=10) parser.add_argument("-w", "--num_workers", type=int, default=5) parser.add_argument("--corpus_lines", type=int, default=None) parser.add_argument("--lr", type=float, default=1e-3) parser.add_argument("--adam_weight_decay", type=float, default=0.01) parser.add_argument("--adam_beta1", type=float, default=0.9) parser.add_argument("--adam_beta2", type=float, default=0.999) parser.add_argument("--log_freq", type=int, default=10) parser.add_argument("-c", "--cuda", type=bool, default=True) args = parser.parse_args() print("Loading Vocab", args.vocab_path) vocab = WordVocab.load_vocab(args.vocab_path) print("Vocab Size: ", len(vocab)) print("Loading Train Dataset", args.train_dataset) train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len, corpus_lines=args.corpus_lines) print("Loading Test Dataset", args.test_dataset) test_dataset = BERTDataset( args.test_dataset, vocab, seq_len=args.seq_len) if args.test_dataset is not None else None print("Creating Dataloader") train_data_loader = DataLoader(train_dataset,
def predict(self):
    model = torch.load(self.model_path)
    model.to(self.device)
    model.eval()
    print('model_path: {}'.format(self.model_path))

    start_time = time.time()
    vocab = WordVocab.load_vocab(self.vocab_path)

    scale = None
    error_dict = None
    if self.is_time:
        with open(self.scale_path, "rb") as f:
            scale = pickle.load(f)
        with open(self.model_dir + "error_dict.pkl", 'rb') as f:
            error_dict = pickle.load(f)

    if self.hypersphere_loss:
        center_dict = torch.load(self.model_dir + "best_center.pt")
        self.center = center_dict["center"]
        self.radius = center_dict["radius"]

    print("test normal predicting")
    test_normal_results, test_normal_errors = self.helper(
        model, self.output_dir, "test_normal", vocab, scale, error_dict)

    print("test abnormal predicting")
    test_abnormal_results, test_abnormal_errors = self.helper(
        model, self.output_dir, "test_abnormal", vocab, scale, error_dict)

    print("Saving test normal results")
    with open(self.model_dir + "test_normal_results", "wb") as f:
        pickle.dump(test_normal_results, f)

    print("Saving test abnormal results")
    with open(self.model_dir + "test_abnormal_results", "wb") as f:
        pickle.dump(test_abnormal_results, f)

    print("Saving test normal errors")
    with open(self.model_dir + "test_normal_errors.pkl", "wb") as f:
        pickle.dump(test_normal_errors, f)

    print("Saving test abnormal errors")
    with open(self.model_dir + "test_abnormal_errors.pkl", "wb") as f:
        pickle.dump(test_abnormal_errors, f)

    params = {
        "is_logkey": self.is_logkey,
        "is_time": self.is_time,
        "hypersphere_loss": self.hypersphere_loss,
        "hypersphere_loss_test": self.hypersphere_loss_test,
    }
    best_th, best_seq_th, FP, TP, TN, FN, P, R, F1 = find_best_threshold(
        test_normal_results,
        test_abnormal_results,
        params=params,
        th_range=np.arange(10),
        seq_range=np.arange(0, 1, 0.1))

    print("best threshold: {}, best threshold ratio: {}".format(best_th, best_seq_th))
    print("TP: {}, TN: {}, FP: {}, FN: {}".format(TP, TN, FP, FN))
    print('Precision: {:.2f}%, Recall: {:.2f}%, F1-measure: {:.2f}%'.format(P, R, F1))

    elapsed_time = time.time() - start_time
    print('elapsed_time: {}'.format(elapsed_time))
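# For reference, a self-contained restatement of the standard definitions behind
# the P/R/F1 values printed above (assumption: find_best_threshold reports
# percentages, as the format string suggests).
def prf_from_counts(TP, FP, FN):
    """Compute precision, recall and F1 (as percentages) from confusion counts."""
    precision = 100.0 * TP / (TP + FP) if TP + FP else 0.0
    recall = 100.0 * TP / (TP + FN) if TP + FN else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1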
train_parser.set_defaults(mode='train')

predict_parser = subparsers.add_parser('predict')
predict_parser.set_defaults(mode='predict')
predict_parser.add_argument("-m", "--mean", type=float, default=0)
predict_parser.add_argument("-s", "--std", type=float, default=1)

vocab_parser = subparsers.add_parser('vocab')
vocab_parser.set_defaults(mode='vocab')
vocab_parser.add_argument("-s", "--vocab_size", type=int, default=None)
vocab_parser.add_argument("-e", "--encoding", type=str, default="utf-8")
vocab_parser.add_argument("-m", "--min_freq", type=int, default=1)

args = parser.parse_args()
print("arguments", args)

if args.mode == 'train':
    Trainer(options).train()
elif args.mode == 'predict':
    Predictor(options).predict()
elif args.mode == 'vocab':
    with open(options["train_vocab"], 'r') as f:
        logs = f.readlines()
    vocab = WordVocab(logs)
    print("vocab_size", len(vocab))
    vocab.save_vocab(options["vocab_path"])
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", type=str, default='../data/data.bert',
                        help="train dataset for training bert")
    parser.add_argument("-t", "--test_dataset", type=str, default=None,
                        help="test set for evaluating the train set")
    parser.add_argument("-v", "--vocab_path", type=str, default='../data/bert.vb',
                        help="vocab model path built with bert-vocab")
    parser.add_argument("-o", "--output_path", type=str, default='../data/bert.model',
                        help="e.g. output/bert.model")

    parser.add_argument("-hs", "--hidden", type=int, default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8,
                        help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=20,
                        help="maximum sequence length")

    parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size")
    parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=5,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda", type=bool, default=True,
                        help="train with CUDA: true or false")
    parser.add_argument("--log_freq", type=int, default=10,
                        help="print loss every n iterations")
    parser.add_argument("--corpus_lines", type=int, default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=True,
                        help="load corpus on memory: true or false")

    parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01,
                        help="weight decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")

    args = parser.parse_args()

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines, on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len,
                               on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers,
                attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, len(vocab),
                          train_dataloader=train_data_loader,
                          test_dataloader=test_data_loader,
                          lr=args.lr, betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq)

    print("Training Start")
    for epoch in range(args.epochs):
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)
        if test_data_loader is not None:
            trainer.test(epoch)
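# Hypothetical smoke test for the train() entry point above: every flag has a
# default, so temporarily overriding sys.argv with a short run is enough to
# exercise it end to end.
if __name__ == "__main__":
    import sys
    sys.argv = ["train.py", "-e", "1", "-b", "8", "-w", "0"]
    train()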
import argparse

from bert_pytorch.dataset import WordVocab

parser = argparse.ArgumentParser()
parser.add_argument("-c", "--corpus_path", required=True, type=str)
parser.add_argument("-o", "--output_path", required=True, type=str)
parser.add_argument("-s", "--vocab_size", type=int, default=None)
parser.add_argument("-e", "--encoding", type=str, default="utf-8")
parser.add_argument("-m", "--min_freq", type=int, default=1)
args = parser.parse_args()

with open(args.corpus_path, "r", encoding=args.encoding) as f:
    vocab = WordVocab(f, max_size=args.vocab_size, min_freq=args.min_freq)

print("VOCAB SIZE:", len(vocab))
vocab.save_vocab(args.output_path)
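# A small round-trip check (illustrative only): build a WordVocab from an
# in-memory corpus, save it, and reload it with load_vocab as the scripts
# above do. "tiny_vocab.pkl" is a throwaway path.
texts = ["the quick brown fox", "the lazy dog"]
v = WordVocab(texts, max_size=None, min_freq=1)
v.save_vocab("tiny_vocab.pkl")
v2 = WordVocab.load_vocab("tiny_vocab.pkl")
assert len(v) == len(v2)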
                    required=True)
parser.add_argument("-d", "--sop_dataset_path", type=str, required=True)
parser.add_argument("-t", "--train_and_validate", type=str, default="True")
parser.add_argument("-e", "--epochs", type=int, default=10)
args = parser.parse_args()

# file paths
vocab_path = args.vocab_path
albert_model_path = args.bert_model_path
clf_model_path = args.sop_classifier_model_path
sop_dataset_path = args.sop_dataset_path
train_and_validate = args.train_and_validate.lower() == 'true'

# load vocabulary
vocab = WordVocab.load_vocab(vocab_path)

# load the pretrained ALBERT model; torch.load returns the full saved model,
# so it replaces the freshly constructed instance
bert = ALBERT(vocab_size=len(vocab), embed_size=128, hidden=256,
              n_layers=8, attn_heads=8, seq_len=64)
bert = torch.load(albert_model_path)

# parameters (note: hard-coded; args.epochs is parsed above but unused here)
num_class = 37
batch_size = 64
seq_len = 64
epochs = 20
predict_parser = subparsers.add_parser('predict')
predict_parser.set_defaults(mode='predict')
predict_parser.add_argument("-m", "--mean", type=float, default=0)
predict_parser.add_argument("-s", "--std", type=float, default=1)

vocab_parser = subparsers.add_parser('vocab')
vocab_parser.set_defaults(mode='vocab')
vocab_parser.add_argument("-s", "--vocab_size", type=int, default=None)
vocab_parser.add_argument("-e", "--encoding", type=str, default="utf-8")
vocab_parser.add_argument("-m", "--min_freq", type=int, default=1)

args = parser.parse_args()
print("arguments", args)

if args.mode == 'train':
    Trainer(options).train()
elif args.mode == 'predict':
    Predictor(options).predict()
elif args.mode == 'vocab':
    with open(options["train_vocab"], "r") as f:
        texts = f.readlines()
    vocab = WordVocab(texts, max_size=args.vocab_size, min_freq=args.min_freq)
    print("VOCAB SIZE:", len(vocab))
    print("save vocab in", options["vocab_path"])
    vocab.save_vocab(options["vocab_path"])
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", required=True, type=str,
                        help="train dataset for training bert")
    parser.add_argument("-t", "--valid_dataset", type=str, default=None,
                        help="valid set for evaluating the train set")
    parser.add_argument("-v", "--vocab_path", required=True, type=str,
                        help="vocab model path built with bert-vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str,
                        help="e.g. output/bert.model")

    parser.add_argument("-hs", "--hidden", type=int, default=256,
                        help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8,
                        help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=20,
                        help="maximum sequence length")

    parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size")
    parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=5,
                        help="dataloader worker size")

    parser.add_argument("--with_cuda", type=bool, default=True,
                        help="train with CUDA: true or false")
    parser.add_argument("--log_freq", type=int, default=10,
                        help="print loss every n iterations")
    parser.add_argument("--corpus_lines", type=int, default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None,
                        help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=True,
                        help="load corpus on memory: true or false")

    parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01,
                        help="weight decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")

    # parse a fixed command line for this experiment
    args = parser.parse_args(
        '-c ../data/corpus.small -t ../data/valid.small '
        '-v ../data/vocab.small -o ../output/bert.model'.split())

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines, on_memory=args.on_memory)

    print("Loading valid Dataset", args.valid_dataset)
    valid_dataset = BERTDataset(args.valid_dataset, vocab, seq_len=args.seq_len,
                                on_memory=args.on_memory) \
        if args.valid_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    valid_data_loader = DataLoader(valid_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers) \
        if valid_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers,
                attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, len(vocab),
                          train_dataloader=train_data_loader,
                          valid_dataloader=valid_data_loader,
                          lr=args.lr, betas=(args.adam_beta1, args.adam_beta2),
                          weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          log_freq=args.log_freq)

    print("Training Start")
    best_loss = float('inf')
    for epoch in range(args.epochs):
        trainer.train(epoch)
        avg_loss = trainer.valid(epoch)
        # keep only the checkpoint with the best validation loss
        if avg_loss < best_loss:
            best_loss = avg_loss
            trainer.save(epoch, args.output_path)
def main():
    opt.use_bert = False
    opt.build_own_vocab = True
    opt.useAreadyVocab = True
    # opt.src_seq_length_trunc = 510

    if opt.use_bert:
        bert_model = 'bert-base-uncased'
        opt.tokenizer = BertTokenizer.from_pretrained(bert_model)

    if opt.dataset_name == 'kp20k':
        src_fields = ['title', 'abstract']
        trg_fields = ['keyword']
    elif opt.dataset_name == 'stackexchange':
        src_fields = ['title', 'question']
        trg_fields = ['tags']
    else:
        raise Exception('Unsupported dataset name=%s' % opt.dataset_name)

    print("Loading training/validation/test data...")
    tokenized_train_pairs = pykp.io.load_src_trgs_pairs(
        source_json_path=opt.source_train_file,
        dataset_name=opt.dataset_name,
        src_fields=src_fields,
        trg_fields=trg_fields,
        opt=opt,
        valid_check=True)
    tokenized_valid_pairs = pykp.io.load_src_trgs_pairs(
        source_json_path=opt.source_valid_file,
        dataset_name=opt.dataset_name,
        src_fields=src_fields,
        trg_fields=trg_fields,
        opt=opt,
        valid_check=False)
    tokenized_test_pairs = pykp.io.load_src_trgs_pairs(
        source_json_path=opt.source_test_file,
        dataset_name=opt.dataset_name,
        src_fields=src_fields,
        trg_fields=trg_fields,
        opt=opt,
        valid_check=False)

    if opt.use_bert and not opt.build_own_vocab:
        print("Loading BERT Vocab...")
        word2id = opt.tokenizer.vocab
        id2word = opt.tokenizer.ids_to_tokens
        vocab = None
        print('Vocab size = %d' % len(word2id))
    elif opt.useAreadyVocab:
        # reuse a previously built WordVocab instead of rebuilding one
        vocab = WordVocab.load_vocab("data4/vocab.30")
        word2id = vocab.stoi
        id2word = vocab.itos
        vocab = vocab.freqs
    else:
        print("Building Vocab...")
        word2id, id2word, vocab = pykp.io.build_vocab(tokenized_train_pairs, opt)
        print('Vocab size = %d' % len(vocab))

    print("Dumping dict to disk")
    opt.vocab_path = os.path.join(opt.subset_output_path, opt.dataset_name + '.vocab.pt')
    with open(opt.vocab_path, 'wb') as f:
        torch.save([word2id, id2word, vocab], f)
    opt.vocab_path = os.path.join(opt.output_path, opt.dataset_name + '.vocab.pt')
    with open(opt.vocab_path, 'wb') as f:
        torch.save([word2id, id2word, vocab], f)

    print("Exporting a small dataset to %s (for debugging); "
          "the train subset is capped at 20000 pairs" % opt.subset_output_path)
    pykp.io.process_and_export_dataset(tokenized_train_pairs[:20000], word2id, id2word,
                                       opt, opt.subset_output_path,
                                       dataset_name=opt.dataset_name, data_type='train')
    pykp.io.process_and_export_dataset(tokenized_valid_pairs, word2id, id2word,
                                       opt, opt.subset_output_path,
                                       dataset_name=opt.dataset_name, data_type='valid')
    pykp.io.process_and_export_dataset(tokenized_test_pairs, word2id, id2word,
                                       opt, opt.subset_output_path,
                                       dataset_name=opt.dataset_name, data_type='test')

    print("Exporting complete dataset to %s" % opt.output_path)
    pykp.io.process_and_export_dataset(tokenized_train_pairs, word2id, id2word,
                                       opt, opt.output_path,
                                       dataset_name=opt.dataset_name, data_type='train')
    pykp.io.process_and_export_dataset(tokenized_valid_pairs, word2id, id2word,
                                       opt, opt.output_path,
                                       dataset_name=opt.dataset_name, data_type='valid')
    pykp.io.process_and_export_dataset(tokenized_test_pairs, word2id, id2word,
                                       opt, opt.output_path,
                                       dataset_name=opt.dataset_name, data_type='test')
def experiment_RQ3(mode="WPDP", datatype="tokens", embedding="word2vec", model="textcnn"):
    dataset, dataset_list = dataset_generation(mode=mode, datatype=datatype)
    res = {}
    vocab = WordVocab.load_vocab("./pretrained_models/bert/vocab.txt")
    tokenEmb, posEmb = load_bert_weight(max_feature)
    for project_name in dataset_list:
        if mode == "WPDP":
            pre_project_name = dataset[project_name][0]
            train_seq_feat, train_stat_feat, train_y = \
                dataset[project_name][1][0], dataset[project_name][1][1], \
                dataset[project_name][1][2]
            target_seq_feat, target_stat_feat, target_y = \
                dataset[project_name][2][0], dataset[project_name][2][1], \
                dataset[project_name][2][2]
        else:
            train_seq_feat, train_stat_feat, train_y = \
                dataset[project_name][0][0], dataset[project_name][0][1], \
                dataset[project_name][0][2]
            target_seq_feat, target_stat_feat, target_y = \
                dataset[project_name][1][0], dataset[project_name][1][1], \
                dataset[project_name][1][2]
        train_seq_feat, train_stat_feat, train_y = data_oversampling(
            train_seq_feat, train_stat_feat, train_y)
        if mode == "WPDP":
            # EDA augmentation: alpha=0.1, 3 generated variants per sample
            new_data, train_stat_feat = gen_eda(train_seq_feat.tolist(), train_stat_feat,
                                                train_y.tolist(), 0.1, 3)
            train_seq_feat = new_data["seq"]
            train_y = new_data["bug"]
        if embedding == "word2vec":
            tokenizer = Tokenizer(num_words=max_feature, lower=False)
            tokenizer.fit_on_texts(list(train_seq_feat) + list(target_seq_feat))
            word_index = tokenizer.word_index
            train_seq_feat = tokenizer.texts_to_sequences(list(train_seq_feat))
            train_seq_feat = pad_sequences(train_seq_feat, maxlen=maxlen)
            target_seq_feat = tokenizer.texts_to_sequences(list(target_seq_feat))
            target_seq_feat = pad_sequences(target_seq_feat, maxlen=maxlen)
            with open("./data/embedding_index.pkl", "rb") as f:
                embedding_index = pickle.load(f)
            embedding_matrix = build_matrix(word_index, embedding_index)
            if model == "textcnn":
                # baseline: textCNN for classification
                f1, precision, recall = textcnn(train_x=train_seq_feat, train_y=train_y,
                                                vocab=tokenizer.index_word,
                                                val_x=target_seq_feat, val_y=target_y,
                                                embedding=embedding_matrix, maxlen=maxlen,
                                                mode=embedding, trainable=False)
            else:
                f1, precision, recall = bilstm_att_model(
                    embedding, train_seq_feat, train_y, target_seq_feat, target_y,
                    tokenizer.word_index, 64, 2,
                    embedding=embedding_matrix, trainable=False)
        else:
            print("processing begin...")
            train_seq_feat = train_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            target_seq_feat = target_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            print("processing finished")
            train_seq_feat = np.array(list(train_seq_feat))
            target_seq_feat = np.array(list(target_seq_feat))
            train_posEmb = np.expand_dims(posEmb, 0).repeat(train_seq_feat.shape[0], axis=0)
            target_posEmb = np.expand_dims(posEmb, 0).repeat(target_seq_feat.shape[0], axis=0)
            if model == "textcnn":
                f1, precision, recall = textcnn(train_x=train_seq_feat, train_y=train_y,
                                                vocab=max_feature,
                                                val_x=target_seq_feat, val_y=target_y,
                                                embedding=None, maxlen=maxlen,
                                                tokenEmb=tokenEmb,
                                                train_posEmb=train_posEmb,
                                                target_posEmb=target_posEmb,
                                                mode="bert", trainable=False)
            else:
                f1, precision, recall = bilstm_att_model(
                    "bert", train_seq_feat, train_y, target_seq_feat, target_y,
                    max_feature, 64, 2, None,
                    tokenEmb, train_posEmb, target_posEmb, trainable=False)
        print([f1, precision, recall])
        res[project_name] = [round(f1, 2), round(precision, 2), round(recall, 2)]
    df = pd.DataFrame(res)
    df.to_csv("./data/experiment_results/RQ3/" + embedding + "_" + mode + "_" + model + ".csv",
              index=False)
def experiment_RQ2(mode="WPDP", feature="semantics", classifier="lr", datatype="tokens"):
    """
    Args:
        mode: "WPDP" or "CPDP".
        feature: "semantics", "tokens" or "statistical" (used on the "lr" path).
        classifier: "lr" for logistic regression (the "semantics" feature instead
            uses the BERT+LSTM model); "textcnn" or any other value selects the
            word2vec-based deep baselines.
    """
    dataset, dataset_list = dataset_generation(mode, datatype)
    vocab = WordVocab.load_vocab("./pretrained_models/bert/vocab.txt")
    res = {}
    tokenEmb, posEmb = load_bert_weight(max_feature)
    for project_name in dataset_list:
        if mode == "WPDP":
            pre_project_name = dataset[project_name][0][0]
            train_seq_feat, train_stat_feat, train_y = \
                dataset[project_name][1][0], dataset[project_name][1][1], \
                dataset[project_name][1][2]
            target_seq_feat, target_stat_feat, target_y = \
                dataset[project_name][2][0], dataset[project_name][2][1], \
                dataset[project_name][2][2]
        else:
            train_seq_feat, train_stat_feat, train_y = \
                dataset[project_name][0][0], dataset[project_name][0][1], \
                dataset[project_name][0][2]
            target_seq_feat, target_stat_feat, target_y = \
                dataset[project_name][1][0], dataset[project_name][1][1], \
                dataset[project_name][1][2]
        train_seq_feat, train_stat_feat, train_y = data_oversampling(
            train_seq_feat, train_stat_feat, train_y)
        if mode == "WPDP":
            # EDA augmentation: alpha=0.1, 3 generated variants per sample
            new_data, train_stat_feat = gen_eda(train_seq_feat.tolist(), train_stat_feat,
                                                train_y.tolist(), 0.1, 3)
            train_seq_feat = new_data["seq"]
            train_y = new_data["bug"]
            del new_data
        if classifier == "lr":
            print("processing begin...")
            train_seq_feat = train_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            target_seq_feat = target_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            print("processing finished")
            train_seq_feat = np.array(list(train_seq_feat))
            target_seq_feat = np.array(list(target_seq_feat))
            if feature == "semantics":
                train_posEmb = np.expand_dims(posEmb, 0).repeat(
                    train_seq_feat.shape[0], axis=0)
                target_posEmb = np.expand_dims(posEmb, 0).repeat(
                    target_seq_feat.shape[0], axis=0)
                pred = bert_lstm(train_seq_feat, train_y, target_seq_feat, target_y,
                                 tokenEmb, train_posEmb, target_posEmb, max_feature)
            else:
                classification = LogisticRegression()
                if feature == "tokens":
                    classification.fit(train_seq_feat, train_y)
                    pred = classification.predict(target_seq_feat)
                else:
                    classification.fit(train_stat_feat, train_y)
                    pred = classification.predict(target_stat_feat)
            f1 = f1_score(target_y, pred)
            precision = precision_score(target_y, pred)
            recall = recall_score(target_y, pred)
        else:
            tokenizer = Tokenizer(num_words=max_feature, lower=False)
            tokenizer.fit_on_texts(list(train_seq_feat) + list(target_seq_feat))
            word_index = tokenizer.word_index
            train_seq_feat = tokenizer.texts_to_sequences(list(train_seq_feat))
            train_seq_feat = pad_sequences(train_seq_feat, maxlen=maxlen)
            target_seq_feat = tokenizer.texts_to_sequences(list(target_seq_feat))
            target_seq_feat = pad_sequences(target_seq_feat, maxlen=maxlen)
            # load the precomputed embedding index
            with open("./data/embedding_index.pkl", "rb") as f:
                embedding_index = pickle.load(f)
            embedding_matrix = build_matrix(word_index, embedding_index)
            if classifier == "textcnn":
                f1, precision, recall = textcnn("word2vec", train_seq_feat, train_y,
                                                word_index, target_seq_feat, target_y,
                                                embedding=embedding_matrix, maxlen=maxlen)
            else:
                f1, precision, recall = bilstm_att_model(
                    "word2vec", train_seq_feat, train_y, target_seq_feat, target_y,
                    word_index, 64, 2, embedding=embedding_matrix)
        if mode == "WPDP":
            res[pre_project_name] = [round(f1, 2), round(precision, 2), round(recall, 2)]
        else:
            res[project_name] = [round(f1, 2), round(precision, 2), round(recall, 2)]
    df = pd.DataFrame(res)
    if classifier != "lr":
        df.to_csv("./data/experiment_results/RQ2/" + mode + "_" + classifier + ".csv",
                  index=False)
    else:
        df.to_csv("./data/experiment_results/RQ2/" + mode + "_" + feature + "_" +
                  classifier + ".csv", index=False)
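# Illustrative follow-up (not in the original script): the RQ2 CSVs written
# above can be reloaded for inspection; each column is a project and each row
# is one of [f1, precision, recall].
import pandas as pd

scores = pd.read_csv("./data/experiment_results/RQ2/WPDP_semantics_lr.csv")
print(scores.head())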
def experiment_RQ1(mode="WPDP", datatype="tokens"):
    res = {}
    dataset, dataset_list = dataset_generation(mode)
    vocab = WordVocab.load_vocab("./pretrained_models/bert/vocab.txt")
    tokenEmb, posEmb = load_bert_weight(max_feature)
    count = 0
    for project_name in dataset_list:
        print(project_name)
        res_in = {}
        # skip large projects and stop after four sampled ones
        if len(dataset[project_name][1][0]) >= 1000:
            continue
        if count == 4:
            break
        count += 1
        for i in [0, 2, 4, 8, 16, 32]:
            if mode == "WPDP":
                pre_project_name = dataset[project_name][0][0]
                train_seq_feat, train_stat_feat, train_y = \
                    dataset[project_name][1][0], dataset[project_name][1][1], \
                    dataset[project_name][1][2]
                target_seq_feat, target_stat_feat, target_y = \
                    dataset[project_name][2][0], dataset[project_name][2][1], \
                    dataset[project_name][2][2]
            else:
                train_seq_feat, train_stat_feat, train_y = \
                    dataset[project_name][0][0], dataset[project_name][0][1], \
                    dataset[project_name][0][2]
                target_seq_feat, target_stat_feat, target_y = \
                    dataset[project_name][1][0], dataset[project_name][1][1], \
                    dataset[project_name][1][2]
            # oversampling
            train_seq_feat, train_stat_feat, train_y = data_oversampling(
                train_seq_feat, train_stat_feat, train_y)
            # EDA data generation, repeated i times
            new_data, train_stat_feat = gen_eda(train_seq_feat.tolist(), train_stat_feat,
                                                train_y.tolist(), 0.1, i)
            train_seq_feat = new_data["seq"]
            train_y = new_data["bug"]
            maxlen = 512
            del new_data
            print("processing begin...")
            train_seq_feat = train_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            target_seq_feat = target_seq_feat.apply(
                lambda x: tokenize(x, 512, max_feature, vocab))
            print("processing finished")
            train_seq_feat = np.array(list(train_seq_feat))
            target_seq_feat = np.array(list(target_seq_feat))
            train_posEmb = np.expand_dims(posEmb, 0).repeat(train_seq_feat.shape[0], axis=0)
            target_posEmb = np.expand_dims(posEmb, 0).repeat(target_seq_feat.shape[0], axis=0)
            pred = bert_lstm(train_seq_feat, train_y, target_seq_feat, target_y,
                             tokenEmb, train_posEmb, target_posEmb, max_feature)
            f1 = f1_score(target_y, pred)
            precision = precision_score(target_y, pred)
            recall = recall_score(target_y, pred)
            num_project_name = project_name + str(i)
            res_in[num_project_name] = [round(f1, 2), round(precision, 2), round(recall, 2)]
        res[project_name] = res_in
    with open("./data/experiment_results/RQ1/" + mode + ".pkl", "wb") as f:
        pickle.dump(res, f)
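# Hypothetical driver tying the RQ experiments together; the mode values follow
# the "WPDP"/"CPDP" convention documented in experiment_RQ2.
if __name__ == "__main__":
    for mode in ("WPDP", "CPDP"):
        experiment_RQ1(mode=mode)
        experiment_RQ2(mode=mode, feature="semantics", classifier="lr")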