def __init__(self, opt, model_classes):
    self.opt = opt
    if 'bert' in opt.model_name:
        self.tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    else:
        self.tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
    self.trainset = ABSADataset(opt.dataset_file['train'], self.tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], self.tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    if 'bert' in opt.model_name:
        print("-------- loading pretrained BERT --------")
        from pytorch_transformers import BertModel
        self.bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                              output_attentions=True,
                                              cache_dir="pretrained/bert/")
        # older variant without attention outputs:
        # bert = BertModel.from_pretrained(opt.pretrained_bert_name, cache_dir="pretrained/bert/")
        print("-------- BERT loaded --------")
        # self.model = model_classes[opt.model_name](self.bert, opt).to(opt.device)
        # self.model = AEN_BERT(self.bert, opt).to(opt.device)
        self.model = opt.model_class(self.bert, opt).to(opt.device)
    else:
        self.model = model_classes[opt.model_name](embedding_matrix, opt).to(opt.device)
    self._print_args()
def __init__(self, opt):
    """
    Initialize the model, preprocess the data and tokenize it.
    :param opt: arguments parsed by argparse
    """
    self.opt = opt
    # BERT-style models are initialized from a pretrained BERT; other models use GloVe
    if 'bert' in opt.model_name:
        # initialize the tokenizer
        tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                   opt.pretrained_bert_name,
                                   cache_dir=opt.pretrained_bert_cache_dir)
        # load the pretrained BERT model
        bert = BertModel.from_pretrained(
            opt.pretrained_bert_name, cache_dir=opt.pretrained_bert_cache_dir)
        # pass the BERT model and the opt arguments into the custom model
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        # custom tokenizer that builds idx2word and word2idx
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        # embeddings for every vocabulary word, shape [word_nums, embedding_dimension]
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        # load the model
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    # load the training and test sets
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer,
                                recreate_caches=opt.recreate_caches)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer,
                               recreate_caches=opt.recreate_caches)
    # if valset_ratio is 0, the test set doubles as the validation set
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    # log CUDA memory usage
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
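# NOTE: the train/validation split above uses torch.utils.data.random_split, which draws a
# fresh random permutation on every run. A minimal, self-contained sketch (assuming a recent
# PyTorch version whose random_split accepts a `generator` argument) of making that split
# reproducible:
import torch
from torch.utils.data import TensorDataset, random_split

toy_dataset = TensorDataset(torch.arange(100))      # stand-in for ABSADataset
val_len = int(len(toy_dataset) * 0.1)               # stand-in for opt.valset_ratio
train_part, val_part = random_split(
    toy_dataset, (len(toy_dataset) - val_len, val_len),
    generator=torch.Generator().manual_seed(42))    # fixed seed -> identical split each run
print(len(train_part), len(val_part))               # 90 10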
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='temp_data/' + '{0}_tokenizer.dat'.format(opt.dataset), step=4 if opt.tabsa else 3) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='temp_data/' + '{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) if opt.tabsa: if opt.tabsa_with_absa: self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, True) self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, True) else: self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, False) self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, False) else: self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) self.testset = ABSADataset(opt.dataset_file['test'], tokenizer) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'roberta' in opt.pretrained_bert_name:
        tokenizer = RobertaTokenizer.from_pretrained(opt.pretrained_bert_name)
        transformer = RobertaModel.from_pretrained(opt.pretrained_bert_name,
                                                   output_attentions=True)
    elif 'bert' in opt.pretrained_bert_name:
        tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
        transformer = BertModel.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
    elif 'xlnet' in opt.pretrained_bert_name:
        tokenizer = XLNetTokenizer.from_pretrained(opt.pretrained_bert_name)
        transformer = XLNetModel.from_pretrained(opt.pretrained_bert_name,
                                                 output_attentions=True)
    if 'bert' in opt.model_name or 'xlnet' in opt.model_name:
        tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
        self.model = opt.model_class(transformer, opt).to(opt.device)
    # elif 'xlnet' in opt.model_name:
    #     tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
    #     self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
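# NOTE: a minimal illustration of the pitfall the membership test above avoids --
# `'bert' or 'xlnet' in name` parses as `'bert' or ('xlnet' in name)`, and a non-empty
# string literal is always truthy, so that chained form would route every model
# (GloVe-based ones included) into the transformer branch:
name = 'lstm'                              # hypothetical non-transformer model name
print(bool('bert' or 'xlnet' in name))     # True  -- the chained form is always truthy
print('bert' in name or 'xlnet' in name)   # False -- explicit tests give the intended answer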
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) # self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) self.testset = ABSADataset(opt.dataset_file['test'], tokenizer) # assert 0 <= opt.valset_ratio < 1 # if opt.valset_ratio > 0: # valset_len = int(len(self.trainset) * opt.valset_ratio) # self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len)) # else: # self.valset = self.testset # # if opt.device.type == 'cuda': # logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index))) model_path = 'saved/'+opt.model_name+'.hdf5' self.model.load_state_dict(torch.load(model_path))
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test1']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) self.testset = ABSADataset(opt.dataset_file['test1'], tokenizer) if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) model_path = 'state_dict/bert_spc_law_val_acc0.5314.hdf5' # provide best model path self.model.load_state_dict(torch.load(model_path))
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) # else: # tokenizer = build_tokenizer( # fnames=[opt.dataset_file['train'], opt.dataset_file['test']], # max_seq_len=opt.max_seq_len, # dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) # embedding_matrix = build_embedding_matrix( # word2idx=tokenizer.word2idx, # embed_dim=opt.embed_dim, # dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset)) # self.model = opt.model_class(embedding_matrix, opt).to(opt.device) if 'pair' in opt.model_name: if not self.opt.do_eval: self.trainset = ABSADataset_sentence_pair( opt.dataset_file['train'], tokenizer) self.testset = ABSADataset_sentence_pair(opt.dataset_file['test'], tokenizer) elif 'SA' in opt.model_name: if not self.opt.do_eval: self.trainset = SADataset(opt.dataset_file['train'], tokenizer) self.testset = SADataset(opt.dataset_file['test'], tokenizer) else: if not self.opt.do_eval: self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) self.testset = ABSADataset(opt.dataset_file['test'], tokenizer) assert 0 <= opt.valset_ratio < 1 if not self.opt.do_eval: if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
def get_predictlist(models, opt_list, tokenizer): pred_list = [] testset = ABSADataset('./datasets/semeval14/processed.csv', tokenizer) for i in range(len(models)): pred = Predictor(opt_list[i], models[i], testset) predictions = pred.save_predictions() pred_list.append(predictions) return pred_list
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: # set bert_based_vocab tokenizer = Tokenizer4Bert( opt.max_seq_len, '/data/kkzhang/aaa/command/bert-base-uncased-vocab.txt') #tokenizer = Tokenizer4Bert(opt.max_seq_len, '/home/kkzhang/bert-large-uncased/bert-large-uncased-vocab.txt') # set bert pre_train model bert = BertModel.from_pretrained( '/data/kkzhang/WordeEmbedding/bert_base/') ##### multi gpu ########## if torch.cuda.device_count() > 1: logging.info('The device has {} gpus!!!!!!!!!!!!!'.format( torch.cuda.device_count())) bert = nn.DataParallel(bert) self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) self.testset = ABSADataset(opt.dataset_file['test'], tokenizer) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
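# NOTE: the variant above wraps only the BERT sub-module in nn.DataParallel before handing
# it to the task model; other variants in this collection instead wrap the complete model
# once it has been built. A minimal sketch of that alternative, assuming the same opt/bert
# objects as above:
model = opt.model_class(bert, opt)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model = model.to(opt.device)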
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) # self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) # self.testset = ABSADataset(opt.dataset_file['test'], tokenizer) ## using our own dataset data = pd.read_csv('train_data1.csv') # test_data = pd.read_csv('../test_tOlRoBf.csv') train_data, test_data = train_test_split(data, test_size=0.1, random_state=42) self.trainset = ABSADataset(train_data, tokenizer) self.testset = ABSADataset(test_data, tokenizer) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
def do_predict(self, TEXT, TARGET):
    TEXT_1 = PreProcessing(TEXT).get_file_text()
    predict_set = ABSADataset(data_type=None,
                              fname=(TARGET.tolist(), TEXT_1.tolist(), None),
                              tokenizer=self.tokenizer)
    predict_loader = DataLoader(dataset=predict_set, batch_size=len(TEXT))
    outputs = None
    for i_batch, sample_batched in enumerate(predict_loader):
        inputs = [
            sample_batched[col].to(self.args.device)
            for col in self.args.input_colses[self.args.model_name]
        ]
        if self.args.topics is None:
            outputs = self.net(inputs)
        elif self.args.topics.index(TARGET[0]) == 0:
            outputs = self.net_0(inputs)
        elif self.args.topics.index(TARGET[0]) == 1:
            outputs = self.net_1(inputs)
        elif self.args.topics.index(TARGET[0]) == 2:
            outputs = self.net_2(inputs)
        elif self.args.topics.index(TARGET[0]) == 3:
            outputs = self.net_3(inputs)
        elif self.args.topics.index(TARGET[0]) == 4:
            outputs = self.net_4(inputs)

        # ############################# feature-lexicon approach (did not work well)
        # WORDS = list(jieba.cut(TEXT_1.tolist()[0], cut_all=False))
        # none, favor, against = 0, 0, 0
        # for word in WORDS:
        #     if word in self.word_count_none:
        #         none += len(word) * self.word_count_none[word]
        #     if word in self.word_count_favor:
        #         favor += len(word) * self.word_count_favor[word]
        #     if word in self.word_count_against:
        #         against += len(word) * self.word_count_against[word]
        #
        # none = 0.3 * none + 0.7 * outputs.detach().numpy().tolist()[0][0]
        # favor = 0.3 * favor + 0.7 * outputs.detach().numpy().tolist()[0][1]
        # against = 0.3 * against + 0.7 * outputs.detach().numpy().tolist()[0][2]
        #
        # outputs = [none, favor, against]
        # outputs = outputs.index(max(outputs))
        # print('{}, {}, {}, {}, {}, {}, {}'.format(self.idx2label[outputs],
        #                                           round(none, 4),
        #                                           round(favor, 4),
        #                                           round(against, 4),
        #                                           TARGET[0], TEXT[0],
        #                                           TEXT_1[0]))
        # ############################# feature-lexicon approach (did not work well)

    outputs = torch.argmax(outputs, dim=-1).numpy().tolist()
    return outputs
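# NOTE: the elif chain above dispatches to one of five topic-specific networks
# (net_0 .. net_4). A hypothetical, equivalent lookup -- assuming the networks were also
# collected into a list such as self.nets = [self.net_0, self.net_1, self.net_2,
# self.net_3, self.net_4] -- would express the same routing without repeating the test:
idx = self.args.topics.index(TARGET[0])
outputs = self.nets[idx](inputs)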
def __init__(self, opt): self.opt = opt if opt.model_name.lower() in ['vh_bert', 'bert_att', 'my_lcf']: tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name) config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True) self.model = opt.model_class(config, ).to(opt.device) elif 'bert' in opt.model_name.lower(): tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True) bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config) self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='./cache/{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='./cache/{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) self.testset = ABSADataset(opt.dataset_file['test'], tokenizer) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if opt.dataset in ['twitter', 'restaurant', 'laptop']:
        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)  # returns a torch Dataset
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    else:
        self.trainset = CovData(opt.dataset_file['train'], tokenizer)  # returns a torch Dataset
        self.testset = CovData(opt.dataset_file['test'], tokenizer)
    # split a validation set off the training set according to valset_ratio
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True) # tokenizer = Tokenizer4Bert(opt.max_seq_len, '/content/drive/My Drive/FYP/pretrained_BERT_further_trained_with_criminal_corpus/vocab.txt') # bert = BertModel.from_pretrained('/content/drive/My Drive/FYP/pretrained_BERT_further_trained_with_criminal_corpus') self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) self.trainset = ABSADataset( opt.dataset_file['train'], './datasets/semeval14/law_train.raw.graph', tokenizer) self.testset = ABSADataset(opt.dataset_file['test'], './datasets/semeval14/law_train.raw.graph', tokenizer) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
def __init__(self, opt): self.opt = opt if 'aen_simple' == opt.model_name: if 'bert' == opt.bert_type: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) elif 'roberta' == opt.bert_type: tokenizer = Tokenizer4RoBerta(opt.max_seq_len, opt.pretrained_bert_name) roberta = RobertaModel.from_pretrained( opt.pretrained_bert_name) self.model = opt.model_class(roberta, opt).to(opt.device) elif 'roberta' in opt.model_name: tokenizer = Tokenizer4RoBerta(opt.max_seq_len, opt.pretrained_bert_name) roberta = RobertaModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(roberta, opt).to(opt.device) elif 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) self.testset = ABSADataset(opt.dataset_file['test'], tokenizer) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) # freeze pretrained bert params # for param in bert.parameters(): # param.requires_grad = False self.model = opt.model_class(bert, opt) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) trainset = ABSADataset(opt.dataset_file['train'], tokenizer) testset = ABSADataset(opt.dataset_file['test'], tokenizer) self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True) self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False) if opt.device.type == 'cuda': self.model = nn.DataParallel(self.model).cuda() print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index)) else: self.model = self.model.to(opt.device) self._print_args()
def __init__(self, opt): self.opt = opt # prepare inputs tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) if opt.dan == True: boc = build_boc(' ', dat_fname='bag_of_concepts.dat') affective_matrix = build_embedding_matrix( word2idx=boc.word2idx, embed_dim=100, dat_fname='100_concept_embeddings.dat') self.model = opt.model_class(embedding_matrix, opt).to(opt.device) else: boc = None self.model = opt.model_class(embedding_matrix, opt).to(opt.device) trainset = ABSADataset(opt.dataset_file['train'], tokenizer, boc) testset = ABSADataset(opt.dataset_file['test'], tokenizer, boc) self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True) self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False) if opt.device.type == 'cuda': print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index)) self._print_args()
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name + '/vocab.txt') bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, fname=opt.embed_fname, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.train_dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) self.testset = ABSADataset(opt.dataset_file['test'], tokenizer) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0 and (not opt.val_test): print('Splitting trainset in train and val') valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: print('Setting testset as valset through valsetratio = 0') self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: # opt.learning_rate = 2e-5 tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) else: # opt.learning_rate = 0.001 tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) trainset = ABSADataset(opt.dataset_file['train'], tokenizer) testset = ABSADataset(opt.dataset_file['test'], tokenizer) self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True) self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False) if opt.device.type == 'cuda': logging.info("cuda memory allocated:{}".format( torch.cuda.memory_allocated(device=opt.device.index))) self._log_write_args()
def __init__(self, arguments):
    # project hyperparameters
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
    self.args = parser.parse_args()
    self.arguments = arguments
    self.dataset = Dataset(epochs=self.args.EPOCHS,
                           batch=self.args.BATCH,
                           val_batch=self.args.BATCH)
    if 'bert' in self.arguments.model_name:
        self.tokenizer = Tokenizer4Bert(
            max_seq_len=self.arguments.max_seq_len,
            pretrained_bert_name=os.path.join(os.getcwd(),
                                              self.arguments.pretrained_bert_name))
        bert = BertModel.from_pretrained(
            pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
        self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
    else:
        self.tokenizer = Util.bulid_tokenizer(
            fnames=[self.arguments.dataset_file['train'],
                    self.arguments.dataset_file['test']],
            max_seq_len=self.arguments.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
        embedding_matrix = Util.build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=self.arguments.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(self.arguments.embed_dim), self.arguments.dataset))
        self.model = self.arguments.model_class(embedding_matrix,
                                                self.arguments).to(self.arguments.device)
    if self.arguments.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=self.arguments.device.index)))
    Util.print_args(model=self.model, logger=logger, args=self.arguments)
    target_text, stance, _, _ = self.dataset.get_all_data()
    target = np.asarray([i['TARGET'].lower() for i in target_text])
    text = np.asarray([i['TEXT'].lower() for i in target_text])
    stance = np.asarray([i['STANCE'] for i in stance])
    self.target_set = set()
    for tar in target:
        self.target_set.add(tar)
    text = PreProcessing(text).get_file_text()
    trainset = ABSADataset(data_type=None,
                           fname=(target, text, stance),
                           tokenizer=self.tokenizer)
    valset_len = int(len(trainset) * self.arguments.valset_ratio)
    self.trainset, self.valset = random_split(
        trainset, (len(trainset) - valset_len, valset_len))
def __init__(self, opt): self.opt = opt tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) self.trainset = ABSADataset('./data/Train_Data.csv', tokenizer) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self._print_args()
def get_model(models): opt_list = [] pred_list = [] for model in models: opt = main(model) opt_list.append(opt) tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) bert = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True) testset = ABSADataset(opt.dataset_file['test'], tokenizer) for opt in opt_list: if (opt.model_name == "bert_spc" or opt.model_name == "lcf_bert"): bert1 = BertModel.from_pretrained(opt.pretrained_bert_name) pred = Predictor(opt, tokenizer, bert1, testset) else: pred = Predictor(opt, tokenizer, bert, testset) predictions = pred.save_predictions() pred_list.append(predictions) return pred_list
def __init__(self, arguments):
    # project hyperparameters
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=4, type=int, help="batch size")
    self.args = parser.parse_args()
    self.arguments = arguments
    self.dataset = Dataset(epochs=self.args.EPOCHS,
                           batch=self.args.BATCH,
                           val_batch=self.args.BATCH)
    if 'bert' in self.arguments.model_name:
        self.tokenizer = Tokenizer4Bert(
            max_seq_len=self.arguments.max_seq_len,
            pretrained_bert_name=os.path.join(os.getcwd(),
                                              self.arguments.pretrained_bert_name))
        bert = BertModel.from_pretrained(
            pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
        self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
    else:
        self.tokenizer = Util.bulid_tokenizer(
            fnames=[self.arguments.dataset_file['train'],
                    self.arguments.dataset_file['test']],
            max_seq_len=self.arguments.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
        embedding_matrix = Util.build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=self.arguments.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(self.arguments.embed_dim), self.arguments.dataset))
        self.model = self.arguments.model_class(embedding_matrix,
                                                self.arguments).to(self.arguments.device)
    if self.arguments.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=self.arguments.device.index)))
    Util.print_args(model=self.model, logger=logger, args=self.arguments)
    target_text, stance, _, _ = self.dataset.get_all_data()
    target = np.asarray([i['TARGET'].lower() for i in target_text])
    text = np.asarray([i['TEXT'].lower() for i in target_text])
    stance = np.asarray([i['STANCE'] for i in stance])

    # ############################# feature-lexicon approach (did not work well)
    # train_data = pd.DataFrame(data=[stance, target, text]).T
    # train_data.columns = ['STANCE', 'TARGET', 'TEXT']
    # Util.calculate_word_count(train_data)
    # ############################# feature-lexicon approach (did not work well)

    self.target_set = set()
    for tar in target:
        self.target_set.add(tar)
    text = PreProcessing(text).get_file_text()

    # ############################# synonym-replacement augmentation (did not work well)
    # self.synonyms = SynonymsReplacer()
    # text_add = []
    # for index in range(len(text)):
    #     text_add.append(self.synonyms.get_syno_sents_list(text[index]))
    # target = np.append(target, target)
    # text = np.append(text, np.asarray(text_add))
    # stance = np.append(stance, stance)
    # ############################# synonym-replacement augmentation (did not work well)

    print('target.shape: {}, text.shape: {}, stance.shape: {}'.format(
        target.shape, text.shape, stance.shape))
    trainset = ABSADataset(data_type=None,
                           fname=(target, text, stance),
                           tokenizer=self.tokenizer)
    valset_len = int(len(trainset) * self.arguments.valset_ratio)
    self.trainset, self.valset = random_split(
        trainset, (len(trainset) - valset_len, valset_len))
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()

def _print_args(self):
    n_trainable_params, n_nontrainable_params = 0, 0
    for p in self.model.parameters():
        n_params = torch.prod(torch.tensor(p.shape))
        if p.requires_grad:
            n_trainable_params += n_params
        else:
            n_nontrainable_params += n_params
    logger.info('n_trainable_params: {0}, n_nontrainable_params: {1}'.format(
        n_trainable_params, n_nontrainable_params))
    logger.info('> training arguments:')
    for arg in vars(self.opt):
        logger.info('>>> {0}: {1}'.format(arg, getattr(self.opt, arg)))

def _reset_params(self):
    for child in self.model.children():
        if type(child) != BertModel:  # skip bert params
            for p in child.parameters():
                if p.requires_grad:
                    if len(p.shape) > 1:
                        self.opt.initializer(p)
                    else:
                        stdv = 1.
/ math.sqrt(p.shape[0]) torch.nn.init.uniform_(p, a=-stdv, b=stdv) def _train(self, criterion, optimizer, train_data_loader, val_data_loader): max_val_acc = 0 max_val_f1 = 0 global_step = 0 path = None for epoch in range(self.opt.num_epoch): logger.info('>' * 100) logger.info('epoch: {}'.format(epoch)) n_correct, n_total, loss_total = 0, 0, 0 # switch model to training mode self.model.train() for i_batch, sample_batched in enumerate(train_data_loader): global_step += 1 # clear gradient accumulators optimizer.zero_grad() inputs = [ sample_batched[col].to(self.opt.device) for col in self.opt.inputs_cols ] outputs = self.model(inputs) targets = sample_batched['polarity'].to(self.opt.device) loss = criterion(outputs, targets) loss.backward() optimizer.step() n_correct += (torch.argmax(outputs, -1) == targets).sum().item() n_total += len(outputs) loss_total += loss.item() * len(outputs) if global_step % self.opt.log_step == 0: train_acc = n_correct / n_total train_loss = loss_total / n_total logger.info('loss: {:.4f}, acc: {:.4f}'.format( train_loss, train_acc)) val_acc, val_f1 = self._evaluate_acc_f1(val_data_loader) logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format( val_acc, val_f1)) if val_acc > max_val_acc: max_val_acc = val_acc if not os.path.exists('state_dict'): os.mkdir('state_dict') path = 'state_dict/{0}_{1}_val_acc{2}'.format( self.opt.model_name, self.opt.dataset, round(val_acc, 4)) torch.save(self.model.state_dict(), path) logger.info('>> saved: {}'.format(path)) if val_f1 > max_val_f1: max_val_f1 = val_f1 return path def _evaluate_acc_f1(self, data_loader): n_correct, n_total = 0, 0 t_targets_all, t_outputs_all = None, None # switch model to evaluation mode self.model.eval() with torch.no_grad(): for t_batch, t_sample_batched in enumerate(data_loader): t_inputs = [ t_sample_batched[col].to(self.opt.device) for col in self.opt.inputs_cols ] t_targets = t_sample_batched['polarity'].to( self.opt.device) t_outputs = self.model(t_inputs) n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item() n_total += len(t_outputs) if t_targets_all is None: t_targets_all = t_targets t_outputs_all = t_outputs else: t_targets_all = torch.cat((t_targets_all, t_targets), dim=0) t_outputs_all = torch.cat((t_outputs_all, t_outputs), dim=0) acc = n_correct / n_total f1 = metrics.f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[0, 1, 2], average='macro') return acc, f1 def run(self): # Loss and Optimizer criterion = nn.CrossEntropyLoss() _params = filter(lambda p: p.requires_grad, self.model.parameters()) optimizer = self.opt.optimizer(_params, lr=self.opt.learning_rate, weight_decay=self.opt.l2reg) train_data_loader = DataLoader(dataset=self.trainset, batch_size=self.opt.batch_size, shuffle=True) test_data_loader = DataLoader(dataset=self.testset, batch_size=self.opt.batch_size, shuffle=False) val_data_loader = DataLoader(dataset=self.valset, batch_size=self.opt.batch_size, shuffle=False) self._reset_params() best_model_path = self._train(criterion, optimizer, train_data_loader, val_data_loader) self.model.load_state_dict(torch.load(best_model_path)) self.model.eval() test_acc, test_f1 = self._evaluate_acc_f1(test_data_loader) logger.info('>> test_acc: {:.4f}, test_f1: {:.4f}'.format( test_acc, test_f1))
def __init__(self, opt): self.opt = opt if 'bert' in opt.model_name: # tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) # bert = BertModel.from_pretrained(opt.pretrained_bert_name) # self.model = opt.model_class(bert, opt).to(opt.device) pass else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='gen_data/tokenizer/{0}_tokenizer.dat'.format( opt.dataset)) pos_tagger_train = build_pos_tagger( fname=opt.dataset_file['train'], dat_fname="gen_data/pos/pos_tagger_{}_train.dat".format( opt.dataset), modelfile=opt.stanford_pos_model, jarfile=opt.stanford_pos_jar, tokenizer=tokenizer) embedding_matrix = build_embedding_matrix( glove_path=opt.glove_path, word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname=opt.embedding_matrix_path) if "embed" in opt.model_name: self.model = opt.model_class(embedding_matrix, pos_tagger_train.index2vec, opt).to(opt.device) else: self.model = opt.model_class(embedding_matrix, opt).to(opt.device) pos_tagger_test = build_pos_tagger( fname=opt.dataset_file['test'], dat_fname="gen_data/pos/pos_tagger_{}_test.dat".format( opt.dataset), modelfile=opt.stanford_pos_model, jarfile=opt.stanford_pos_jar, tokenizer=tokenizer) self.trainset = ABSADataset( opt.dataset_file['train'], 'gen_data/dataset/{}_train_dataset.dat'.format(opt.dataset), tokenizer, pos_tagger_train) self.testset = ABSADataset( opt.dataset_file['test'], 'gen_data/dataset/{}_test_dataset.dat'.format(opt.dataset), tokenizer, pos_tagger_test) assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) self.tokenizer = tokenizer self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'v1' in opt.model_name and 'albert' in opt.model_name:
        tokenizer = Tokenizer4AlbertGcn(opt.max_seq_len, opt.pretrained_bert_name)
        bert = None
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'v1' in opt.model_name and 'bert' in opt.model_name:
        tokenizer = Tokenizer4BertGcn(opt.max_seq_len, opt.pretrained_bert_name)
        bert = None
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'albert_gcn' in opt.model_name:
        tokenizer = Tokenizer4AlbertGcn(opt.max_seq_len, opt.pretrained_bert_name)
        bert = AlbertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'bert_gcn' in opt.model_name:
        tokenizer = Tokenizer4BertGcn(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'albert' in opt.model_name:
        tokenizer = Tokenizer4Albert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = AlbertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if 'bert' in opt.model_name and opt.freeze_bert:
        # freeze the pretrained weights; requires_grad must be set on each parameter,
        # not on the module object itself
        try:
            for p in self.model.bert.parameters():
                p.requires_grad = False
        except AttributeError:
            for p in self.model.context_bert.parameters():
                p.requires_grad = False
    if 'gcn' in opt.model_name:
        self.trainset = ABSAGcnData(opt.dataset_file['train'], tokenizer,
                                    debug=opt.debug, from_xml=opt.from_xml)
        self.testset = ABSAGcnData(opt.dataset_file['test'], tokenizer,
                                   debug=opt.debug, from_xml=opt.from_xml)
    else:
        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer, debug=opt.debug)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer, debug=opt.debug)
    assert 0 <= opt.valset_ration < 1
    if opt.valset_ration > 0:
        valset_len = int(len(self.trainset) * opt.valset_ration)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def run(self): # Loss and Optimizer criterion = nn.CrossEntropyLoss() _params = filter(lambda p: p.requires_grad, self.model.parameters()) optimizer = self.opt.optimizer(_params, lr=self.opt.learning_rate, weight_decay=self.opt.l2reg) train_data_loader = DataLoader(dataset=self.trainset, batch_size=self.opt.batch_size, shuffle=True) test_data_loader = DataLoader(dataset=self.testset, batch_size=self.opt.batch_size, shuffle=False) val_data_loader = DataLoader(dataset=self.valset, batch_size=self.opt.batch_size, shuffle=False) self._reset_params() # best_model_path = self._train(criterion, optimizer, train_data_loader, val_data_loader) # self.model.load_state_dict(torch.load(best_model_path)) best_epoch = self._train(criterion, optimizer, train_data_loader, val_data_loader) logger.info(f'>> Optimal no. of epochs: {best_epoch+1}') # Re-initialize the model and train with the full train_set opt = self.opt if 'bert' in opt.model_name: tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name + '/vocab.txt') bert = BertModel.from_pretrained(opt.pretrained_bert_name) self.model = opt.model_class(bert, opt).to(opt.device) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, fname=opt.embed_fname, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.train_dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer) _params = filter(lambda p: p.requires_grad, self.model.parameters()) optimizer = self.opt.optimizer(_params, lr=self.opt.learning_rate, weight_decay=self.opt.l2reg) train_data_loader = DataLoader(dataset=self.trainset, batch_size=self.opt.batch_size, shuffle=True) self._reset_params() for epoch in range(best_epoch + 1): self.model.train() for i_batch, sample_batched in enumerate(train_data_loader): # global_step += 1 # clear gradient accumulators optimizer.zero_grad() inputs = [ sample_batched[col].to(self.opt.device) for col in self.opt.inputs_cols ] outputs = self.model(inputs) targets = sample_batched['polarity'].to(self.opt.device) loss = criterion(outputs, targets) loss.backward() optimizer.step() self.model.eval() test_acc, test_f1, test_outputs, test_targets = self._evaluate_acc_f1( test_data_loader) pred_fname = 'logs/{0}-{1}-{2}-{3}-{4}-{5}-test_acc-{6}-test_f1-{7}.csv'.format( self.opt.model_name, self.opt.train_dataset, self.opt.test_dataset, self.opt.seed, self.opt.valset_ratio, self.opt.expr_idx, round(test_acc, 4), round(test_f1, 4)) numpy.savetxt(pred_fname, test_outputs.numpy()) target_fname = 'logs/{0}-target.csv'.format(self.opt.test_dataset) numpy.savetxt(target_fname, test_targets.numpy()) logger.info('>> test_acc: {:.4f}, test_f1: {:.4f}'.format( test_acc, test_f1))
def run(self):
    # loss and optimizer
    criterion = nn.CrossEntropyLoss()
    _params = filter(lambda x: x.requires_grad, self.model.parameters())
    optimizer = self.arguments.optimizer(_params,
                                         lr=self.arguments.learning_rate,
                                         weight_decay=self.arguments.l2reg)
    for topic in self.arguments.topics:
        logger.info('>' * 100)
        logger.info('topic: {}'.format(topic))
        index = np.where(self.target == topic.lower())
        self.trainset = ABSADataset(data_type=None,
                                    fname=(self.target[index], self.text[index],
                                           self.stance[index]),
                                    tokenizer=self.tokenizer)
        self.valset_len = int(len(self.trainset) * self.arguments.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - self.valset_len, self.valset_len))
        train_data_loader = DataLoader(dataset=self.trainset,
                                       batch_size=self.args.BATCH,
                                       shuffle=True)
        val_data_loader = DataLoader(dataset=self.valset,
                                     batch_size=self.args.BATCH,
                                     shuffle=False)
        # training
        max_val_acc = 0
        max_val_f1 = 0
        global_step = 0
        best_model_path = None
        Util.reset_params(model=self.model, args=self.arguments)
        for epoch in range(self.args.EPOCHS):
            logger.info('>>')
            logger.info('epoch: {}'.format(epoch))
            n_correct, n_total, loss_total = 0, 0, 0
            self.model.train()
            for i_batch, sample_batched in enumerate(train_data_loader):
                global_step += 1
                optimizer.zero_grad()
                inputs = [sample_batched[col].to(self.arguments.device)
                          for col in self.arguments.inputs_cols]
                outputs = self.model(inputs)
                targets = torch.tensor(sample_batched['polarity']).to(self.arguments.device)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
                n_total += len(outputs)
                loss_total += loss.item() * len(outputs)
                if global_step % self.arguments.log_step == 0:
                    train_acc = n_correct / n_total
                    train_loss = loss_total / n_total
                    logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc))
                    val_acc, val_f1 = Util.evaluate_acc_f1(model=self.model,
                                                           args=self.arguments,
                                                           data_loader=val_data_loader)
                    logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(val_acc, val_f1))
                    if val_acc > max_val_acc:
                        max_val_acc = val_acc
                        best_model_path = os.path.join(os.getcwd(),
                                                       self.arguments.best_model_path,
                                                       topic)
                        if os.path.exists(best_model_path) is False:
                            os.mkdir(best_model_path)
                        Util.save_model(model=self.model, output_dir=best_model_path)
                        logger.info('>> saved: {}'.format(best_model_path))
                    if val_f1 > max_val_f1:
                        max_val_f1 = val_f1
                        Util.save_model(model=self.model, output_dir=best_model_path)
        logger.info('>>> target: {}'.format(self.target_set))
        logger.info('> max_val_acc: {0} max_val_f1: {1}'.format(max_val_acc, max_val_f1))
        logger.info('> train save model path: {}'.format(best_model_path))
import torch
from torch.utils.data import DataLoader
# BertModel is assumed to come from the same package used elsewhere in this code base
# (pytorch_transformers); with the newer HuggingFace package it would be
# `from transformers import BertModel`
from pytorch_transformers import BertModel

from data_utils import Tokenizer4Bert, ABSADataset
from models.aen import AEN_BERT
from utils import get_options

opt = get_options()
bert = BertModel.from_pretrained(opt.pretrained_bert_name)
model_path = 'state_dict/aen_bert_laptop_val_acc0.7821'
model = AEN_BERT(bert, opt)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
test_set = ABSADataset(opt.dataset_file['test'], tokenizer)
data_loader = DataLoader(dataset=test_set, batch_size=1, shuffle=False)

n_correct, n_total = 0, 0
t_targets_all, t_outputs_all = None, None
with torch.no_grad():
    for t_batch, t_sample_batched in enumerate(data_loader):
        t_inputs = [t_sample_batched[col].to(opt.device) for col in opt.inputs_cols]
        print("input: ", t_inputs)
        t_targets = t_sample_batched['polarity'].to(opt.device)
        print("targets: ", t_targets)
        t_outputs = model(t_inputs)
        print("outputs: ", t_outputs)
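# NOTE: the evaluation loop above only prints raw logits. A small, hypothetical
# post-processing step, meant to sit inside that loop, which maps the argmax of the logits
# onto readable polarity labels (the 0/1/2 ordering is an assumption, matching the
# labels=[0, 1, 2] passed to the macro-F1 computation elsewhere in this collection):
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}  # assumed label order
pred_ids = torch.argmax(t_outputs, dim=-1)
print("predicted:", [label_map[i] for i in pred_ids.tolist()])
n_correct += (pred_ids == t_targets).sum().item()
n_total += len(t_outputs)
print("accuracy so far: {:.4f}".format(n_correct / n_total))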
def __init__(self, opt): self.opt = opt out_file = './stat/{}_{}_domain{}_adv{}_aux{}_resplit{}_epoch{}'.format( self.opt.model_name, self.opt.dataset, self.opt.domain, str(self.opt.adv), str(self.opt.aux), str(self.opt.resplit), (self.opt.num_epoch)) print(out_file) if 'bert' in opt.model_name: # if opt.model_name == 'bert_kg': # tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) # bert = BertForTokenClassification.from_pretrained('ernie_base') # self.model = opt.model_class(bert, opt).to(opt.device) # self.model.to(opt.device) if opt.model_name == 'lcf_bert': from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=False) bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config) self.model = opt.model_class(bert, opt).to(opt.device) elif opt.model_name == 'bert': from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True) bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config) self.model = opt.model_class(bert, opt).to(opt.device) elif opt.model_name in ['bert_spc', 'td_bert']: from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True) bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config) self.model = opt.model_class(bert, opt).to(opt.device) # self.model.load_state_dict(torch.load('./state_dict/bert_multi_target_val_acc0.7714')) elif opt.model_name == 'bert_label': tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True) bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config) self.model = opt.model_class(bert, opt).to(opt.device) elif opt.model_name == 'bert_compete': tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True) bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config) num_added_tokens = tokenizer.add_tokens( ['[aspect_b]', '[aspect_e]']) bert.resize_token_embeddings(len(tokenizer.tokenizer)) self.model = opt.model_class(bert, opt).to(opt.device) else: from modeling_bert import BertModel, BertForTokenClassification, BertConfig # bert_mulit_target tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name) config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True) bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config) if opt.domain == 'pt': bert = BertModel.from_pretrained( './bert_models/pt_bert-base-uncased_amazon_yelp') if opt.domain == 'joint': bert = BertModel.from_pretrained( './bert_models/laptops_and_restaurants_2mio_ep15') if opt.domain == 'res': bert = BertModel.from_pretrained( './bert_models/restaurants_10mio_ep3') if opt.domain == 'laptop': bert = BertModel.from_pretrained( './bert_models/laptops_1mio_ep30') if opt.domain == 'ernie': bert = BertModel.from_pretrained( './bert_models/ERNIE_Base_en_stable-2.0.0_pytorch') # num_added_tokens = 
tokenizer.add_tokens(['[target_b]','[target_e]']) # num_added_tokens = tokenizer.add_tokens(['[aspect_b]','[aspect_e]']) for i in range(20): b = '[' + str(i) + 'b]' e = '[' + str(i) + 'e]' num_added_tokens = tokenizer.add_tokens([b, e]) bert.resize_token_embeddings(len(tokenizer.tokenizer)) self.model = opt.model_class(bert, opt).to(opt.device) # self.model.load_state_dict(torch.load('./state_dict/state_dict/bert_multi_target_restaurant_doamin-res_can0_adv0_aux1.0_val_acc0.8688')) else: tokenizer = build_tokenizer( fnames=[opt.dataset_file['train'], opt.dataset_file['test']], max_seq_len=opt.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(opt.dataset)) embedding_matrix = build_embedding_matrix( word2idx=tokenizer.word2idx, embed_dim=opt.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format( str(opt.embed_dim), opt.dataset)) self.model = opt.model_class(embedding_matrix, opt).to(opt.device) self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer, 'train', opt) self.testset = ABSADataset(opt.dataset_file['test'], tokenizer, 'test', opt) if int(opt.resplit) == 0: valset_ratio = 0.05 assert 0 <= opt.valset_ratio < 1 if opt.valset_ratio > 0: valset_len = int(len(self.trainset) * opt.valset_ratio) self.trainset, self.valset = random_split( self.trainset, (len(self.trainset) - valset_len, valset_len)) else: if int(self.opt.resplit) == 1 or int(self.opt.resplit) == 2: self.valset = ABSADataset('valid', tokenizer, 'valid', opt) else: self.valset = self.testset if opt.device.type == 'cuda': logger.info('cuda memory allocated: {}'.format( torch.cuda.memory_allocated(device=opt.device.index))) # if opt.load_mode == 1: # self.model.load_state_dict(torch.load('/home/nus/temp/ABSA-PyTorch/state_dict/bert_spc_twitter_val_acc0.7384')) # find the highese # model.load_state_dict(torch.load(PATH)) self._print_args()