def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        # freeze pretrained bert params
        # for param in bert.parameters():
        #     param.requires_grad = False
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    self.train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
    self.test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)

    if opt.device.type == 'cuda':
        print("cuda memory allocated:", torch.cuda.memory_allocated(device=opt.device.index))
    self._print_args()
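# --- Example (not from the original repo): partial freezing ---
# The commented-out loop above freezes *all* BERT parameters. A common middle
# ground is to freeze only the embeddings and the lower encoder layers; the
# parameter names below assume the HuggingFace BertModel layout.
def freeze_lower_bert_layers(bert, num_frozen_layers=8):
    for name, param in bert.named_parameters():
        if name.startswith('embeddings.'):
            param.requires_grad = False
        elif name.startswith('encoder.layer.') and int(name.split('.')[2]) < num_frozen_layers:
            param.requires_grad = False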
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        logger.info(f"Loaded model {opt.model_name} from {opt.state_dict_path}")
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        self.model.load_state_dict(torch.load(opt.state_dict_path))

    self.valset = ABSADataset(opt.dataset_file['val'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        # bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        config = BertConfig.from_pretrained(opt.pretrained_bert_name, output_attentions=True)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
        self.pretrained_bert_state_dict = bert.state_dict()
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
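# --- Example (sketch): consuming the attention maps enabled above ---
# With output_attentions=True the model also returns per-layer attention maps.
# The exact return structure varies across transformers versions; in the
# pytorch_transformers-era API the attentions come last in the output tuple.
outputs = bert(input_ids)   # `input_ids` is a hypothetical (batch, seq_len) LongTensor
attentions = outputs[-1]    # tuple with one (batch, heads, seq_len, seq_len) tensor per layer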
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
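# --- Example (self-contained check of the split arithmetic above) ---
# valset_ratio=0.1 on 1000 samples yields a 900/100 split; random_split
# shuffles the indices internally before partitioning.
import torch
from torch.utils.data import TensorDataset, random_split

dummy = TensorDataset(torch.arange(1000))
val_len = int(len(dummy) * 0.1)
train_part, val_part = random_split(dummy, (len(dummy) - val_len, val_len))
assert len(train_part) == 900 and len(val_part) == 100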
def __init__(self, opt, dtl_param):
    """
    A wrapper for running HAOFL-based models.

    :param opt: an object storing all hyper-parameters
    :param dtl_param: a string giving the parameter of the data transformation
        method used in the DTL layer
    """
    self.opt = opt
    if 'bert' in self.opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, 'bert-base-uncased')
        self.model = opt.model_class(opt, tokenizer).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='normal_tokenizer.dat')
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_embedding_matrix.dat'.format(str(opt.embed_dim)))
        self.model = opt.model_class(opt, tokenizer, embedding_matrix).to(opt.device)

    self.train_set = TrainDataset(opt.dataset_file['train'], tokenizer, opt, opt.dtl_method, dtl_param, opt.name_tail)
    self.val_set = TrainDataset(opt.dataset_file['test'], tokenizer, opt, opt.dtl_method, dtl_param, opt.name_tail)

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    # note: `bert` is only defined when opt.bert_path is set
    if self.opt.bert_path:
        bert_path = self.opt.bert_path.replace("\r", "").replace("\n", "")
        bert = BertModel.from_pretrained(bert_path)
    self.model = opt.model_class(bert, opt).to(opt.device)

    self.trainset = PreTrainDataset(opt.dataset, tokenizer, train_or_test='train')
    np.random.shuffle(self.trainset.data)
    self.trainset.data = self.trainset.data[:self.opt.few_shot_num]
    self.testset = PreTrainDataset(opt.dataset, tokenizer, train_or_test='test')
    # np.random.shuffle(self.testset.data)
    if self.opt.cross_val_fold < 0:
        self.valset = PreTrainDataset(opt.dataset, tokenizer, train_or_test='val')
    if self.opt.cross_val_fold == 0:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def aspectSentiment_api():
    data = request.json
    opt = get_parameters()
    opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    bert = BertModel.from_pretrained(opt.pretrained_bert_name)
    model = AEN_BERT(bert, opt).to(opt.device)
    print('loading model {0} ...'.format(opt.model_name))
    model.load_state_dict(torch.load('aen_bert_restaurant_val_acc0.8098', map_location=opt.device))
    model.eval()
    torch.autograd.set_grad_enabled(False)

    sentiment_d = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
    out = []
    for entity, sentences in data.items():
        for sentence in sentences:
            sentence_d = {'aspect': '', 'sentiment': '', 'sentence': ''}
            left = sentence['left']
            aspect = sentence['aspect']
            right = sentence['right']
            sentence = left + aspect + right
            text_bert_indices, bert_segments_ids, text_raw_bert_indices, aspect_bert_indices = \
                prepare_data(left, aspect, right, tokenizer)
            text_bert_indices = torch.tensor([text_bert_indices], dtype=torch.int64).to(opt.device)
            bert_segments_ids = torch.tensor([bert_segments_ids], dtype=torch.int64).to(opt.device)
            text_raw_bert_indices = torch.tensor([text_raw_bert_indices], dtype=torch.int64).to(opt.device)
            aspect_bert_indices = torch.tensor([aspect_bert_indices], dtype=torch.int64).to(opt.device)
            inputs = [text_raw_bert_indices, aspect_bert_indices]
            outputs = model(inputs)
            t_probs = F.softmax(outputs, dim=-1).cpu().numpy()
            aspect_sentiment_n = t_probs.argmax(axis=-1) - 1
            print(aspect_sentiment_n)
            aspect_sentiment = sentiment_d[aspect_sentiment_n[0]]
            sentence_d['aspect'] = aspect
            sentence_d['sentiment'] = aspect_sentiment
            sentence_d['sentence'] = sentence
            out.append(sentence_d)

    dic = absa_chapter_combined_s(out)
    absaChapterCombinedS = absa_chapter_to_react(dic)
    returnJson = {'sentimentTableData': absaChapterCombinedS, 'absaChapter': dic}
    return returnJson
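# --- Example (hypothetical): the request payload this route expects ---
# Field names follow the access pattern above: data.items() yields
# entity -> [ {left, aspect, right}, ... ].
payload = {
    "restaurant_1": [
        {"left": "The ", "aspect": "service", "right": " was painfully slow."}
    ]
}
# POSTing this JSON yields, per sentence, an entry shaped like:
# {"aspect": "service", "sentiment": "Negative",
#  "sentence": "The service was painfully slow."}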
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained('/home/yinrongdi/bert/bert-base-uncased.tar.gz')
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='temp_data/' + '{0}_tokenizer.dat'.format(opt.dataset),
            step=4 if opt.tabsa else 3)
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='temp_data/' + '{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    if opt.classifier:
        if opt.classifier_with_absa_target:
            self.classifierset = TABSADataset(opt.dataset_file['classifier_absa_target'], tokenizer, False)
        elif opt.classifier_with_absa:
            self.classifierset = TABSADataset(opt.dataset_file['classifier'], tokenizer, True)
        else:
            self.classifierset = TABSADataset(opt.dataset_file['classifier'], tokenizer, False)

    if opt.tabsa:
        if opt.tabsa_with_absa:
            if opt.gating:
                self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, True, True)
                self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, True, True)
            else:
                self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, True)
                self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, True)
        else:
            if opt.gating:
                self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, False, True)
                # note: the original passed (tokenizer, True) for the test set here,
                # which looks inconsistent with the (False, True) flags of the train set
                self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, True)
            else:
                self.trainset = TABSADataset(opt.dataset_file['train'], tokenizer, False)
                self.testset = TABSADataset(opt.dataset_file['test'], tokenizer, False)
    else:
        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)

    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    # `bert_path` was undefined in the original; assuming the pretrained name from opt
    bert = BertModel.from_pretrained(opt.pretrained_bert_name)
    self.model = opt.model_class(bert, opt).to(opt.device)
    self.testset = PreTrainDataset(opt.dataset, tokenizer, train_or_test='test')
    if opt.device.type == 'cuda':
        print('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
def __init__(self, test_query, test_reply):
    self.tokenizer = Tokenizer4Bert(opt.max_length, opt.pretrained_bert_name)
    bert_model = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True)
    self.model = opt.model_class(bert_model, opt).to(opt.device)

    # test set
    df_test_query = pd.read_csv(test_query, sep='\t', header=None, encoding='utf-8', engine='python')
    df_test_query.columns = ['id', 'q1']
    df_test_reply = pd.read_csv(test_reply, sep='\t', header=None, encoding='utf-8', engine='python')
    df_test_reply.columns = ['id', 'id_sub', 'q2']
    df_test_reply['q2'] = df_test_reply['q2'].fillna('好的')  # fill missing replies with '好的' ("OK")
    df_test_data = df_test_query.merge(df_test_reply, how='left')

    if opt.add_pseudo_data:
        self.pseudo_groups = df_test_data.loc[:, 'id'].to_numpy()
        self.pseudo_index = np.array(df_test_data.index)
        self.pseudo_data = copy.deepcopy(df_test_data)
        self.submit = copy.deepcopy(df_test_reply)
        # self.pseudo = copy.deepcopy(df_test_data)

    testset = BertSentenceDataset(df_test_data, self.tokenizer, test=True)
    if opt.dialogue:
        self.test_dataloader = DataLoader(dataset=testset, batch_size=opt.eval_batch_size,
                                          shuffle=False, collate_fn=collate_wrapper)
    else:
        self.test_dataloader = DataLoader(dataset=testset, batch_size=opt.eval_batch_size, shuffle=False)

    if opt.datareverse:
        df_test_data_reverse = copy.deepcopy(df_test_data[['id', 'q2', 'id_sub', 'q1']])
        testset_reverse = BertSentenceDataset(df_test_data_reverse, self.tokenizer, test=True)
        self.test_dataloader_reverse = DataLoader(dataset=testset_reverse,
                                                  batch_size=opt.eval_batch_size, shuffle=False)

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(opt.device.index)))
    self._print_args()
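# --- Example (sketch): what a collate_fn such as `collate_wrapper` typically does ---
# The original collate_wrapper is not shown; a minimal stand-in that pads
# variable-length token-id lists into a single LongTensor batch might be:
import torch

def pad_collate(batch):
    seqs = [torch.as_tensor(item, dtype=torch.long) for item in batch]
    max_len = max(s.size(0) for s in seqs)
    padded = torch.zeros(len(seqs), max_len, dtype=torch.long)
    for i, s in enumerate(seqs):
        padded[i, :s.size(0)] = s
    return padded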
def __init__(self, opt, model_classes):
    self.opt = opt
    if 'bert' in opt.model_name:
        self.tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    else:
        self.tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))

    self.trainset = ABSADataset(opt.dataset_file['train'], self.tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], self.tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))

    if 'bert' in opt.model_name:
        print('-------- loading BERT module --------')
        from pytorch_transformers import BertModel
        self.bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                              output_attentions=True,
                                              cache_dir="pretrained/bert/")
        # old version: bert = BertModel.from_pretrained(opt.pretrained_bert_name, cache_dir="pretrained/bert/")
        self.model = opt.model_class(self.bert, opt).to(opt.device)
        print('-------- BERT module loaded --------')
    else:
        self.model = model_classes[opt.model_name](embedding_matrix, opt).to(opt.device)
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    self.tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    bert = BertModel.from_pretrained(opt.pretrained_bert_name)
    self.model = opt.model_class(bert, opt).to(opt.device)
    self.model.load_state_dict(torch.load(opt.state_dict_path, map_location='cpu'))
    self.model = self.model.to(opt.device)
    # switch model to evaluation mode
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
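# --- Example: a scoped alternative to the global set_grad_enabled(False) above ---
# torch.no_grad() turns gradient bookkeeping off only inside the block,
# which is safer when the same process later needs to train.
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
net.eval()                          # disable dropout / batch-norm updates
with torch.no_grad():               # gradients re-enabled after the block
    out = net(torch.randn(1, 4))
assert not out.requires_grad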
def __init__(self, opt):
    self.opt = opt
    if 'aen_simple' == opt.model_name:
        if 'bert' == opt.bert_type:
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name)
            self.model = opt.model_class(bert, opt).to(opt.device)
        elif 'roberta' == opt.bert_type:
            tokenizer = Tokenizer4RoBerta(opt.max_seq_len, opt.pretrained_bert_name)
            roberta = RobertaModel.from_pretrained(opt.pretrained_bert_name)
            self.model = opt.model_class(roberta, opt).to(opt.device)
    elif 'roberta' in opt.model_name:
        tokenizer = Tokenizer4RoBerta(opt.max_seq_len, opt.pretrained_bert_name)
        roberta = RobertaModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(roberta, opt).to(opt.device)
    elif 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)

    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    """
    Initialize the model, preprocess the data, and tokenize it.

    :param opt: argparse arguments
    """
    self.opt = opt
    # BERT-style models are initialized from a pretrained checkpoint;
    # non-BERT models use GloVe embeddings
    if 'bert' in opt.model_name:
        # initialize the tokenizer
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name,
                                   cache_dir=opt.pretrained_bert_cache_dir)
        # load the BERT model
        bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                         cache_dir=opt.pretrained_bert_cache_dir)
        # pass the BERT model and opt into the custom model for further processing
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        # custom tokenizer; builds id2word and word2idx
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        # embedding matrix over the whole vocabulary: [word_nums, embedding_dimension]
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        # load the model
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    # load the training and test sets
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer, recreate_caches=opt.recreate_caches)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer, recreate_caches=opt.recreate_caches)

    # if valset_ratio is 0, the test set doubles as the validation set
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    # check cuda memory
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    bert = BertModel.from_pretrained(opt.pretrained_bert_name)
    self.model = opt.model_class(bert, opt).to(opt.device)
    self.trainset = IOBDataset(opt.dataset_file['train'], tokenizer)
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, arguments):
    # project hyper-parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
    self.args = parser.parse_args()

    self.arguments = arguments
    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

    if 'bert' in self.arguments.model_name:
        self.tokenizer = Tokenizer4Bert(
            max_seq_len=self.arguments.max_seq_len,
            pretrained_bert_name=os.path.join(os.getcwd(), self.arguments.pretrained_bert_name))
        bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
        self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
    else:
        self.tokenizer = Util.bulid_tokenizer(
            fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
            max_seq_len=self.arguments.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
        embedding_matrix = Util.build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=self.arguments.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset))
        self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

    if self.arguments.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=self.arguments.device.index)))
    Util.print_args(model=self.model, logger=logger, args=self.arguments)
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        self.tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)

    print('loading model {0} ...'.format(opt.model_name))
    # map_location='cpu' was dropped for GPU servers; re-add it when loading on CPU
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    # switch model to evaluation mode
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
def __init__(self):
    opt = module_opt()
    bert = BertModel.from_pretrained('bert-base-uncased')
    self.tokenizer = Tokenizer4Bert(80, 'bert-base-uncased')
    model = LCF_BERT(bert, opt).to(opt.device)
    print('loading sa module ...')
    model.load_state_dict(torch.load('state_dict/lcf_bert_movie_val_acc0.8203',
                                     map_location=torch.device('cpu')))
    model.eval()
    torch.autograd.set_grad_enabled(False)
    self.model = model
    self.opt = opt
def initialize():
    opt = get_parameters()
    opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    bert = BertModel.from_pretrained(opt.pretrained_bert_name)
    model = model_classes[opt.model_name](bert, opt).to(opt.device)
    print('loading model {0} ...'.format(opt.model_name))
    model.load_state_dict(torch.load(state_dict_paths[opt.model_name]))
    model.eval()
    torch.autograd.set_grad_enabled(False)  # the original called this twice; once suffices
    return opt, tokenizer, model
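# --- Sketch (hypothetical values): the lookup tables initialize() relies on ---
# `model_classes` and `state_dict_paths` must be defined at module level;
# the class names follow the ABSA-PyTorch repo, and only the two checkpoint
# names that appear elsewhere in these snippets are filled in.
model_classes = {
    'bert_spc': BERT_SPC,
    'aen_bert': AEN_BERT,
    'lcf_bert': LCF_BERT,
}
state_dict_paths = {
    'aen_bert': 'aen_bert_restaurant_val_acc0.8098',
    'lcf_bert': 'state_dict/lcf_bert_movie_val_acc0.8203',
}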
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        self.tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = BERT_SSC(bert, opt).to(opt.device)
        # map_location='cpu' was dropped for GPU servers; re-add it when loading on CPU
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        logger.info('loading model {0} ... done'.format(opt.model_name))
        # switch model to evaluation mode
        self.model.eval()
        torch.autograd.set_grad_enabled(False)
    else:
        logger.info('Only BERT-based models are supported for now')
        raise ValueError('Only BERT-based models are supported for now')
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        # BERT vocabulary file
        tokenizer = Tokenizer4Bert(opt.max_seq_len, '/data/kkzhang/aaa/command/bert-base-uncased-vocab.txt')
        # tokenizer = Tokenizer4Bert(opt.max_seq_len, '/home/kkzhang/bert-large-uncased/bert-large-uncased-vocab.txt')
        # BERT pretrained model
        bert = BertModel.from_pretrained('/data/kkzhang/WordeEmbedding/bert_base/')
        # multi-GPU support
        if torch.cuda.device_count() > 1:
            logging.info('The device has {} gpus'.format(torch.cuda.device_count()))
            bert = nn.DataParallel(bert)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
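# --- Example (self-contained): how the nn.DataParallel wrapping above behaves ---
# DataParallel replicates the module across visible GPUs and splits the batch
# along dim 0; with a single GPU it degenerates to a plain forward pass.
import torch
import torch.nn as nn

net = nn.Linear(8, 2)
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    net = nn.DataParallel(net).cuda()
    out = net(torch.randn(16, 8).cuda())  # the 16-sample batch is sharded across GPUs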
def __init__(self, arguments):
    # project hyper-parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
    parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
    self.args = parser.parse_args()

    self.arguments = arguments
    self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

    if 'bert' in self.arguments.model_name:
        self.tokenizer = Tokenizer4Bert(
            max_seq_len=self.arguments.max_seq_len,
            pretrained_bert_name=os.path.join(os.getcwd(), self.arguments.pretrained_bert_name))
        bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
        self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
    else:
        # 'bulid_tokenizer' (sic) is the helper's name as spelled in its module
        self.tokenizer = Util.bulid_tokenizer(
            fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
            max_seq_len=self.arguments.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset))
        embedding_matrix = Util.build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=self.arguments.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset))
        self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

    if self.arguments.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=self.arguments.device.index)))
    Util.print_args(model=self.model, logger=logger, args=self.arguments)

    target_text, stance, _, _ = self.dataset.get_all_data()
    target = np.asarray([i['TARGET'].lower() for i in target_text])
    text = np.asarray([i['TEXT'].lower() for i in target_text])
    stance = np.asarray([i['STANCE'] for i in stance])

    self.target_set = set(target)

    text = PreProcessing(text).get_file_text()
    trainset = ABSADataset(data_type=None, fname=(target, text, stance), tokenizer=self.tokenizer)
    valset_len = int(len(trainset) * self.arguments.valset_ratio)
    self.trainset, self.valset = random_split(trainset, (len(trainset) - valset_len, valset_len))
def __init__(self, opt):
    self.opt = opt
    print("loading {0} tokenizer...".format(opt.dataset))
    self.bert_tokenizer = Tokenizer4Bert('bert-base-chinese')

    self.model_list = []
    for i, model_name in enumerate(opt.model_name_list):
        print('loading model {0}...'.format(model_name))
        bert = BertModel.from_pretrained('bert-base-chinese')
        model = nn.DataParallel(opt.model_class_list[i](bert, opt).to(opt.device))
        model.load_state_dict(torch.load(opt.state_dict_path_list[i]))
        # switch model to evaluation mode
        model.eval()
        self.model_list.append(model)
    torch.autograd.set_grad_enabled(False)
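# --- Sketch: reconciling nn.DataParallel's 'module.' state_dict prefix ---
# Loading into a DataParallel wrapper (as above) only works when the checkpoint
# was saved from a wrapped model; to load a multi-GPU checkpoint into a plain
# module instead, strip the prefix (`path` and `model` are placeholders):
state = torch.load(path, map_location='cpu')
state = {k[len('module.'):] if k.startswith('module.') else k: v
         for k, v in state.items()}
model.load_state_dict(state)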
def __init__(self, data):
    self.net = None
    self.data = data
    self.args = args
    self.idx2label = dict((i, args.labels[i]) for i in range(len(args.labels)))
    self.tokenizer = Tokenizer4Bert(
        max_seq_len=self.args.max_seq_len,
        pretrained_bert_name=os.path.join(os.getcwd(), self.args.pretrained_bert_name))
    bert = BertModel.from_pretrained(os.path.join(os.getcwd(), self.args.pretrained_bert_name))
    model = self.args.model_classes[args.model_name](bert, self.args).to(self.args.device)

    if self.args.topics is not None:
        # one fine-tuned net per topic (the original unrolled this into net_0 ... net_4)
        for i, topic in enumerate(self.args.topics):
            net = Util.load_model(model=model,
                                  output_dir=os.path.join(os.getcwd(), args.best_model_path, topic))
            net.eval()
            setattr(self, 'net_{}'.format(i), net)
    else:
        self.net = Util.load_model(model=model,
                                   output_dir=os.path.join(os.getcwd(), args.best_model_path))
def __init__(self, opt):
    self.opt = opt
    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    bert = BertModel.from_pretrained(opt.pretrained_bert_name)
    self.model = opt.model_class(bert, opt).to(opt.device)

    self.trainset = ABSADataset('./data/Train_Data.csv', tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len))
    # note: unlike the other variants, no validation set is assigned when valset_ratio == 0

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def get_model(models):
    opt_list = []
    pred_list = []
    for model in models:
        opt = main(model)
        opt_list.append(opt)

    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    bert = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True)
    testset = ABSADataset(opt.dataset_file['test'], tokenizer)

    for opt in opt_list:
        if opt.model_name == 'bert_spc' or opt.model_name == 'lcf_bert':
            bert1 = BertModel.from_pretrained(opt.pretrained_bert_name)
            pred = Predictor(opt, tokenizer, bert1, testset)
        else:
            pred = Predictor(opt, tokenizer, bert, testset)
        predictions = pred.save_predictions()
        pred_list.append(predictions)
    return pred_list
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True)
        # tokenizer = Tokenizer4Bert(opt.max_seq_len, '/content/drive/My Drive/FYP/pretrained_BERT_further_trained_with_criminal_corpus/vocab.txt')
        # bert = BertModel.from_pretrained('/content/drive/My Drive/FYP/pretrained_BERT_further_trained_with_criminal_corpus')
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    self.trainset = ABSADataset(opt.dataset_file['train'], './datasets/semeval14/law_train.raw.graph', tokenizer)
    # note: the test set also reads the *train* graph file in the original
    self.testset = ABSADataset(opt.dataset_file['test'], './datasets/semeval14/law_train.raw.graph', tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    tokenizer = Tokenizer4Bert(opt.max_length, opt.pretrained_bert_name)
    bert_model = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True)
    self.pretrained_bert_state_dict = bert_model.state_dict()
    self.model = opt.model_class(bert_model, opt).to(opt.device)

    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    self.model = self.model.to(opt.device)
    torch.autograd.set_grad_enabled(False)

    testset = BertSentenceDataset(opt.dataset_file['test'], tokenizer,
                                  target_dim=self.opt.polarities_dim, opt=opt)
    self.test_dataloader = DataLoader(dataset=testset, batch_size=opt.eval_batch_size, shuffle=False)
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        self.tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        self.tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt)

    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    self.model = self.model.to(opt.device)
    # switch model to evaluation mode
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
def get_model(models):
    opt_list = []
    models_list = []
    for model in models:
        opt = main(model)
        opt_list.append(opt)

    tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
    bert = BertModel.from_pretrained(opt.pretrained_bert_name, output_hidden_states=True)

    for opt in opt_list:
        if opt.model_name == 'bert_spc' or opt.model_name == 'lcf_bert':
            bert1 = BertModel.from_pretrained(opt.pretrained_bert_name)
            pred = Preloader(opt, tokenizer, bert1)
        else:
            pred = Preloader(opt, tokenizer, bert)
        models_list.append(pred.get_model())
    return models_list, opt_list, tokenizer