def __init__(self, opt):
    self.opt = opt
    if opt.dataset_file['val'] is None:
        fnames = [opt.dataset_file['train'], opt.dataset_file['test']]
    else:
        fnames = [
            opt.dataset_file['train'], opt.dataset_file['val'],
            opt.dataset_file['test']
        ]
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(
            str(opt.embed_dim), opt.dataset))
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = Dataset(opt.dataset_file['train'], tokenizer,
                            dat_fname='{0}_train.dat'.format(opt.dataset))
    # self.weight_classes = torch.tensor(
    #     compute_class_weight('balanced',
    #                          np.unique([i['polarity'] for i in self.trainset.data]),
    #                          self.trainset[4]),
    #     dtype=torch.float).to(self.opt.device)
    # self.valset = ABSADataset(opt.dataset_file['val'], tokenizer)
    self.testset = Dataset(opt.dataset_file['test'], tokenizer,
                           dat_fname='{0}_test.dat'.format(opt.dataset))
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = Dataset(opt.dataset_file['val'], tokenizer,
                              dat_fname='{0}_val.dat'.format(opt.dataset))
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
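Nearly every constructor in this collection carves its validation set out of the training set with torch.utils.data.random_split. A minimal, self-contained sketch of that pattern (TensorDataset is a stand-in for the repos' dataset classes, and the 10% ratio is illustrative):

import torch
from torch.utils.data import TensorDataset, random_split

full_train = TensorDataset(torch.arange(100, dtype=torch.float).unsqueeze(1))
valset_ratio = 0.1
valset_len = int(len(full_train) * valset_ratio)
# random_split returns two Subset views backed by the original dataset
trainset, valset = random_split(full_train,
                                (len(full_train) - valset_len, valset_len))
print(len(trainset), len(valset))  # 90 10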
def __init__(self, opt):
    super(Instructor, self).__init__()
    self.opt = opt
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_seq_len=opt.max_seq_len,
        ngram=opt.ngram,
        min_count=opt.min_count,
        dat_fname='./bin/{0}_tokenizer.dat'.format(opt.dataset))
    opt.vocab_size = len(tokenizer.token2idx) + 2
    opt.ngram_vocab_sizes = [
        len(tokenizer.ngram2idx[n]) + 2 for n in range(2, opt.ngram + 1)
    ]
    if opt.embed_file is not None and os.path.exists(opt.embed_file):
        embedding_matrix = build_embedding_matrix(
            token2idx=tokenizer.token2idx,
            embed_dim=opt.embed_dim,
            embed_file=opt.embed_file,
            dat_fname='./bin/{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
    else:
        embedding_matrix = None
    self.model = opt.model_class(opt, embedding_matrix).to(opt.device)
    self.train_set = TCDataset(opt.dataset_file['train'], opt.label_mapping,
                               opt.ngram, tokenizer)
    self.test_set = TCDataset(opt.dataset_file['test'], opt.label_mapping,
                              opt.ngram, tokenizer)
    assert 0 <= opt.val_set_ratio < 1
    if opt.val_set_ratio > 0:
        val_set_len = int(len(self.train_set) * opt.val_set_ratio)
        self.train_set, self.val_set = random_split(
            self.train_set, (len(self.train_set) - val_set_len, val_set_len))
    else:
        self.val_set = self.test_set
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    self.tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_length=opt.max_length,
        data_file='./embedding/{0}_{1}_tokenizer.dat'.format(
            opt.model_name, opt.dataset),
    )
    embedding_matrix = build_embedding_matrix(
        vocab=self.tokenizer.vocab,
        embed_dim=opt.embed_dim,
        data_file='./embedding/{0}_{1}d_{2}_embedding_matrix.dat'.format(
            opt.model_name, str(opt.embed_dim), opt.dataset))
    self.model = opt.model_class(embedding_matrix, opt)
    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    self.model = self.model.to(opt.device)
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                         output_hidden_states=True)
        # tokenizer = Tokenizer4Bert(opt.max_seq_len, '/content/drive/My Drive/FYP/pretrained_BERT_further_trained_with_criminal_corpus/vocab.txt')
        # bert = BertModel.from_pretrained('/content/drive/My Drive/FYP/pretrained_BERT_further_trained_with_criminal_corpus')
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'],
                                './datasets/semeval14/law_train.raw.graph',
                                tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'],
                               './datasets/semeval14/law_train.raw.graph',
                               tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if opt.dataset in ['twitter', 'restaurant', 'laptop']:
        # both branches return torch Dataset instances
        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    else:
        self.trainset = CovData(opt.dataset_file['train'], tokenizer)
        self.testset = CovData(opt.dataset_file['test'], tokenizer)
    # split the training set according to the configured validation ratio
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        # valset_len = 150
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
        # self.valset = self.trainset[:150]
        # self.trainset = self.trainset[150:]
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        # freeze pretrained bert params
        # for param in bert.parameters():
        #     param.requires_grad = False
        self.model = opt.model_class(bert, opt)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    self.train_data_loader = DataLoader(dataset=trainset,
                                        batch_size=opt.batch_size,
                                        shuffle=True)
    self.test_data_loader = DataLoader(dataset=testset,
                                       batch_size=opt.batch_size,
                                       shuffle=False)
    if opt.device.type == 'cuda':
        self.model = nn.DataParallel(self.model).cuda()
        print("cuda memory allocated:",
              torch.cuda.memory_allocated(device=opt.device.index))
    else:
        self.model = self.model.to(opt.device)
    self._print_args()
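This variant replicates the model across GPUs with nn.DataParallel when CUDA is available and falls back to a plain .to() otherwise. A small runnable sketch of that device-dispatch pattern (nn.Linear stands in for the real model):

import torch
import torch.nn as nn

model = nn.Linear(8, 3)  # stand-in for opt.model_class(...)
if torch.cuda.is_available():
    model = nn.DataParallel(model).cuda()  # replicate across all visible GPUs
else:
    model = model.to(torch.device('cpu'))
print(type(model).__name__)  # DataParallel on GPU machines, Linear otherwise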
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        self.tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                        opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        self.tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt)
    print('loading model {0} ...'.format(opt.model_name))
    self.model.load_state_dict(torch.load(opt.state_dict_path))
    self.model = self.model.to(opt.device)
    # switch model to evaluation mode
    self.model.eval()
    torch.autograd.set_grad_enabled(False)
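The inference-only constructors above finish with torch.autograd.set_grad_enabled(False), which switches autograd off process-wide rather than per-block. A standalone illustration (the tensors are invented for demonstration):

import torch

torch.autograd.set_grad_enabled(False)  # disable autograd globally
w = torch.randn(3, 1, requires_grad=True)
y = torch.randn(2, 3) @ w  # this forward pass records no graph
print(y.requires_grad)  # False: equivalent to running under torch.no_grad()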
def __init__(self, opt):
    self.opt = opt
    absa_data_reader = ABSADataReader(data_dir=opt.data_dir)
    self.tokenizer = build_tokenizer(data_dir=opt.data_dir)
    embedding_matrix = build_embedding_matrix(opt.data_dir,
                                              self.tokenizer.word2idx,
                                              opt.embed_dim, opt.dataset)
    self.idx2tag = absa_data_reader.reverse_tag_map
    self.idx2polarity = absa_data_reader.reverse_polarity_map
    self.model = opt.model_class(embedding_matrix, opt, self.idx2tag,
                                 self.idx2polarity).to(opt.device)
    print('loading model {0} ...'.format(opt.model_name))
    # self.model.load_state_dict(torch.load(opt.state_dict_path, map_location=lambda storage, loc: storage))
    # switch model to evaluation mode
    self.model.eval()
    # get a handle on s3
    session = boto3.Session(aws_access_key_id='XXXXXXXXXXXX',
                            aws_secret_access_key='XXXXXXXX',
                            region_name='XXXXXXXX')
    self.s3 = session.resource('s3')
    self.bucket = self.s3.Bucket('surveybuddy-responses')
    # example: energy_market_procesing
    torch.autograd.set_grad_enabled(False)
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                   opt.pretrained_bert_name + '/vocab.txt')
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            fname=opt.embed_fname,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.train_dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0 and (not opt.val_test):
        print('Splitting trainset in train and val')
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        print('Setting testset as valset through valsetratio = 0')
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    # prepare inputs
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(
            str(opt.embed_dim), opt.dataset))
    if opt.dan:
        boc = build_boc(' ', dat_fname='bag_of_concepts.dat')
        affective_matrix = build_embedding_matrix(
            word2idx=boc.word2idx,
            embed_dim=100,
            dat_fname='100_concept_embeddings.dat')
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    else:
        boc = None
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    trainset = ABSADataset(opt.dataset_file['train'], tokenizer, boc)
    testset = ABSADataset(opt.dataset_file['test'], tokenizer, boc)
    self.train_data_loader = DataLoader(dataset=trainset,
                                        batch_size=opt.batch_size,
                                        shuffle=True)
    self.test_data_loader = DataLoader(dataset=testset,
                                       batch_size=opt.batch_size,
                                       shuffle=False)
    if opt.device.type == 'cuda':
        print("cuda memory allocated:",
              torch.cuda.memory_allocated(device=opt.device.index))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        # opt.learning_rate = 2e-5
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        # opt.learning_rate = 0.001
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    self.train_data_loader = DataLoader(dataset=trainset,
                                        batch_size=opt.batch_size,
                                        shuffle=True)
    self.test_data_loader = DataLoader(dataset=testset,
                                       batch_size=opt.batch_size,
                                       shuffle=False)
    if opt.device.type == 'cuda':
        logging.info("cuda memory allocated:{}".format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._log_write_args()
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()

def _print_args(self):
    n_trainable_params, n_nontrainable_params = 0, 0
    for p in self.model.parameters():
        n_params = torch.prod(torch.tensor(p.shape))
        if p.requires_grad:
            n_trainable_params += n_params
        else:
            n_nontrainable_params += n_params
    logger.info('n_trainable_params: {0}, n_nontrainable_params: {1}'.format(
        n_trainable_params, n_nontrainable_params))
    logger.info('> training arguments:')
    for arg in vars(self.opt):
        logger.info('>>> {0}: {1}'.format(arg, getattr(self.opt, arg)))

def _reset_params(self):
    for child in self.model.children():
        if type(child) != BertModel:  # skip bert params
            for p in child.parameters():
                if p.requires_grad:
                    if len(p.shape) > 1:
                        self.opt.initializer(p)
                    else:
                        stdv = 1. / math.sqrt(p.shape[0])
                        torch.nn.init.uniform_(p, a=-stdv, b=stdv)

def _train(self, criterion, optimizer, train_data_loader, val_data_loader):
    max_val_acc = 0
    max_val_f1 = 0
    global_step = 0
    path = None
    for epoch in range(self.opt.num_epoch):
        logger.info('>' * 100)
        logger.info('epoch: {}'.format(epoch))
        n_correct, n_total, loss_total = 0, 0, 0
        # switch model to training mode
        self.model.train()
        for i_batch, sample_batched in enumerate(train_data_loader):
            global_step += 1
            # clear gradient accumulators
            optimizer.zero_grad()
            inputs = [
                sample_batched[col].to(self.opt.device)
                for col in self.opt.inputs_cols
            ]
            outputs = self.model(inputs)
            targets = sample_batched['polarity'].to(self.opt.device)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
            n_total += len(outputs)
            loss_total += loss.item() * len(outputs)
            if global_step % self.opt.log_step == 0:
                train_acc = n_correct / n_total
                train_loss = loss_total / n_total
                logger.info('loss: {:.4f}, acc: {:.4f}'.format(
                    train_loss, train_acc))
        val_acc, val_f1 = self._evaluate_acc_f1(val_data_loader)
        logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(
            val_acc, val_f1))
        if val_acc > max_val_acc:
            max_val_acc = val_acc
            if not os.path.exists('state_dict'):
                os.mkdir('state_dict')
            path = 'state_dict/{0}_{1}_val_acc{2}'.format(
                self.opt.model_name, self.opt.dataset, round(val_acc, 4))
            torch.save(self.model.state_dict(), path)
            logger.info('>> saved: {}'.format(path))
        if val_f1 > max_val_f1:
            max_val_f1 = val_f1
    return path

def _evaluate_acc_f1(self, data_loader):
    n_correct, n_total = 0, 0
    t_targets_all, t_outputs_all = None, None
    # switch model to evaluation mode
    self.model.eval()
    with torch.no_grad():
        for t_batch, t_sample_batched in enumerate(data_loader):
            t_inputs = [
                t_sample_batched[col].to(self.opt.device)
                for col in self.opt.inputs_cols
            ]
            t_targets = t_sample_batched['polarity'].to(self.opt.device)
            t_outputs = self.model(t_inputs)
            n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item()
            n_total += len(t_outputs)
            if t_targets_all is None:
                t_targets_all = t_targets
                t_outputs_all = t_outputs
            else:
                t_targets_all = torch.cat((t_targets_all, t_targets), dim=0)
                t_outputs_all = torch.cat((t_outputs_all, t_outputs), dim=0)
    acc = n_correct / n_total
    f1 = metrics.f1_score(t_targets_all.cpu(),
                          torch.argmax(t_outputs_all, -1).cpu(),
                          labels=[0, 1, 2],
                          average='macro')
    return acc, f1

def run(self):
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    _params = filter(lambda p: p.requires_grad, self.model.parameters())
    optimizer = self.opt.optimizer(_params,
                                   lr=self.opt.learning_rate,
                                   weight_decay=self.opt.l2reg)
    train_data_loader = DataLoader(dataset=self.trainset,
                                   batch_size=self.opt.batch_size,
                                   shuffle=True)
    test_data_loader = DataLoader(dataset=self.testset,
                                  batch_size=self.opt.batch_size,
                                  shuffle=False)
    val_data_loader = DataLoader(dataset=self.valset,
                                 batch_size=self.opt.batch_size,
                                 shuffle=False)
    self._reset_params()
    best_model_path = self._train(criterion, optimizer, train_data_loader,
                                  val_data_loader)
    self.model.load_state_dict(torch.load(best_model_path))
    self.model.eval()
    test_acc, test_f1 = self._evaluate_acc_f1(test_data_loader)
    logger.info('>> test_acc: {:.4f}, test_f1: {:.4f}'.format(
        test_acc, test_f1))
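_train above checkpoints the weights whenever validation accuracy improves, and run reloads the best checkpoint before the final test pass. A self-contained sketch of that save-best/reload pattern (nn.Linear and the hard-coded scores stand in for the real model and validation loop):

import os
import torch
import torch.nn as nn

model = nn.Linear(4, 3)  # stand-in for opt.model_class(...)
os.makedirs('state_dict', exist_ok=True)
path, max_val_acc = None, 0.0
for val_acc in (0.61, 0.74, 0.70):  # stand-in per-epoch validation accuracies
    if val_acc > max_val_acc:
        max_val_acc = val_acc
        path = 'state_dict/demo_val_acc{}'.format(round(val_acc, 4))
        torch.save(model.state_dict(), path)  # keep only the best weights
model.load_state_dict(torch.load(path))  # reload the best before testing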
def run(self):
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    _params = filter(lambda p: p.requires_grad, self.model.parameters())
    optimizer = self.opt.optimizer(_params,
                                   lr=self.opt.learning_rate,
                                   weight_decay=self.opt.l2reg)
    train_data_loader = DataLoader(dataset=self.trainset,
                                   batch_size=self.opt.batch_size,
                                   shuffle=True)
    test_data_loader = DataLoader(dataset=self.testset,
                                  batch_size=self.opt.batch_size,
                                  shuffle=False)
    val_data_loader = DataLoader(dataset=self.valset,
                                 batch_size=self.opt.batch_size,
                                 shuffle=False)
    self._reset_params()
    # best_model_path = self._train(criterion, optimizer, train_data_loader, val_data_loader)
    # self.model.load_state_dict(torch.load(best_model_path))
    best_epoch = self._train(criterion, optimizer, train_data_loader,
                             val_data_loader)
    logger.info(f'>> Optimal no. of epochs: {best_epoch + 1}')
    # Re-initialize the model and train with the full train_set
    opt = self.opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                   opt.pretrained_bert_name + '/vocab.txt')
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            fname=opt.embed_fname,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.train_dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    _params = filter(lambda p: p.requires_grad, self.model.parameters())
    optimizer = self.opt.optimizer(_params,
                                   lr=self.opt.learning_rate,
                                   weight_decay=self.opt.l2reg)
    train_data_loader = DataLoader(dataset=self.trainset,
                                   batch_size=self.opt.batch_size,
                                   shuffle=True)
    self._reset_params()
    for epoch in range(best_epoch + 1):
        self.model.train()
        for i_batch, sample_batched in enumerate(train_data_loader):
            # global_step += 1
            # clear gradient accumulators
            optimizer.zero_grad()
            inputs = [
                sample_batched[col].to(self.opt.device)
                for col in self.opt.inputs_cols
            ]
            outputs = self.model(inputs)
            targets = sample_batched['polarity'].to(self.opt.device)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
    self.model.eval()
    test_acc, test_f1, test_outputs, test_targets = self._evaluate_acc_f1(
        test_data_loader)
    pred_fname = 'logs/{0}-{1}-{2}-{3}-{4}-{5}-test_acc-{6}-test_f1-{7}.csv'.format(
        self.opt.model_name, self.opt.train_dataset, self.opt.test_dataset,
        self.opt.seed, self.opt.valset_ratio, self.opt.expr_idx,
        round(test_acc, 4), round(test_f1, 4))
    numpy.savetxt(pred_fname, test_outputs.numpy())
    target_fname = 'logs/{0}-target.csv'.format(self.opt.test_dataset)
    numpy.savetxt(target_fname, test_targets.numpy())
    logger.info('>> test_acc: {:.4f}, test_f1: {:.4f}'.format(
        test_acc, test_f1))
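Both run variants report the macro-averaged F1 computed inside _evaluate_acc_f1, which boils down to a single scikit-learn call over the three polarity classes. A runnable distillation (the toy label vectors are invented):

from sklearn import metrics

y_true = [0, 1, 2, 2, 1, 0]  # gold polarity ids
y_pred = [0, 2, 2, 2, 1, 0]  # predicted polarity ids
print(metrics.f1_score(y_true, y_pred, labels=[0, 1, 2], average='macro'))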
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(
            '/home/yinrongdi/bert/bert-base-uncased.tar.gz')
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='temp_data/' + '{0}_tokenizer.dat'.format(opt.dataset),
            step=4 if opt.tabsa else 3)
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='temp_data/' + '{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if opt.classifier:
        if opt.classifier_with_absa_target:
            self.classifierset = TABSADataset(
                opt.dataset_file['classifier_absa_target'], tokenizer, False)
        elif opt.classifier_with_absa:
            self.classifierset = TABSADataset(opt.dataset_file['classifier'],
                                              tokenizer, True)
        else:
            self.classifierset = TABSADataset(opt.dataset_file['classifier'],
                                              tokenizer, False)
    if opt.tabsa:
        if opt.tabsa_with_absa:
            if opt.gating:
                self.trainset = TABSADataset(opt.dataset_file['train'],
                                             tokenizer, True, True)
                self.testset = TABSADataset(opt.dataset_file['test'],
                                            tokenizer, True, True)
            else:
                self.trainset = TABSADataset(opt.dataset_file['train'],
                                             tokenizer, True)
                self.testset = TABSADataset(opt.dataset_file['test'],
                                            tokenizer, True)
        else:
            if opt.gating:
                self.trainset = TABSADataset(opt.dataset_file['train'],
                                             tokenizer, False, True)
                self.testset = TABSADataset(opt.dataset_file['test'],
                                            tokenizer, True)
            else:
                self.trainset = TABSADataset(opt.dataset_file['train'],
                                             tokenizer, False)
                self.testset = TABSADataset(opt.dataset_file['test'],
                                            tokenizer, False)
    else:
        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        # tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        # bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        # self.model = opt.model_class(bert, opt).to(opt.device)
        pass
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='gen_data/tokenizer/{0}_tokenizer.dat'.format(
                opt.dataset))
        pos_tagger_train = build_pos_tagger(
            fname=opt.dataset_file['train'],
            dat_fname="gen_data/pos/pos_tagger_{}_train.dat".format(
                opt.dataset),
            modelfile=opt.stanford_pos_model,
            jarfile=opt.stanford_pos_jar,
            tokenizer=tokenizer)
        embedding_matrix = build_embedding_matrix(
            glove_path=opt.glove_path,
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname=opt.embedding_matrix_path)
        if "embed" in opt.model_name:
            self.model = opt.model_class(embedding_matrix,
                                         pos_tagger_train.index2vec,
                                         opt).to(opt.device)
        else:
            self.model = opt.model_class(embedding_matrix,
                                         opt).to(opt.device)
    pos_tagger_test = build_pos_tagger(
        fname=opt.dataset_file['test'],
        dat_fname="gen_data/pos/pos_tagger_{}_test.dat".format(opt.dataset),
        modelfile=opt.stanford_pos_model,
        jarfile=opt.stanford_pos_jar,
        tokenizer=tokenizer)
    self.trainset = ABSADataset(
        opt.dataset_file['train'],
        'gen_data/dataset/{}_train_dataset.dat'.format(opt.dataset),
        tokenizer, pos_tagger_train)
    self.testset = ABSADataset(
        opt.dataset_file['test'],
        'gen_data/dataset/{}_test_dataset.dat'.format(opt.dataset),
        tokenizer, pos_tagger_test)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self.tokenizer = tokenizer
    self._print_args()
def __init__(self, opt):
    self.opt = opt
    if 'v1' in opt.model_name and 'albert' in opt.model_name:
        tokenizer = Tokenizer4AlbertGcn(opt.max_seq_len,
                                        opt.pretrained_bert_name)
        bert = None
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'v1' in opt.model_name and 'bert' in opt.model_name:
        tokenizer = Tokenizer4BertGcn(opt.max_seq_len,
                                      opt.pretrained_bert_name)
        bert = None
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'albert_gcn' in opt.model_name:
        tokenizer = Tokenizer4AlbertGcn(opt.max_seq_len,
                                        opt.pretrained_bert_name)
        bert = AlbertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'bert_gcn' in opt.model_name:
        tokenizer = Tokenizer4BertGcn(opt.max_seq_len,
                                      opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'albert' in opt.model_name:
        tokenizer = Tokenizer4Albert(opt.max_seq_len,
                                     opt.pretrained_bert_name)
        bert = AlbertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    elif 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if 'bert' in opt.model_name and opt.freeze_bert:
        # requires_grad_ freezes the submodule's parameters recursively;
        # a plain attribute assignment has no effect on an nn.Module
        try:
            self.model.bert.requires_grad_(False)
        except AttributeError:
            self.model.context_bert.requires_grad_(False)
    if 'gcn' in opt.model_name:
        self.trainset = ABSAGcnData(opt.dataset_file['train'], tokenizer,
                                    debug=opt.debug, from_xml=opt.from_xml)
        self.testset = ABSAGcnData(opt.dataset_file['test'], tokenizer,
                                   debug=opt.debug, from_xml=opt.from_xml)
    else:
        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer,
                                    debug=opt.debug)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer,
                                   debug=opt.debug)
    assert 0 <= opt.valset_ration < 1
    if opt.valset_ration > 0:
        valset_len = int(len(self.trainset) * opt.valset_ration)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, model_name='lstm', dataset='train', optimizer='adam',
             initializer='xavier_uniform_', learning_rate=2e-5, dropout=0.1,
             l2reg=0.01, num_epoch=16, batch_size=16, log_step=5,
             embed_dim=300, hidden_dim=300, max_seq_len=80, polarities_dim=3,
             device=None, valset_ratio=0):
    self.model_name = model_name
    self.dataset = dataset
    self.optimizer = optimizer
    self.initializer = initializer
    self.learning_rate = learning_rate
    self.dropout = dropout
    self.l2reg = l2reg
    self.num_epoch = num_epoch
    self.batch_size = batch_size
    self.log_step = log_step
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.max_seq_len = max_seq_len
    self.polarities_dim = polarities_dim
    self.device = device
    self.valset_ratio = valset_ratio
    log_file = '{}-{}-{}.log'.format(self.model_name, self.dataset,
                                     strftime("%y%m%d-%H%M", localtime()))
    logger.addHandler(logging.FileHandler(log_file))
    model_classes = {
        'lstm': LSTM,
        'rnn': RNN
    }
    dataset_files = {
        'train': {
            'train': './Preprocess/train.csv',
            'test': './Preprocess/test.csv'
        }
    }
    input_colses = {
        'lstm': ['text_raw_indices'],
        'rnn': ['text_raw_indices']
    }
    initializers = {
        'xavier_uniform_': torch.nn.init.xavier_uniform_,
        'xavier_normal_': torch.nn.init.xavier_normal_,
        'orthogonal_': torch.nn.init.orthogonal_,
    }
    optimizers = {
        'adadelta': torch.optim.Adadelta,  # default lr=1.0
        'adagrad': torch.optim.Adagrad,  # default lr=0.01
        'adam': torch.optim.Adam,  # default lr=0.001
        'adamax': torch.optim.Adamax,  # default lr=0.002
        'asgd': torch.optim.ASGD,  # default lr=0.01
        'rmsprop': torch.optim.RMSprop,  # default lr=0.01
        'sgd': torch.optim.SGD,
    }
    self.model_class = model_classes[self.model_name]
    self.dataset_file = dataset_files[self.dataset]
    self.inputs_cols = input_colses[self.model_name]
    self.initializer = initializers[self.initializer]
    self.optimizer = optimizers[self.optimizer]
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') \
        if self.device is None else torch.device(self.device)
    self.tokenizer = build_tokenizer(
        fnames=[self.dataset_file['train'], self.dataset_file['test']],
        max_seq_len=self.max_seq_len,
        dat_fname='{0}_tokenizer.dat'.format(self.dataset))
    self.embedding_matrix = build_embedding_matrix(
        word2idx=self.tokenizer.word2idx,
        embed_dim=self.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(
            str(self.embed_dim), self.dataset))
    self.model = self.model_class(self.embedding_matrix,
                                  self).to(self.device)
    self.trainset = SADataset(self.dataset_file['train'], self.tokenizer)
    self.testset = SADataset(self.dataset_file['test'], self.tokenizer)
    assert 0 <= self.valset_ratio < 1
    if self.valset_ratio > 0:
        valset_len = int(len(self.trainset) * self.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    logger.info('Model selected: {}'.format(self.model_name))
    if self.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=self.device.index)))
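Unlike the opt-driven variants, this constructor configures itself from keyword arguments by resolving option strings through dispatch dictionaries. A runnable distillation of that lookup pattern (the tables are trimmed to two entries for brevity):

import torch
import torch.nn as nn

optimizers = {'adam': torch.optim.Adam, 'sgd': torch.optim.SGD}
initializers = {'xavier_uniform_': nn.init.xavier_uniform_}

model = nn.Linear(2, 2)
initializers['xavier_uniform_'](model.weight)  # string -> initializer callable
optimizer = optimizers['adam'](model.parameters(), lr=1e-3)
print(type(optimizer).__name__)  # Adam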
def __init__(self, opt):
    self.opt = opt
    out_file = './stat/{}_{}_domain{}_adv{}_aux{}_resplit{}_epoch{}'.format(
        self.opt.model_name, self.opt.dataset, self.opt.domain,
        str(self.opt.adv), str(self.opt.aux), str(self.opt.resplit),
        self.opt.num_epoch)
    print(out_file)
    if 'bert' in opt.model_name:
        # if opt.model_name == 'bert_kg':
        #     tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        #     bert = BertForTokenClassification.from_pretrained('ernie_base')
        #     self.model = opt.model_class(bert, opt).to(opt.device)
        #     self.model.to(opt.device)
        if opt.model_name == 'lcf_bert':
            from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=False)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
        elif opt.model_name == 'bert':
            from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
        elif opt.model_name in ['bert_spc', 'td_bert']:
            from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
            # self.model.load_state_dict(torch.load('./state_dict/bert_multi_target_val_acc0.7714'))
        elif opt.model_name == 'bert_label':
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
        elif opt.model_name == 'bert_compete':
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            num_added_tokens = tokenizer.add_tokens(
                ['[aspect_b]', '[aspect_e]'])
            bert.resize_token_embeddings(len(tokenizer.tokenizer))
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            # bert_multi_target
            from modeling_bert import BertModel, BertForTokenClassification, BertConfig
            tokenizer = Tokenizer4Bert(opt.max_seq_len,
                                       opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name,
                                             config=config)
            if opt.domain == 'pt':
                bert = BertModel.from_pretrained(
                    './bert_models/pt_bert-base-uncased_amazon_yelp')
            if opt.domain == 'joint':
                bert = BertModel.from_pretrained(
                    './bert_models/laptops_and_restaurants_2mio_ep15')
            if opt.domain == 'res':
                bert = BertModel.from_pretrained(
                    './bert_models/restaurants_10mio_ep3')
            if opt.domain == 'laptop':
                bert = BertModel.from_pretrained(
                    './bert_models/laptops_1mio_ep30')
            if opt.domain == 'ernie':
                bert = BertModel.from_pretrained(
                    './bert_models/ERNIE_Base_en_stable-2.0.0_pytorch')
            # num_added_tokens = tokenizer.add_tokens(['[target_b]', '[target_e]'])
            # num_added_tokens = tokenizer.add_tokens(['[aspect_b]', '[aspect_e]'])
            for i in range(20):
                b = '[' + str(i) + 'b]'
                e = '[' + str(i) + 'e]'
                num_added_tokens = tokenizer.add_tokens([b, e])
            bert.resize_token_embeddings(len(tokenizer.tokenizer))
            self.model = opt.model_class(bert, opt).to(opt.device)
            # self.model.load_state_dict(torch.load('./state_dict/state_dict/bert_multi_target_restaurant_doamin-res_can0_adv0_aux1.0_val_acc0.8688'))
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer,
                                'train', opt)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer,
                               'test', opt)
    if int(opt.resplit) == 0:
        opt.valset_ratio = 0.05  # default to a 5% split when not resplitting
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        if int(self.opt.resplit) == 1 or int(self.opt.resplit) == 2:
            self.valset = ABSADataset('valid', tokenizer, 'valid', opt)
        else:
            self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    # if opt.load_mode == 1:  # find the highest
    #     self.model.load_state_dict(torch.load('/home/nus/temp/ABSA-PyTorch/state_dict/bert_spc_twitter_val_acc0.7384'))
    # model.load_state_dict(torch.load(PATH))
    self._print_args()
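Several branches above grow the BERT vocabulary with marker tokens and then resize the embedding table to match. A sketch of the same pattern against the current transformers API (the original uses pytorch_transformers; 'bert-base-uncased' is the standard Hub identifier, not taken from the snippet):

from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
num_added = tokenizer.add_tokens(['[aspect_b]', '[aspect_e]'])
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix
print(num_added, len(tokenizer))  # 2 new rows appended to the vocab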
def __init__(self, opt):
    self.opt = opt
    if opt.model in [
            'bote', 'bote_v0_ablation', 'bote_v1_ablation',
            'bote_v2_ablation', 'bote_v3_ablation', 'bote_v4'
    ]:
        absa_data_reader = ABSADataReaderBERT(data_dir=opt.data_dir)
        tokenizer = BertTokenizer(opt.bert_model, opt.case, opt.spacy_lang,
                                  opt.lang)
        embedding_matrix = []
        self.train_data_loader = BucketIteratorBert(
            data=absa_data_reader.get_train(tokenizer),
            batch_size=opt.batch_size,
            shuffle=True)
        self.dev_data_loader = BucketIteratorBert(
            data=absa_data_reader.get_dev(tokenizer),
            batch_size=opt.batch_size,
            shuffle=False)
        self.test_data_loader = BucketIteratorBert(
            data=absa_data_reader.get_test(tokenizer),
            batch_size=opt.batch_size,
            shuffle=False)
    else:
        absa_data_reader = ABSADataReader(data_dir=opt.data_dir)
        tokenizer = build_tokenizer(data_dir=opt.data_dir)
        embedding_matrix = build_embedding_matrix(opt.data_dir,
                                                  tokenizer.word2idx,
                                                  opt.embed_dim, opt.dataset,
                                                  opt.glove_fname)
        self.train_data_loader = BucketIterator(
            data=absa_data_reader.get_train(tokenizer),
            batch_size=opt.batch_size,
            shuffle=True)
        self.dev_data_loader = BucketIterator(
            data=absa_data_reader.get_dev(tokenizer),
            batch_size=opt.batch_size,
            shuffle=False)
        self.test_data_loader = BucketIterator(
            data=absa_data_reader.get_test(tokenizer),
            batch_size=opt.batch_size,
            shuffle=False)
    self.idx2tag = absa_data_reader.reverse_tag_map
    self.idx2polarity = absa_data_reader.reverse_polarity_map
    self.model = opt.model_class(embedding_matrix, opt, self.idx2tag,
                                 self.idx2polarity).to(opt.device)
    self.history_metrics = {
        'epoch': [], 'step': [],
        'train_ap_precision': [], 'train_ap_recall': [], 'train_ap_f1': [],
        'train_op_precision': [], 'train_op_recall': [], 'train_op_f1': [],
        'train_triplet_precision': [], 'train_triplet_recall': [],
        'train_triplet_f1': [],
        'dev_ap_precision': [], 'dev_ap_recall': [], 'dev_ap_f1': [],
        'dev_op_precision': [], 'dev_op_recall': [], 'dev_op_f1': [],
        'dev_triplet_precision': [], 'dev_triplet_recall': [],
        'dev_triplet_f1': []
    }
    self.results = {
        'aspect_extraction': {'precision': [], 'recall': [], 'f1': []},
        'opinion_extraction': {'precision': [], 'recall': [], 'f1': []},
        'triplet_extraction': {'precision': [], 'recall': [], 'f1': []}
    }
    self._print_args()
    if torch.cuda.is_available():
        print('>>> cuda memory allocated:',
              torch.cuda.memory_allocated(device=opt.device.index))