class Transformer(nn.Module):
    def __init__(self, vocab, decoder_number, model_file_path=None, load_optim=False):
        """
        vocab: a Lang type object, defined in data_reader.py
        decoder_number: the number of classes
        """
        super(Transformer, self).__init__()
        self.iter = 0
        self.current_loss = 1000
        self.vocab = vocab
        self.vocab_size = vocab.n_words

        self.embedding = share_embedding(self.vocab, config.pretrain_emb)
        self.encoder = Encoder(config.emb_dim,
                               config.hidden_dim,
                               num_layers=config.hop,
                               num_heads=config.heads,
                               total_key_depth=config.depth,
                               total_value_depth=config.depth,
                               filter_size=config.filter,
                               universal=config.universal)

        ## decoders
        self.decoder = Decoder(config.emb_dim,
                               hidden_size=config.hidden_dim,
                               num_layers=config.hop,
                               num_heads=config.heads,
                               total_key_depth=config.depth,
                               total_value_depth=config.depth,
                               filter_size=config.filter,
                               max_length=config.max_length)

        self.decoder_key = nn.Linear(config.hidden_dim, decoder_number, bias=False)
        self.generator = Generator(config.hidden_dim, self.vocab_size)

        if config.weight_sharing:
            # Share the weight matrix between the target word embedding and the final logit dense layer
            self.generator.proj.weight = self.embedding.lut.weight

        self.criterion = nn.NLLLoss(ignore_index=config.PAD_idx)
        if config.label_smoothing:
            self.criterion = LabelSmoothing(size=self.vocab_size,
                                            padding_idx=config.PAD_idx,
                                            smoothing=0.1)
            self.criterion_ppl = nn.NLLLoss(ignore_index=config.PAD_idx)

        if config.noam:
            optimizer = torch.optim.Adam(self.parameters(),
                                         lr=0,
                                         weight_decay=config.weight_decay,
                                         betas=(0.9, 0.98),
                                         eps=1e-9)
            scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer,
                milestones=[config.schedule * i for i in range(4)],
                gamma=0.1)
            self.scheduler = NoamOpt(config.hidden_dim, 1, 8000, optimizer, scheduler)
        else:
            self.optimizer = torch.optim.Adam(self.parameters(),
                                              lr=config.lr,
                                              weight_decay=config.weight_decay)
            self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
                self.optimizer,
                milestones=[config.schedule * i for i in range(4)],
                gamma=0.1)

        if model_file_path is not None:
            print("loading weights")
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            self.iter = state['iter']
            self.current_loss = state['current_loss']
            self.encoder.load_state_dict(state['encoder_state_dict'])
            self.decoder.load_state_dict(state['decoder_state_dict'])
            self.generator.load_state_dict(state['generator_dict'])
            self.embedding.load_state_dict(state['embedding_dict'])
            self.decoder_key.load_state_dict(state['decoder_key_state_dict'])
            if load_optim:
                self.scheduler.load_state_dict(state['optimizer'])
            self.eval()

        self.model_dir = config.save_path
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        self.best_path = ""

    def save_model(self, running_avg_ppl, iter):
        self.iter = iter
        state = {
            'iter': iter,
            'encoder_state_dict': self.encoder.state_dict(),
            'decoder_state_dict': self.decoder.state_dict(),
            'generator_dict': self.generator.state_dict(),
            'decoder_key_state_dict': self.decoder_key.state_dict(),
            'embedding_dict': self.embedding.state_dict(),
            'optimizer': self.scheduler.state_dict(),
            'current_loss': running_avg_ppl
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_{}_{:.4f}'.format(iter, running_avg_ppl))
        self.best_path = model_save_path
        torch.save(state, model_save_path)

    def train_one_batch(self, batch, train=True):
        enc_batch, cause_batch = get_input_from_batch(batch)
        dec_batch = get_output_from_batch(batch)

        if config.noam:
            self.scheduler.optimizer.zero_grad()
        else:
            self.optimizer.zero_grad()

        ## Encode
        mask_src = enc_batch.data.eq(config.PAD_idx).unsqueeze(1)
        emb_mask = self.embedding(batch["mask_input"])
        encoder_outputs = self.encoder(
            self.embedding(enc_batch) + emb_mask,
            mask_src)  # (batch_size, seq_len, hidden_size)

        # Decode
        sos_token = torch.LongTensor([config.SOS_idx] * enc_batch.size(0)).unsqueeze(1).to(config.device)
        dec_batch_shift = torch.cat(
            (sos_token, dec_batch[:, :-1]), 1)  # make the first token of the sentence SOS
        mask_trg = dec_batch_shift.data.eq(config.PAD_idx).unsqueeze(1)
        pre_logit, attn_dist = self.decoder(self.embedding(dec_batch_shift),
                                            encoder_outputs,
                                            (mask_src, mask_trg))
        # shape: pre_logit --> (batch_size, seq_len, hidden_size)

        ## compute output dist
        logit = self.generator(pre_logit)
        loss = self.criterion(logit.contiguous().view(-1, logit.size(-1)),
                              dec_batch.contiguous().view(-1))

        loss_bce_program, program_acc = 0, 0
        # multi-task
        if config.emo_multitask:
            # add the loss of the emotion label prediction
            q_h = encoder_outputs[:, 0]  # the first (CLS) token of the sentence, shape: (batch_size, hidden_size)
            logit_prob = self.decoder_key(q_h).to('cuda')  # (batch_size, decoder_number)
            loss += nn.CrossEntropyLoss()(logit_prob,
                                          torch.LongTensor(batch['program_label']).cuda())
            loss_bce_program = nn.CrossEntropyLoss()(
                logit_prob,
                torch.LongTensor(batch['program_label']).cuda()).item()
            pred_program = np.argmax(logit_prob.detach().cpu().numpy(), axis=1)
            program_acc = accuracy_score(batch["program_label"], pred_program)

        if config.label_smoothing:
            loss_ppl = self.criterion_ppl(
                logit.contiguous().view(-1, logit.size(-1)),
                dec_batch.contiguous().view(-1)).item()

        if train:
            loss.backward()
            self.scheduler.step()

        if config.label_smoothing:
            return loss_ppl, math.exp(min(loss_ppl, 100)), loss_bce_program, program_acc
        else:
            return loss.item(), math.exp(min(loss.item(), 100)), loss_bce_program, program_acc

    def compute_act_loss(self, module):
        R_t = module.remainders
        N_t = module.n_updates
        p_t = R_t + N_t
        avg_p_t = torch.sum(torch.sum(p_t, dim=1) / p_t.size(1)) / p_t.size(0)
        loss = config.act_loss_weight * avg_p_t.item()
        return loss

    def decoder_greedy(self, batch, max_dec_step=30):
        enc_batch, cause_batch = get_input_from_batch(batch)

        mask_src = enc_batch.data.eq(config.PAD_idx).unsqueeze(1)
        emb_mask = self.embedding(batch["mask_input"])
        encoder_outputs = self.encoder(self.embedding(enc_batch) + emb_mask, mask_src)

        ys = torch.ones(1, 1).fill_(config.SOS_idx).long().to(config.device)
        mask_trg = ys.data.eq(config.PAD_idx).unsqueeze(1)
        decoded_words = []
        for i in range(max_dec_step + 1):
            out, attn_dist = self.decoder(self.embedding(ys), encoder_outputs,
                                          (mask_src, mask_trg))
            prob = self.generator(out)
            _, next_word = torch.max(prob[:, -1], dim=1)
            decoded_words.append([
                '<EOS>' if ni.item() == config.EOS_idx else
                self.vocab.index2word[ni.item()] for ni in next_word.view(-1)
            ])
            next_word = next_word.data[0]

            ys = torch.cat([
                ys,
                torch.ones(1, 1).long().fill_(next_word).to(config.device)
            ], dim=1).to(config.device)
            mask_trg = ys.data.eq(config.PAD_idx).unsqueeze(1)

        sent = []
        for _, row in enumerate(np.transpose(decoded_words)):
            st = ''
            for e in row:
                if e == '<EOS>':
                    break
                else:
                    st += e + ' '
            sent.append(st)
        return sent

    def beam_search(self, batch, max_dec_step=30):
        enc_batch, cause_batch = get_input_from_batch(batch)
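# NoamOpt is constructed above with (hidden_dim, factor, warmup, optimizer, scheduler) but is
# not defined in this file. The sketch below only illustrates the usual Noam learning-rate rule
# (lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)) and assumes the extra
# MultiStepLR argument is simply stepped alongside; the class name and fields are illustrative,
# not the project's actual implementation.
class NoamOptSketch:
    def __init__(self, model_size, factor, warmup, optimizer, lr_scheduler=None):
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self._step = 0

    def rate(self, step=None):
        step = self._step if step is None else step
        return self.factor * (self.model_size ** -0.5) * min(step ** -0.5, step * self.warmup ** -1.5)

    def step(self):
        self._step += 1
        for group in self.optimizer.param_groups:
            group['lr'] = self.rate()  # overwrite the lr before every optimizer step
        self.optimizer.step()
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

# usage sketch (toy parameters, not project code):
# opt = NoamOptSketch(512, 1, 8000, torch.optim.Adam(model.parameters(), lr=0))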
def main():
    args = constant.arg
    if not os.path.exists(constant.save_path):
        os.makedirs(constant.save_path)

    # device = torch.device("cuda", 3)
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    f1_avg = []
    for seed in range(10):
        train, val, val_nolab, emoji_tokens, emoji_vectors = get_data_for_bert(
            seed=seed, emoji_dim=args.emoji_dim)
        train_emojis, val_emojis, test_emojis = emoji_tokens

        train_examples = read_examples(train)
        val_examples = read_examples(val)

        if args.hier:
            max_seq_length = 40
        else:
            max_seq_length = 100

        train_features = convert_examples_to_features(examples=train_examples,
                                                      seq_length=max_seq_length,
                                                      tokenizer=tokenizer,
                                                      hier=args.hier)
        val_features = convert_examples_to_features(examples=val_examples,
                                                    seq_length=max_seq_length,
                                                    tokenizer=tokenizer,
                                                    hier=args.hier)

        if args.hier:
            model = HierBertModel(
                context_encoder=args.context_encoder,
                dropout=args.dropout,
                double_supervision=args.double_supervision,
                emoji_vectors=emoji_vectors if args.emoji_emb else None)
        else:
            model = FlatBertModel()
        criterion = nn.CrossEntropyLoss()
        model.cuda()

        # Prepare optimizer
        if args.use_bertadam:
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.0
            }]
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                lr=5e-5,
                warmup=0.02,
                t_total=int(len(train_examples) / args.batch_size / 1 * 15))
        elif args.noam:
            optimizer = NoamOpt(
                constant.emb_dim, 1, 4000,
                torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 lr=0,
                                 betas=(0.9, 0.98),
                                 eps=1e-9),
            )
        else:
            optimizer = Adam(filter(lambda p: p.requires_grad,
                                    model.parameters()),
                             lr=1e-3)

        # training
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.batch_size)

        # ===================== training dataloader ========================
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(
            [f.input_type_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_emoji_tokens = torch.tensor([emojis for emojis in train_emojis],
                                        dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_emoji_tokens)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.batch_size)

        # ===================== val dataloader ========================
        all_input_ids = torch.tensor([f.input_ids for f in val_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in val_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(
            [f.input_type_ids for f in val_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in val_features],
                                     dtype=torch.long)
        all_emoji_tokens = torch.tensor([emojis for emojis in val_emojis],
                                        dtype=torch.long)
        val_data = TensorDataset(all_input_ids, all_input_mask,
                                 all_segment_ids, all_label_ids,
                                 all_emoji_tokens)
        val_sampler = SequentialSampler(val_data)
        val_dataloader = DataLoader(val_data,
                                    sampler=val_sampler,
                                    batch_size=args.batch_size)

        best_f1 = 0
        early_stop = 0
        for _ in trange(100, desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_steps = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, emoji_tokens = batch
                # print(input_ids.size())
                logits = model(
                    input_ids,
                    segment_ids,
                    input_mask,
                    args.sum_tensor,
                    train=True,
                    emoji_tokens=emoji_tokens if args.emoji_emb else None,
                    last_hidden=args.last_hidden)
                # print(logits.size(), label_ids.size())
                if len(logits) == 2:
                    loss = (1 - args.super_ratio) * criterion(
                        logits[0], label_ids) + args.super_ratio * criterion(
                            logits[1], label_ids)
                else:
                    loss = criterion(logits, label_ids)
                loss.backward()
                tr_loss += loss.item()
                nb_tr_steps += 1
                optimizer.step()
                model.zero_grad()

            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(val_examples))
            logger.info("  Batch size = %d", args.batch_size)
            model.eval()
            all_logits = []
            all_labels = []
            for step, batch in enumerate(tqdm(val_dataloader, desc="Iteration")):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, emoji_tokens = batch
                logits = model(
                    input_ids,
                    segment_ids,
                    input_mask,
                    args.sum_tensor,
                    emoji_tokens=emoji_tokens if args.emoji_emb else None,
                    last_hidden=args.last_hidden)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_labels.append(label_ids)

            accuracy, microPrecision, microRecall, microF1 = getMetrics(
                np.concatenate(all_logits),
                np.concatenate(all_labels),
                verbose=True)

            if best_f1 < microF1:
                best_f1 = microF1
                save_model(model, seed)
            else:
                early_stop += 1
                if early_stop > 5:
                    break

        print('EXPERIMENT:{}, best_f1:{}'.format(seed, best_f1))
        f1_avg.append(best_f1)

    file_summary = constant.save_path + "summary.txt"
    with open(file_summary, 'w') as the_file:
        header = "\t".join(
            ["SPLIT_{}".format(i) for i, _ in enumerate(f1_avg)])
        the_file.write(header + "\tAVG\n")
        ris = "\t".join(["{:.4f}".format(e) for i, e in enumerate(f1_avg)])
        the_file.write(ris + "\t{:.4f}\n".format(np.mean(f1_avg)))
class Emotion(nn.Module):
    def __init__(self, vocab, model_file_path=None, load_optim=False):
        super(Emotion, self).__init__()
        self.device = config.device
        self.vocab = vocab
        self.vocab_size = vocab.n_words

        self.embedding = share_embedding(self.vocab, True)
        self.encoder = Encoder(config.emb_dim,
                               config.hidden_dim,
                               num_layers=config.hop,
                               num_heads=config.heads,
                               total_key_depth=config.depth,
                               total_value_depth=config.depth,
                               filter_size=config.filter,
                               universal=True)
        self.context_encoder = Encoder(config.emb_dim,
                                       config.hidden_dim,
                                       num_layers=config.hop,
                                       num_heads=config.heads,
                                       total_key_depth=config.depth,
                                       total_value_depth=config.depth,
                                       filter_size=config.filter,
                                       universal=True)
        self.linear = nn.Linear(config.emb_dim, 2)
        # self.linear = nn.Linear(config.emb_dim, len(config.emo_map))

        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=0,
                                     weight_decay=config.weight_decay,
                                     betas=(0.9, 0.98),
                                     eps=1e-9)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[config.schedule * i for i in range(4)],
            gamma=0.1)
        self.scheduler = NoamOpt(config.hidden_dim, 1, 8000, optimizer, scheduler)

        if model_file_path is not None:
            print("loading weights")
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            self.iter = state['iter']
            self.current_acc = state['current_acc']
            self.embedding.load_state_dict(state['embedding_dict'])
            self.encoder.load_state_dict(state['encoder_state_dict'])
            self.context_encoder.load_state_dict(
                state['context_encoder_state_dict'])
            if load_optim:
                try:
                    self.scheduler.load_state_dict(state['optimizer'])
                except AttributeError:
                    pass

        self.model_dir = config.save_path
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        self.best_path = ""

    def save_model(self, iter, acc, loss):
        self.iter = iter
        state = {
            'iter': self.iter,
            'embedding_dict': self.embedding.state_dict(),
            'encoder_state_dict': self.encoder.state_dict(),
            'context_encoder_state_dict': self.context_encoder.state_dict(),
            'optimizer': self.scheduler.state_dict(),
            'current_acc': acc
        }
        model_save_path = os.path.join(self.model_dir,
                                       'model_{}_{:.4f}'.format(iter, acc))
        self.best_path = model_save_path
        torch.save(state, model_save_path)

    def forward(self, batch, predict=False):
        context, target, emotion = batch
        # collapse the emotion label to a binary neutral-vs-emotional target
        emotion = (emotion > 0).long()

        self.scheduler.optimizer.zero_grad()
        context = self.context_encoder(context)
        context = torch.sum(context, dim=-2, keepdim=True)
        target = torch.cat((context, self.encoder(target)), dim=-2)
        target = torch.sum(target, dim=-2)

        pre_logit = torch.sigmoid(self.linear(target))
        logit = torch.softmax(pre_logit, dim=-1)
        predic = torch.max(logit.data, 1)[1]

        loss = -1
        if not predict:
            loss = F.cross_entropy(logit, emotion)
            loss.backward()
            self.scheduler.step()
        train_acc = metrics.accuracy_score(emotion.cpu(), predic.cpu())
        return loss, train_acc, predic
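# Emotion.forward above depends on the project's Encoder, config and vocab objects, so it is
# hard to run in isolation. The label handling it performs can be checked on its own; the
# tensors below are toy values for illustration, not project data.
import torch
from sklearn import metrics

emotion = torch.tensor([0, 3, 1, 0])                 # raw emotion ids
binary = (emotion > 0).long()                        # 1 = emotional, 0 = neutral, as in forward()
logit = torch.softmax(torch.randn(4, 2), dim=-1)     # stand-in for the model's class distribution
pred = torch.max(logit.data, 1)[1]                   # argmax over the two classes
acc = metrics.accuracy_score(binary.numpy(), pred.numpy())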
def train(
    model,
    data_loader_train,
    data_loader_val,
    data_loader_test,
    vocab,
    patient=10,
    split=0,
    verbose=True,
):
    """
    Training loop
    Inputs:
        model: the model to be trained
        data_loader_train: training data loader
        data_loader_val: validation data loader
        vocab: vocabulary list
    Output:
        avg_best: best f1 score on the validation data
    """
    if constant.USE_CUDA:
        device = torch.device("cuda:{}".format(constant.device))
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    if constant.noam:
        opt = NoamOpt(
            constant.emb_dim, 1, 4000,
            torch.optim.Adam(model.parameters(),
                             lr=0,
                             betas=(0.9, 0.98),
                             eps=1e-9),
        )
    else:
        opt = torch.optim.Adam(model.parameters(), lr=constant.lr)

    ## TRAINING LOOP
    avg_best = 0
    cnt = 0
    for e in range(constant.max_epochs):
        model.train()
        loss_log = []
        f1_log = []

        pbar = tqdm(enumerate(data_loader_train), total=len(data_loader_train))
        for i, (X_1, X_2, X_3, x1_len, x2_len, x3_len, y, ind, X_text) in pbar:
            if constant.noam:
                opt.optimizer.zero_grad()
            else:
                opt.zero_grad()

            if x1_len is None:
                pred_prob = model(X_1, X_2, X_3)
            else:
                pred_prob = model(X_1, X_2, X_3, x1_len, x2_len, x3_len)

            if constant.double_supervision:
                loss = (1 - constant.super_ratio) * criterion(
                    pred_prob[0], y) + constant.super_ratio * criterion(
                        pred_prob[2], y)
            else:
                loss = criterion(pred_prob[0], y)

            if constant.act:
                R_t = pred_prob[2][0]
                N_t = pred_prob[2][1]
                p_t = R_t + N_t
                avg_p_t = torch.sum(
                    torch.sum(p_t, dim=1) / p_t.size(1)) / p_t.size(0)
                loss += constant.act_loss_weight * avg_p_t.item()

            loss.backward()
            opt.step()

            ## logging
            loss_log.append(loss.item())
            accuracy, microPrecision, microRecall, microF1 = getMetrics(
                pred_prob[0].detach().cpu().numpy(), y.cpu().numpy())
            f1_log.append(microF1)
            pbar.set_description(
                "(Epoch {}) TRAIN MICRO:{:.4f} TRAIN LOSS:{:.4f}".format(
                    (e + 1), np.mean(f1_log), np.mean(loss_log)))

        ## LOG
        if e % 1 == 0:
            microF1 = evaluate(model, criterion, data_loader_val, verbose)
            if microF1 > avg_best:
                avg_best = microF1
                save_model(model, split)
                predict(model, criterion, data_loader_test,
                        split)  ## print the prediction with the highest Micro-F1
                cnt = 0
            else:
                cnt += 1
            if cnt == patient:
                break
            if avg_best == 1.0:
                break

    correct = 0
    loss_nb = 0
    return avg_best
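# The ACT term added to the loss above penalises the average ponder cost R_t + N_t returned by
# the model (the same arithmetic as compute_act_loss in the Transformer class). A toy check of
# that computation on made-up tensors; shapes and the 0.001 weight are illustrative only, the
# real value comes from constant.act_loss_weight.
import torch

R_t = torch.rand(2, 5)                       # remainders, (batch, seq_len)
N_t = torch.randint(1, 4, (2, 5)).float()    # number of updates per position
p_t = R_t + N_t
avg_p_t = torch.sum(torch.sum(p_t, dim=1) / p_t.size(1)) / p_t.size(0)
act_penalty = 0.001 * avg_p_t.item()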
def train(model,
          data_loader_train,
          data_loader_val,
          data_loader_test,
          vocab,
          patient=10,
          split=0):
    """
    Training loop
    Inputs:
        model: the model to be trained
        data_loader_train: training data loader
        data_loader_val: validation data loader
        vocab: vocabulary list
    Output:
        avg_best: best f1 score on the validation data
    """
    if constant.USE_CUDA:
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    if constant.noam:
        opt = NoamOpt(
            constant.emb_dim, 1, 4000,
            torch.optim.Adam(model.parameters(),
                             lr=0,
                             betas=(0.9, 0.98),
                             eps=1e-9))
    else:
        opt = torch.optim.Adam(model.parameters(), lr=constant.lr)

    avg_best = 0
    cnt = 0
    for e in range(constant.max_epochs):
        model.train()
        loss_log = []
        f1_log = 0

        pbar = tqdm(enumerate(data_loader_train), total=len(data_loader_train))
        for i, (X, x_len, y, ind, X_text) in pbar:
            if constant.noam:
                opt.optimizer.zero_grad()
            else:
                opt.zero_grad()

            if x_len is None:
                pred_prob = model(X)
            else:
                pred_prob = model(X, x_len)

            loss = criterion(pred_prob[0], y)
            loss.backward()
            opt.step()

            ## logging
            loss_log.append(loss.item())
            accuracy, microPrecision, microRecall, microF1 = getMetrics(
                pred_prob[0].detach().cpu().numpy(), y.cpu().numpy())
            f1_log += microF1
            pbar.set_description(
                "(Epoch {}) TRAIN MICRO:{:.4f} TRAIN LOSS:{:.4f}".format(
                    (e + 1), f1_log / float(i + 1), np.mean(loss_log)))

        ## LOG
        if e % 1 == 0:
            microF1 = evaluate(model, criterion, data_loader_val)
            if microF1 > avg_best:
                avg_best = microF1
                save_model(model, split)
                predict(model, criterion, data_loader_test, "",
                        split=split)  ## print the prediction with the highest Micro-F1
                cnt = 0
            else:
                cnt += 1
            if cnt == patient:
                break
            if avg_best == 1.0:
                break

    correct = 0
    loss_nb = 0
    return avg_best
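# save_model() used in both training loops above is defined elsewhere in the repository. A
# minimal stand-in for the checkpointing step, assuming only the weights of the best model per
# split need to be persisted under a save directory; the real helper may store more state
# (optimizer, split index, metrics) and use a different file layout.
import os
import torch

def save_model_sketch(model, split, save_path="save/"):
    os.makedirs(save_path, exist_ok=True)
    torch.save(model.state_dict(),
               os.path.join(save_path, "model_split_{}.bin".format(split)))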
def main(config):
    vocab_len = get_babi_vocab(config.task)
    train_iter, val_iter, test_iter = datasets.BABI20.iters(
        batch_size=config.batch_size,
        root='.data',
        memory_size=70,
        task=config.task,
        joint=False,
        tenK=False,
        only_supporting=False,
        sort=False,
        shuffle=True)
    model = BabiUTransformer(num_vocab=vocab_len,
                             embedding_size=config.emb,
                             hidden_size=config.emb,
                             num_layers=config.max_hops,
                             num_heads=config.heads,
                             total_key_depth=config.depth,
                             total_value_depth=config.depth,
                             filter_size=config.filter,
                             act=config.act)
    if config.verbose:
        print(model)
        print("ACT", config.act)
    if config.cuda:
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    if config.noam:
        opt = NoamOpt(
            config.emb, 1, 4000,
            torch.optim.Adam(model.parameters(),
                             lr=0,
                             betas=(0.9, 0.98),
                             eps=1e-9))
    else:
        opt = torch.optim.Adam(model.parameters(), lr=config.lr)

    if config.verbose:
        acc_val, loss_val = evaluate(model, criterion, val_iter)
        print("RAND_VAL ACC:{:.4f}\t RAND_VAL LOSS:{:.4f}".format(acc_val, loss_val))

    correct = []
    loss_nb = []
    cnt_batch = 0
    avg_best = 0
    cnt = 0
    model.train()
    for b in train_iter:
        story, query, answer = b.story, b.query, b.answer.squeeze()
        if config.cuda:
            story, query, answer = story.cuda(), query.cuda(), answer.cuda()

        if config.noam:
            opt.optimizer.zero_grad()
        else:
            opt.zero_grad()

        pred_prob = model(story, query)
        loss = criterion(pred_prob[0], answer)
        if config.act:
            R_t = pred_prob[2][0]
            N_t = pred_prob[2][1]
            p_t = R_t + N_t
            avg_p_t = torch.sum(torch.sum(p_t, dim=1) / p_t.size(1)) / p_t.size(0)
            loss += config.act_loss_weight * avg_p_t.item()

        loss.backward()
        opt.step()

        ## LOG
        loss_nb.append(loss.item())
        pred = pred_prob[1].data.max(1)[1]  # max returns (max, argmax)
        correct.append(np.mean(pred.eq(answer.data).cpu().numpy()))
        cnt_batch += 1
        if cnt_batch % 10 == 0:
            acc = np.mean(correct)
            loss_nb = np.mean(loss_nb)
            if config.verbose:
                print("TRN ACC:{:.4f}\tTRN LOSS:{:.4f}".format(acc, loss_nb))

            acc_val, loss_val = evaluate(model, criterion, val_iter)
            if config.verbose:
                print("VAL ACC:{:.4f}\tVAL LOSS:{:.4f}".format(acc_val, loss_val))

            if acc_val > avg_best:
                avg_best = acc_val
                weights_best = deepcopy(model.state_dict())
                cnt = 0
            else:
                cnt += 1
            if cnt == 45:
                break
            if avg_best == 1.0:
                break

            correct = []
            loss_nb = []
            cnt_batch = 0

    model.load_state_dict({name: weights_best[name] for name in weights_best})
    acc_test, loss_test = evaluate(model, criterion, test_iter)
    if config.verbose:
        print("TST ACC:{:.4f}\tTST LOSS:{:.4f}".format(acc_test, loss_test))
    return acc_test
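# evaluate() is not included in this snippet. The sketch below is only consistent with how the
# training loop above uses the model's outputs (pred_prob[0] for the loss, pred_prob[1] for the
# argmax); the project's real evaluate() may differ in details such as ACT handling or how the
# torchtext iterator is consumed. Function and argument names are illustrative.
import numpy as np
import torch

def evaluate_sketch(model, criterion, data_iter, use_cuda=False):
    model.eval()
    accs, losses = [], []
    with torch.no_grad():
        for b in data_iter:
            story, query, answer = b.story, b.query, b.answer.squeeze()
            if use_cuda:
                story, query, answer = story.cuda(), query.cuda(), answer.cuda()
            pred_prob = model(story, query)
            losses.append(criterion(pred_prob[0], answer).item())
            pred = pred_prob[1].data.max(1)[1]          # argmax over answer vocabulary
            accs.append(np.mean(pred.eq(answer.data).cpu().numpy()))
    model.train()
    return np.mean(accs), np.mean(losses)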