def __init__(self, config, tasks):
    super(Inference, self).__init__()
    self.config = config
    self.tasks = tasks
    if config.branching_encoder:
        utils.log("Build Branching Bert Encoder")
        self.encoder = BranchingBertModel.from_pretrained(
            config.bert_model, encoder_structure=config.branching_structure)
    else:
        utils.log("Build {}:{} Encoder".format(config.encoder_type, config.bert_model))
        self.encoder = get_encoder(config.encoder_type).from_pretrained(
            config.bert_model, output_attentions=config.output_attentions)
    utils.log("Build Task Modules")
    self.tasks_modules = nn.ModuleDict()
    for task in tasks:
        if task.has_module:
            self.tasks_modules.update([(task.name, task.get_module())])
    self.task_dict = dict([(task.name, task) for task in self.tasks])
    self.dummy_input = torch.rand(1, 10, requires_grad=True)
    # self.encoder = HighwayLSTM(num_layers=3, input_size=300, hidden_size=200, layer_dropout=0.2)
    # self.word_embedding = nn.Embedding(self.config.external_vocab_size, self.config.external_vocab_embed_size)
    # self.word_embedding.weight.data.copy_(torch.from_numpy(np.load(config.external_embeddings)))
    # print("Loading embedding from {}".format(config.external_embeddings))
    self.loss_max_margin = MarginRankingLoss(margin=config.max_margin)
    self.distance = nn.PairwiseDistance(p=1)
def __init__(
    self,
    device: bool,
    margin=None,
    mining=False,
) -> None:
    self._margin = margin  # margin should be small
    self._mining = mining
    if margin is not None:
        '''
        MarginRankingLoss(x1, x2, y) = max(0, -y*(x1-x2) + margin)
        if y=1   max(0, -x_neg + x_pos + margin)
        '''
        super(TripletLoss, self).__init__(
            MarginRankingLoss(margin=margin),
            device,
        )
    else:
        '''
        SoftMarginLoss(x, y) = sum( log(1+exp(-y_i*x_i)) )
        '''
        super(TripletLoss, self).__init__(
            SoftMarginLoss(),
            device,
        )
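A small standalone sketch (not part of the class above) that sanity-checks the two formulas quoted in the docstrings; the distance values and the margin of 0.3 are made up for illustration.

import torch
from torch.nn import MarginRankingLoss, SoftMarginLoss

d_neg = torch.tensor([1.5, 0.2])   # e.g. anchor-negative distances (x1)
d_pos = torch.tensor([1.0, 0.8])   # e.g. anchor-positive distances (x2)
y = torch.ones(2)                  # y = 1: x1 is expected to rank above x2

mr = MarginRankingLoss(margin=0.3, reduction='none')
print(mr(d_neg, d_pos, y))         # max(0, -(x1 - x2) + 0.3) -> tensor([0.0000, 0.9000])

sm = SoftMarginLoss(reduction='none')
print(sm(d_neg - d_pos, y))        # log(1 + exp(-y * (x1 - x2))) per pair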
def __init__(self, interface, learning_rate=3e-4, batch_size=32, margin=10,
             num_samples=100, user_embedding_dim=10, item_embedding_dim=10,
             user_meta_dim=15, item_meta_dim=15, meta_meta_dim=30,
             dense_1_dim=32, dense_2_dim=15, dropout=0.5):
    self.interface = interface
    self.margin = margin
    self.learning_rate = learning_rate
    self.user_embedding_dim = user_embedding_dim
    self.item_embedding_dim = item_embedding_dim
    self.user_meta_dim = user_meta_dim
    self.item_meta_dim = item_meta_dim
    self.meta_meta_dim = meta_meta_dim
    self.dense_1_dim = dense_1_dim
    self.dense_2_dim = dense_2_dim
    self.dropout = dropout
    self.network = SiameseNetwork(interface,
                                  user_embedding_dim=self.user_embedding_dim,
                                  item_embedding_dim=item_embedding_dim,
                                  user_meta_dim=user_meta_dim,
                                  item_meta_dim=item_meta_dim,
                                  meta_meta_dim=meta_meta_dim,
                                  dense_1_dim=dense_1_dim,
                                  dense_2_dim=dense_2_dim,
                                  dropout=dropout)
    self.dataset = DataGenerator(interface.state_history,
                                 interface.rewards_history,
                                 interface.action_history)
    self.batch_size = batch_size
    self.num_samples = num_samples
    self.loss = MarginRankingLoss(margin=margin, reduction='none')
    self.optimizer = Adam(self.network.parameters(), lr=learning_rate)
def reset(self, n):
    self.network = SiameseNetwork(self.interface,
                                  user_embedding_dim=self.user_embedding_dim,
                                  item_embedding_dim=self.item_embedding_dim,
                                  user_meta_dim=self.user_meta_dim,
                                  item_meta_dim=self.item_meta_dim,
                                  meta_meta_dim=self.meta_meta_dim,
                                  dense_1_dim=self.dense_1_dim,
                                  dense_2_dim=self.dense_2_dim,
                                  dropout=self.dropout)
    self.dataset = DataGenerator(self.interface.state_history,
                                 self.interface.rewards_history,
                                 self.interface.action_history)
    self.loss = MarginRankingLoss(margin=self.margin, reduction='none')
    self.optimizer = Adam(self.network.parameters(), lr=self.learning_rate)
    self.train(n)
def calculate_hinge_loss(fine_log_probs, other_log_probs):
    loss_fct = MarginRankingLoss(margin=1.609)
    length = len(other_log_probs)
    temp_tensor = []
    for i in range(length):
        temp_tensor.append(fine_log_probs)
    temp_tensor = torch.cat(temp_tensor, dim=0)
    other_log_probs = torch.cat(other_log_probs, dim=0)
    y_vec = torch.ones(length).to(device)
    loss = loss_fct(temp_tensor, other_log_probs, y_vec)
    return loss
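The loop above just repeats fine_log_probs once per negative before concatenating. Below is a broadcast-based sketch of the same idea; it assumes fine_log_probs is a single-element tensor and takes the target device as an explicit argument (both assumptions for illustration, not taken from the original code).

import torch
from torch.nn import MarginRankingLoss

def calculate_hinge_loss_vectorized(fine_log_probs, other_log_probs, device):
    # Sketch only: fine_log_probs is assumed to have shape (1,), and
    # other_log_probs is a list of (1,) tensors, matching the loop above.
    loss_fct = MarginRankingLoss(margin=1.609)
    others = torch.cat(other_log_probs, dim=0)        # shape (N,)
    fine = fine_log_probs.expand_as(others)           # repeat without copying
    y_vec = torch.ones(len(others), device=device)    # fine should score higher
    return loss_fct(fine, others, y_vec)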
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary,
             loss: str,
             hinge_margin: float) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.out = torch.nn.Linear(
        in_features=word_embeddings.get_output_dim(),
        out_features=1)
    self.accuracy = BooleanAccuracy()
    self.loss_name = loss
    if loss == 'hinge':
        self.loss = MarginRankingLoss(margin=hinge_margin, reduction='mean')
    else:
        self.loss = BCEWithLogitsLoss(reduction='mean')
    self.sigmoid = torch.nn.Sigmoid()
def setup(args):
    # Logger
    logger = Logger(args.name, ['loss', 'val_loss', 'MR', 'MRR', 'h@10'])
    # Loss function
    criterion = MarginRankingLoss(args.margin, reduction='sum')
    # Batch loader
    loader = BatchLoader(train, bernoulli_p, goldens, all_ents, all_sources,
                         batch_size=args.batch_size, neg_ratio=args.neg_ratio)
    return logger, criterion, optimizer, loader
def __init__(self, margin):
    super().__init__()
    self.loss = MarginRankingLoss(margin=margin, reduction='sum')
def train(args, bert_field, model):
    Dataset = Data(args)
    # datasets
    train_rawdata = Dataset.load('train')
    valid_rawdata = Dataset.load('valid')
    (train_rawdata_questions, train_rawdata_gold, train_rawdata_neg) = train_rawdata
    (valid_rawdata_questions, valid_rawdata_gold, valid_rawdata_neg) = valid_rawdata
    train_dataset_question = Dataset.numericalize(bert_field, train_rawdata_questions)
    train_dataset_gold = Dataset.numericalize(bert_field, train_rawdata_gold)
    train_dataset_negs = []
    for one_neg in train_rawdata_neg:
        train_dataset_neg = Dataset.numericalize(bert_field, one_neg)
        # train_dataset_neg is a tuple (subwords, lens, mask)
        train_dataset_negs.append(train_dataset_neg)
    print('train data loaded!')
    if args.neg_fix:
        # batchlize
        # sample_train_dataset_negs = train_neg_sample(train_dataset_negs, args.neg_size)
        # train_data = train_batchlize(train_dataset_question, train_dataset_gold, sample_train_dataset_negs, args.batch_size, args.neg_size, syntax_embed=train_syntax_embed, hidden_embed=args.syntax_hidden_embed)
        # print("train data batchlized............")
        sample_train_dataset_negs = train_neg_sample(train_dataset_negs, args.neg_size)
        train_data = train_batchlize(train_dataset_question, train_dataset_gold,
                                     sample_train_dataset_negs, args.batch_size, args.neg_size)
        print("train data batchlized............")
    valid_dataset_question = Dataset.numericalize(bert_field, valid_rawdata_questions)
    valid_dataset_gold = Dataset.numericalize(bert_field, valid_rawdata_gold)
    valid_dataset_negs = []
    for index, one_neg in enumerate(valid_rawdata_neg):
        if not one_neg:
            print('no neg paths', index)
        valid_dataset_neg = Dataset.numericalize(bert_field, one_neg)
        valid_dataset_negs.append(valid_dataset_neg)
    valid_dataset = (valid_dataset_question, valid_dataset_gold, valid_dataset_negs)
    print('valid data loaded!')

    # num of train steps
    print('train examples', len(train_rawdata_questions))
    num_train_steps = int(
        len(train_rawdata_questions) / args.batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)

    # optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    # loss function
    criterion = MarginRankingLoss(margin=args.margin)

    # train params
    patience = args.patience
    num_train_epochs = args.num_train_epochs
    iters_left = patience
    best_precision = 0
    num_not_improved = 0
    global_step = 0
    logger.info('\nstart training:%s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print("start training!")

    # train and evaluate
    for epoch in range(args.num_train_epochs):
        # batchlize
        if not args.neg_fix:
            sample_train_dataset_negs = train_neg_sample(train_dataset_negs, args.neg_size)
            train_data = train_batchlize(train_dataset_question, train_dataset_gold,
                                         sample_train_dataset_negs, args.batch_size, args.neg_size)
            print("train data batchlized............")
        train_right = 0
        train_total = 0
        # logging
        print('start time')
        start_time = datetime.now()
        logger.info('\nstart training:%s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print(start_time)
        model.train()
        optimizer.zero_grad()
        loss_epoch = 0  # total loss for this epoch
        (batches_train_question, batches_train_gold, batches_train_negs) = train_data
        for step, (batch_train_question, batch_train_gold, batch_train_negs) in enumerate(
                zip(batches_train_question, batches_train_gold, batches_train_negs)):
            batch_train_question = (t.cuda() for t in batch_train_question)
            batch_train_gold = (t.cuda() for t in batch_train_gold)
            batch_train_negs = (t.cuda() for t in batch_train_negs)
            scores = model(batch_train_question, batch_train_gold, batch_train_negs)
            (pos_score, neg_scores) = scores
            pos_score = pos_score.expand_as(neg_scores).reshape(-1)
            neg_scores = neg_scores.reshape(-1)
            assert len(pos_score) == len(neg_scores)
            ones = torch.ones(pos_score.shape)
            if not args.no_cuda:
                ones = ones.cuda()
            loss = criterion(pos_score, neg_scores, ones)
            # evaluate train
            result = (torch.sum(pos_score.reshape(-1, args.neg_size) >
                                neg_scores.reshape(-1, args.neg_size), -1) == args.neg_size).cpu()
            train_right += torch.sum(result).item()
            train_total += len(result)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            loss_epoch += loss
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(
                    global_step / num_train_steps, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
        # logging
        end_time = datetime.now()
        logger.info('\ntrain epoch %d time span:%s' % (epoch, end_time - start_time))
        print('train loss', loss_epoch.item())
        logger.info('train loss:%f' % loss_epoch.item())
        print('train result', train_right, train_total, 1.0 * train_right / train_total)
        logger.info(('train result', train_right, train_total, 1.0 * train_right / train_total))
        # evaluate
        right, total, precision = evaluate(args, model, valid_dataset, valid_rawdata, epoch)
        # right, total, precision = 0, 0, 0.0
        # logging
        print('valid result', right, total, precision)
        print('epoch time')
        print(datetime.now())
        print('*' * 20)
        logger.info("epoch:%d\t" % epoch +
                    "dev_Accuracy-----------------------%d/%d=%f\n" % (right, total, precision))
        end_time = datetime.now()
        logger.info('dev epoch %d time span:%s' % (epoch, end_time - start_time))
        if precision > best_precision:
            best_precision = precision
            iters_left = patience
            print("epoch %d saved\n" % epoch)
            logger.info("epoch %d saved\n" % epoch)
            # Save a trained model
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            iters_left -= 1
            if iters_left == 0:
                break
    logger.info('finish training!')
    print('finish training!')
def forward(self,
            input_ids=None,
            token_type_ids=None,
            attention_mask=None,
            labels=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            semantic_labels=None,
            custom_hyperparameters=None):
    r"""
    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
        Labels for computing the multiple choice classification loss.
        Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second
        dimension of the input tensors. (see `input_ids` above)
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
    lambda_1, lambda_2, margin_1, margin_2 = custom_hyperparameters

    flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
    flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
    flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
    flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
    flat_inputs_embeds = (inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
                          if inputs_embeds is not None else None)

    outputs = self.roberta(
        flat_input_ids,
        position_ids=flat_position_ids,
        token_type_ids=flat_token_type_ids,
        attention_mask=flat_attention_mask,
        head_mask=head_mask,
        inputs_embeds=flat_inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    pooled_output = outputs[1]
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    reshaped_logits = logits.view(-1, num_choices)

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(reshaped_logits, labels)
        if semantic_labels is not None:
            loss_fct1 = MarginRankingLoss(margin=margin_1)
            loss_fct2 = MarginRankingLoss(margin=margin_2)
            semantic_type_p = (1 + semantic_labels).true_divide(2)
            semantic_type_p_total = semantic_type_p.sum(1)
            semantic_type_n = (1 - semantic_labels).true_divide(2)
            semantic_type_n_total = semantic_type_n.sum(1)
            scores_t = reshaped_logits[[i for i in range(len(labels))], labels] * semantic_type_p_total
            scores_p = (reshaped_logits * semantic_type_p).sum(1)
            scores_n, _ = (reshaped_logits * semantic_type_n).max(dim=1)
            scores_n *= semantic_type_p_total
            gold = torch.ones(len(semantic_labels))
            if torch.cuda.is_available():
                gold = gold.cuda()
            scores1 = loss_fct1(scores_t, scores_p, gold)
            scores2 = loss_fct2(scores_p, scores_n, gold)
            loss = loss + lambda_1 * scores1 + lambda_2 * scores2

    if not return_dict:
        output = (reshaped_logits, ) + outputs[2:]
        return ((loss, ) + output) if loss is not None else output

    return MultipleChoiceModelOutput(
        loss=loss,
        logits=reshaped_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
def train_func(self):
    step = 0
    plm_lr = self.args.plm_learning_rate
    rerank_lr = self.args.rank_learning_rate
    model = load_rerank_model(self.args)
    true_score_func = get_score_func(model, 'true', inference=False)
    false_score_func = get_score_func(model, 'false', inference=False)
    if torch.cuda.is_available():
        model.cuda()
    loss_fct = MarginRankingLoss(margin=1, reduction='mean')
    if self.args.separate_learning_rate:
        params = [(k, v) for k, v in model.named_parameters() if v.requires_grad]
        non_bert_params = {
            'params': [v for k, v in params if not k.startswith('plm_model.')]
        }
        bert_params = {
            'params': [v for k, v in params if k.startswith('plm_model.')],
            'lr': plm_lr
        }
        # optimizer = torch.optim.Adam([bert_params, non_bert_params], lr=rerank_lr)
        optimizer = AdamW([non_bert_params, bert_params], lr=rerank_lr)
    else:
        optimizer = AdamW(model.parameters(), plm_lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=self.args.scheduler_step,
                                          gamma=self.args.scheduler_gamma)
    accumulate_step = 0
    for epoch in range(1, self.args.epoch + 1):
        for batch in self.train_loader:
            model.train()
            true_scores = true_score_func(batch)
            false_scores = false_score_func(batch)
            # y all 1s to indicate positive should be higher
            y = torch.ones(len(true_scores)).float()
            if torch.cuda.is_available():
                y = y.cuda()
            loss = loss_fct(true_scores, false_scores, y)
            loss.backward()
            self.writer.add_scalar('loss', loss, step)
            accumulate_step += 1
            # torch.nn.utils.clip_grad_value_(model.parameters(), 0.01)
            stop_scheduler_step = self.args.scheduler_step * 8
            if accumulate_step % self.args.gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                # if self.args.scheduler_lr and step <= stop_scheduler_step:
                if self.args.scheduler_lr:  # and step <= stop_scheduler_step:
                    scheduler.step()
                accumulate_step = 0
            step += 1
            if step % self.args.save_model_step == 0:
                model_basename = self.args.dest_base_dir + self.args.exp_name
                model_basename += '_epoch_{}_step_{}'.format(epoch, step)
                torch.save(model.state_dict(), model_basename + '.model')
                write_json(model_basename + '.json', vars(self.args))
                map_top3 = self.evaluate(model, 5, model_basename)
                self.writer.add_scalar('map@3', map_top3, step)
                self.logger.info('step {} map@3 {:.4f}'.format(step, map_top3))
def train(gpu=None):
    logs = {
        'train': tensorboard_logger.Logger(tb_path + "/train"),
        'prs': tensorboard_logger.Logger(tb_path + "/prs"),
        'spr': tensorboard_logger.Logger(tb_path + "/sp"),
        'r2': tensorboard_logger.Logger(tb_path + "/r2"),
    }
    db = Dataset(training_path, testing_path, post_map_path, feature_path,
                 aux_path, attr_path, settings['min_images'])
    print 'Training Attributes:', db.attr_names
    model = neural_net(num_attributes=len(db.attr_inds), aux_size=len(db.aux_list))

    if resume_train is None:
        start_train = 0
    else:
        epochs_str = [el.split('_')[-1].split('.')[0]
                      for el in glob.glob('log/' + resume_train + "/*.dat")]
        if 'model' in epochs_str:
            epochs_str.remove('model')
        last_epoch = np.max([int(el) for el in epochs_str])
        # last_epoch = np.max([int(el.split('_')[-1][0]) for el in glob.glob('log/' + resume_train + "/*.dat")])
        resume_path = 'log/' + resume_train + "/vgg_model_ep_" + str(last_epoch) + ".dat"
        start_train = last_epoch + 1
        if gpu is not None:
            model.load_state_dict(torch.load(resume_path, map_location='cuda:' + str(gpu)))
        else:
            model.load_state_dict(torch.load(resume_path, map_location=lambda gpu, loc: gpu))

    # Initializing PyTorch Dataloader
    dataloader = DataLoader(db, batch_size=settings['batch_size'], shuffle=True, num_workers=4)
    mr_loss = MarginRankingLoss(margin=0.3).to(gpu)
    optimizer = optim.Adadelta(model.parameters(), lr=settings['lr'], weight_decay=1e-5)
    model = model.to(gpu)

    step = 0
    for epoch in range(start_train, settings['num_epochs']):
        print 'Epoch', epoch
        pbar = tqdm(total=db.__len__())
        for i_batch, sample_batched in enumerate(dataloader):
            optimizer.zero_grad()
            image_1 = sample_batched['image_1'].type(torch.FloatTensor)
            image_2 = sample_batched['image_2'].type(torch.FloatTensor)
            aux_1 = sample_batched['label_1'].type(torch.FloatTensor).to(gpu)
            aux_2 = sample_batched['label_2'].type(torch.FloatTensor).to(gpu)
            gt = (aux_1 > aux_2).type(torch.FloatTensor)
            reg_loss_1 = torch.zeros(image_1.shape[0], dtype=torch.float32)
            reg_loss_2 = torch.zeros(image_1.shape[0], dtype=torch.float32)
            ranking_loss = torch.zeros(image_1.shape[0], dtype=torch.float32)
            if gpu is not None:
                image_1 = image_1.to(gpu)
                image_2 = image_2.to(gpu)
                aux_1 = aux_1.to(gpu)
                aux_2 = aux_2.to(gpu)
                gt = gt.to(gpu)
                reg_loss_1 = reg_loss_1.to(gpu)
                reg_loss_2 = reg_loss_2.to(gpu)
                ranking_loss = ranking_loss.to(gpu)
            out_1 = model(image_1)
            out_2 = model(image_2)
            for i in range(len(db.attr_inds)):  # avg over attributes
                ranking_loss += mr_loss(out_1[i], out_2[i], gt[:, i])
            ranking_loss = ranking_loss / len(db.attr_inds)
            if fixed_std:
                p = [torch.distributions.normal.Normal(aux_1[:, i], 0.1)
                     for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_1[i].mean(1).squeeze(), out_1[i].std(1).squeeze())
                     for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_1 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_1 = reg_loss_1 / len(db.attr_inds)
                p = [torch.distributions.normal.Normal(aux_2[:, i], 0.1)
                     for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_2[i].mean(1).squeeze(), out_2[i].std(1).squeeze())
                     for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_2 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_2 = reg_loss_2 / len(db.attr_inds)
            else:
                p = [torch.distributions.normal.Normal(aux_1[:, i], model.aux_stds[sample_batched['aux_1'], i])
                     for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_1[i].mean(1).squeeze(), out_1[i].std(1).squeeze())
                     for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_1 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_1 = reg_loss_1 / len(db.attr_inds)
                p = [torch.distributions.normal.Normal(aux_2[:, i], model.aux_stds[sample_batched['aux_2'], i])
                     for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_2[i].mean(1).squeeze(), out_2[i].std(1).squeeze())
                     for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_2 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_2 = reg_loss_2 / len(db.attr_inds)
            ranking_loss = ranking_loss.mean()  # avg over batch
            reg_loss = reg_loss_1.mean() + reg_loss_2.mean()  # avg over batch
            loss = reg_loss + ranking_loss
            step += 1
            logs['train'].log_value('loss', loss.item(), step)
            loss.backward()
            optimizer.step()
            _loss = loss.item()
            pbar.update(image_1.shape[0])
        pbar.close()
        if epoch % 50 == 0:
            model.eval()
            test(model, db, gpu, logs=logs, step=step)
            model.train()
        persist_model(model, experiment_folder + '/vgg_model_ep_' + str(epoch) + '.dat')

    # Performing final evaluation
    model.eval()
    test(model, db, gpu)
    persist_model(model, model_path)
    return
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0
    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # calc max_seq_len using quest dataset
    # max_seq_len = QUESTDataset(
    #     df=trn_df,
    #     mode='train',
    #     tokens=[],
    #     augment=[],
    #     pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
    # ).MAX_SEQUENCE_LENGTH
    # max_seq_len = 9458
    # max_seq_len = 1504
    max_seq_len = 512

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True, pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False, pin_memory=True)

        fobj = BCEWithLogitsLoss()
        # fobj = MSELoss()
        pair_fobj = MarginRankingLoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            pretrained_model_name_or_path=MODEL_PRETRAIN,
            # cat_num=5,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader, pair_fobj)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)
            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [val_metric_raws, ]
            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model, optimizer, scheduler, histories,
                val_y_preds, val_y_trues, val_qa_ids,
                fold, epoch, val_loss, val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', trn_dataset.tokenizer)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
def forward(
    self,
    input_ids,
    attention_mask,
    valid_ids,
    active_mask,
    valid_output,
    labels=None,
    chunk_labels=None,
    chunk_mask=None,
):
    """
    active_mask : mention_mask for ngrams = torch.LongTensor([[1,2,1,3,4,5,4], [1,2,3,0,4,4,0]])
    labels : for ngrams labels = torch.LongTensor([[1,-1,-1,1,-1], [1,-1,-1,1,0]])
    """
    # --------------------------------------------------------------------------------
    # Bert Embedding Outputs
    outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
    sequence_output = outputs[0]

    # --------------------------------------------------------------------------------
    # Valid Outputs : get first token vector
    batch_size = sequence_output.size(0)
    for i in range(batch_size):
        valid_num = sum(valid_ids[i]).item()
        vectors = sequence_output[i][valid_ids[i] == 1]
        valid_output[i, :valid_num].copy_(vectors)

    # --------------------------------------------------------------------------------
    # Dropout
    sequence_output = self.dropout(valid_output)

    # --------------------------------------------------------------------------------
    # CNN Outputs
    cnn_outputs = self.cnn2gram(sequence_output)  # shape = (batch_size, max_gram_num, 512)

    # --------------------------------------------------------------------------------
    # Classifier 512 to 1
    classifier_scores = self.classifier(cnn_outputs)  # shape = (batch_size, max_gram_num, 1)
    classifier_scores = classifier_scores.squeeze(-1)  # shape = (batch_size, max_gram_num)
    classifier_scores = classifier_scores.unsqueeze(1).expand(
        active_mask.size())  # shape = (batch_size, max_diff_ngram_num, max_gram_num)
    classifier_scores = classifier_scores.masked_fill(mask=active_mask, value=-float("inf"))

    # --------------------------------------------------------------------------------
    # Merge TF :
    # shape = (batch_size * max_diff_ngram_num * max_gram_num) to (batch_size * max_diff_ngram_num)
    total_scores, indices = torch.max(classifier_scores, dim=-1)

    # --------------------------------------------------------------------------------
    # Total Loss Compute
    if labels is not None and chunk_labels is not None:
        # *************************************************************************************
        # [1] Chunk Loss
        Chunk_Loss_Fct = CrossEntropyLoss(reduction="mean")
        active_chunk_loss = chunk_mask.view(-1) != -1
        chunk_logits = self.chunk_classifier(cnn_outputs)  # shape = (batch_size * num_gram, 2)
        active_chunk_logits = chunk_logits.view(-1, self.num_labels)[active_chunk_loss]
        active_chunk_label_loss = chunk_labels.view(-1) != -1
        active_chunk_labels = chunk_labels.view(-1)[active_chunk_label_loss]
        chunk_loss = Chunk_Loss_Fct(active_chunk_logits, active_chunk_labels)

        # *************************************************************************************
        # [2] Rank Loss
        Rank_Loss_Fct = MarginRankingLoss(margin=1, reduction="mean")
        device = torch.device("cuda", total_scores.get_device())
        flag = torch.FloatTensor([1]).to(device)
        rank_losses = []
        for i in range(batch_size):
            score = total_scores[i]
            label = labels[i]
            true_score = score[label == 1]
            neg_score = score[label == -1]
            rank_losses.append(
                Rank_Loss_Fct(true_score.unsqueeze(-1), neg_score.unsqueeze(0), flag))
        rank_loss = torch.mean(torch.stack(rank_losses))

        # *************************************************************************************
        # [3] Total Loss
        tot_loss = rank_loss + chunk_loss
        return tot_loss
    else:
        return total_scores  # shape = (batch_size * max_differ_gram_num)
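The per-example rank loss above relies on broadcasting: true_score.unsqueeze(-1) against neg_score.unsqueeze(0) gives MarginRankingLoss one hinge term per positive/negative pair. A self-contained toy illustration of that pairing (the scores are invented for the example):

import torch
from torch.nn import MarginRankingLoss

rank_loss_fct = MarginRankingLoss(margin=1, reduction="mean")
true_score = torch.tensor([2.0, 1.5])       # scores of gold n-grams
neg_score = torch.tensor([0.5, 1.0, 2.0])   # scores of negative n-grams
flag = torch.tensor([1.0])                  # target +1 broadcasts to every pair

# (2, 1) vs (1, 3) broadcast to a (2, 3) grid of positive/negative pairs
loss = rank_loss_fct(true_score.unsqueeze(-1), neg_score.unsqueeze(0), flag)
print(loss)  # mean of max(0, -(pos - neg) + 1) over all 6 pairs = 0.5 here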
def train(args):
    # random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # gpu
    if not args.no_cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
        print('using GPU %s' % args.gpu)
    # word vector
    corpus = Corpus()
    vocab, embed = corpus.load_embed(args.fn_embed)
    print("finish loading external word embedding, the shape is:")
    print(embed.shape)
    # model
    model_dict = {'lstm_comparing': BiLSTM_Encoding_Comparing,
                  'char_lstm_comparing': Char_BiLSTM_Encoding_Comparing}
    print("current model is", args.model)
    model_name = model_dict[args.model]
    if not args.no_cuda:
        embed = embed.cuda()
    model = model_name(args, embed)
    if not args.no_cuda:
        model.cuda()
    print(model)

    train_questions_raw, train_golds_raw, train_negs_raw = corpus.load_data(args.fn_train, 'train')
    valid_questions_raw, valid_golds_raw, valid_negs_raw = corpus.load_data(args.fn_valid, 'valid')
    train_questions = corpus.numericalize(train_questions_raw, args.input_mode)
    train_golds = corpus.numericalize(train_golds_raw, args.input_mode)
    train_negs = []
    for line in train_negs_raw:
        train_negs.append(corpus.numericalize(line, args.input_mode))
    # from pdb import set_trace
    # set_trace()
    if isinstance(train_questions, tuple):
        print("train data loaded!%d questions totally" % len(train_questions[0]))
    else:
        print("train data loaded!%d questions totally" % len(train_questions))
    valid_questions = corpus.numericalize(valid_questions_raw, args.input_mode)
    valid_golds = corpus.numericalize(valid_golds_raw, args.input_mode)
    valid_negs = []
    for index, line in enumerate(valid_negs_raw):
        valid_negs.append(corpus.numericalize(line, args.input_mode))
    if isinstance(valid_questions, tuple):
        print("valid data loaded!%d questions totally" % len(valid_questions[0]))
    else:
        print("valid data loaded!%d questions totally" % len(valid_questions))
    valid_dataset = (valid_questions, valid_golds, valid_negs)
    print("character vocabulary size", corpus.len_char_dict())

    # dump vocab
    corpus.dump_vocab(args.vocab_word, mode='word')
    corpus.dump_vocab(args.vocab_char, mode='char')

    # training settings
    optimizer_dict = {"adam": Adam}
    optimizer_name = optimizer_dict[args.optimizer]
    print("choose optimizer:%s" % args.optimizer)
    optimizer = optimizer_name(model.parameters(), lr=args.learning_rate)
    criterion = MarginRankingLoss(margin=args.margin)
    patience = args.patience
    num_train_epochs = args.num_train_epochs
    iters_left = patience
    best_precision = 0
    num_not_improved = 0
    global_step = 0
    logger.info('\nstart training:%s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print("start training!")

    for epoch in range(args.num_train_epochs):
        # batchlize
        sample_train_negs = train_neg_sample(train_negs, args.neg_size, mode=args.input_mode)
        sample_train = (train_questions, train_golds, sample_train_negs)
        train_batches = train_batchlize(sample_train, args.batch_size, mode=args.input_mode)
        print("train data batchlized............")
        train_right = 0
        train_total = 0
        # logging
        print('start time')
        start_time = datetime.now()
        logger.info('\nstart training:%s' % datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print(start_time)
        model.train()
        optimizer.zero_grad()
        loss_epoch = 0  # total loss for this epoch
        for step, batch in enumerate(train_batches):
            # if not args.no_cuda:
            #     batch = (t.cuda() for t in batch)
            question_batch, gold_batch, negs_batch = batch
            pos_score, neg_scores = model(question_batch, gold_batch, negs_batch)
            pos_score = pos_score.expand_as(neg_scores).reshape(-1)
            neg_scores = neg_scores.reshape(-1)
            assert pos_score.shape == neg_scores.shape
            ones = torch.ones(pos_score.shape)
            if not args.no_cuda:
                ones = ones.cuda()
            loss = criterion(pos_score, neg_scores, ones)
            # evaluate train
            result = (torch.sum(pos_score.view(-1, args.neg_size) >
                                neg_scores.view(-1, args.neg_size), -1) == args.neg_size)
            train_right += torch.sum(result).item()
            train_total += len(result)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_epoch += loss
        # logging
        end_time = datetime.now()
        logger.info('\ntrain epoch %d time span:%s' % (epoch, end_time - start_time))
        print('train loss', loss_epoch.item())
        logger.info('train loss:%f' % loss_epoch.item())
        print('train result', train_right, train_total, 1.0 * train_right / train_total)
        logger.info(('train result', train_right, train_total, 1.0 * train_right / train_total))
        # eval
        right, total, precision = evaluate_char(args, model, valid_dataset)
        # print
        print('valid result', right, total, precision)
        print('epoch time')
        print(datetime.now())
        print('*' * 20)
        logger.info("epoch:%d\t" % epoch +
                    "dev_Accuracy-----------------------%d/%d=%f\n" % (right, total, precision))
        end_time = datetime.now()
        logger.info('dev epoch %d time span:%s' % (epoch, end_time - start_time))
        if precision > best_precision:
            best_precision = precision
            iters_left = patience
            print("epoch %d saved\n" % epoch)
            logger.info("epoch %d saved\n" % epoch)
            # Save a trained model
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(args.output_dir, "best_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            iters_left -= 1
            if iters_left == 0:
                break
    logger.info('finish training!')
    print('finish training!')
def forward(self, input_ids, attention_mask, valid_ids, active_mask,
            valid_output, labels=None):
    """
    active_mask : mention_mask for ngrams = torch.LongTensor([[1,2,1,3,4,5,4], [1,2,3,0,4,4,0]])
    labels : for ngrams labels = torch.LongTensor([[1,-1,-1,1,-1], [1,-1,-1,1,0]])
    """
    # --------------------------------------------------------------------------------
    # Bert Embedding Outputs
    outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
    sequence_output = outputs[0]

    # --------------------------------------------------------------------------------
    # Valid Outputs : get first token vector
    batch_size = sequence_output.size(0)
    for i in range(batch_size):
        valid_num = sum(valid_ids[i]).item()
        vectors = sequence_output[i][valid_ids[i] == 1]
        valid_output[i, :valid_num].copy_(vectors)

    # --------------------------------------------------------------------------------
    # Dropout
    sequence_output = self.dropout(valid_output)

    # --------------------------------------------------------------------------------
    # CNN Outputs
    cnn_outputs = self.cnn2gram(sequence_output)  # shape = (batch_size, max_gram_num, 512)

    # --------------------------------------------------------------------------------
    # Classifier 512 to 1
    classifier_scores = self.classifier(cnn_outputs)  # shape = (batch_size, max_gram_num, 1)
    classifier_scores = classifier_scores.squeeze(-1)
    classifier_scores = classifier_scores.unsqueeze(1).expand(
        active_mask.size())  # shape = (batch_size, max_diff_ngram_num, max_gram_num)
    classifier_scores = classifier_scores.masked_fill(
        mask=active_mask.byte(), value=-float('inf'))

    # --------------------------------------------------------------------------------
    # Merge TF :
    # shape = (batch_size * max_diff_ngram_num * max_gram_num) to (batch_size * max_diff_ngram_num)
    total_scores, indices = torch.max(classifier_scores, dim=-1)

    # --------------------------------------------------------------------------------
    # Loss Compute
    if labels is not None:
        Rank_Loss_Fct = MarginRankingLoss(margin=1, reduction='mean')
        device = torch.device("cuda", total_scores.get_device())
        flag = torch.FloatTensor([1]).to(device)
        rank_losses = []
        for i in range(batch_size):
            score = total_scores[i]
            label = labels[i]
            true_score = score[label == 1]
            neg_score = score[label == -1]
            rank_losses.append(
                Rank_Loss_Fct(true_score.unsqueeze(-1), neg_score.unsqueeze(0), flag))
        rank_loss = torch.mean(torch.stack(rank_losses))
        return rank_loss
    else:
        return total_scores  # shape = (batch_size * max_differ_gram_num)
    model = FeedForward(len(rel2id), len(ent2id), dim=config['embedding_dim'])
elif args.model == 'ffs':
    model = FeedForward_Source(len(rel2id), len(ent2id), len(src2id), dim=config['embedding_dim'])
elif args.model == 'hyte':
    model = HyTE(len(rel2id), len(ent2id), len(src2id), dim=config['embedding_dim'],
                 norm=config['norm'], margin=config['margin'], l2reg=config['l2reg'])
# model.to(device)

# Logger
if args.mode.startswith('train'):
    logger = Logger(config['name'], ['loss', 'val_loss', 'MR', 'MRR', 'h@10'])
else:
    logger = None

# Loss function
criterion = MarginRankingLoss(config['margin'], reduction='sum')

# Batch loader
loader = BatchLoader(train, bernoulli_p, goldens, all_ents, all_sources,
                     batch_size=config['batch_size'], neg_ratio=config['neg_ratio'])

# =========================================
# Initialize OPTIMIZER
# =========================================
if config['optim'] == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['l2reg'])
elif config['optim'] == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=config['learning_rate'], weight_decay=config['l2reg'])
else:
    optimizer = optim.SGD(model.parameters(), lr=config['learning_rate'], weight_decay=config['l2reg'])

trainer = Trainer(model, train, val, test, optimizer, criterion, logger, loader, config)
def __init__(self, margin=1.0):
    super().__init__()
    self.loss = MarginRankingLoss(margin=margin)
    self.margin = margin
import torch
from torch.nn import MarginRankingLoss

criterion = MarginRankingLoss(margin=0.0, size_average=None, reduce=None, reduction='mean')

x1 = torch.randn(32, 3)  # randn instead of torch.Tensor(32, 3), which returns uninitialized memory
print(x1)
x2 = torch.randn(32, 3)
y = torch.ones([32, 3])  # y = 1 means x1 should be ranked higher than x2

loss = criterion(x1, x2, y)
print(loss)
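A smaller, hand-checkable variant of the snippet above (values chosen for illustration): with y = 1 the loss penalizes any pair where x1 does not exceed x2 by at least the margin.

import torch
from torch.nn import MarginRankingLoss

# Per-pair loss is max(0, -y * (x1 - x2) + margin).
criterion = MarginRankingLoss(margin=0.5, reduction='none')

x1 = torch.tensor([3.0, 1.0, 2.0])
x2 = torch.tensor([1.0, 2.0, 1.8])
y = torch.ones(3)  # every pair expects x1 > x2

print(criterion(x1, x2, y))  # tensor([0.0000, 1.5000, 0.3000])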
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')
    # raw_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  # + additional_tokens
        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df, raw_pseudo_df2, opt_pseudo_df2, half_opt_pseudo_df2], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset, batch_size=BATCH_SIZE, sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True, pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False, pin_memory=True)

        # fobj = BCEWithLogitsLoss()
        # fobj = FocalLossKaggle(gamma=2)
        fobj = MarginRankingLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            cat_last_layer_num=1,
            do_ratio=0.2,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader, DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')
            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [val_metric_raws, ]
            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model, optimizer, scheduler, histories,
                val_y_preds, val_y_trues, val_qa_ids,
                fold, epoch, val_loss, val_metric,
            )
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer, clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
def train(gpu=None):
    # Loading post image features
    with open(post_map_path, 'r') as f:
        code_list = pickle.load(f)
    image_features = np.load(feature_path)
    brands = build_brand_list(data_path + 'brand_list.csv')
    brand_list = brands['username'].tolist()
    model = VggModel(len(brand_list))

    # Initializing PyTorch Dataloader
    db = BrandDataset(training_path, code_list, image_features, brand_list)
    dataloader = DataLoader(db, batch_size=256, shuffle=True, num_workers=0)
    loss_function = MarginRankingLoss(margin=0.3)
    if gpu:
        model.cuda(gpu)
        loss_function.cuda(gpu)
    optimizer_rel = optim.Adadelta(model.parameters(), lr=1)

    for epoch in range(20):
        for i_batch, sample_batched in enumerate(dataloader):
            model.zero_grad()
            image_pos = Variable(sample_batched['image_p'])
            image_neg = Variable(sample_batched['image_n'])
            brand = Variable(sample_batched['brand'])
            ones = Variable(torch.ones(image_pos.size()[0], 1))
            if gpu:
                image_pos.cuda(gpu)
                image_neg.cuda(gpu)
                brand.cuda(gpu)
                ones.cuda(gpu)

            # Forwarding the network for positive and negative samples
            out_pos = model({'image': image_pos, 'brand': brand})
            out_neg = model({'image': image_neg, 'brand': brand})
            loss = loss_function(out_pos, out_neg, ones)
            loss.backward()
            optimizer_rel.step()

            # Computing evaluation metrics on testing/validation set
            if (i_batch % eval_freq == 0) & (i_batch > 0):
                model.eval()
                test = test_ranking(model, testing_path, code_list, image_features, brands, gpu)
                model.train()
                persist_model(model, experiment_folder + '/vgg_model.dat')
                print 'Epoch:', epoch, 'batch', i_batch, \
                    'Tr_Loss:', loss.item(), \
                    'Testing MedR:', test[0], \
                    'Testing AUC:', test[1], \
                    'Testing cAUC:', test[2], \
                    'Testing NDCG@10:', test[3], \
                    'Testing NDCG@50:', test[4]
            else:
                print 'Epoch:', epoch, 'batch', i_batch, 'Tr_Loss:', loss.item()
        persist_model(model, experiment_folder + '/vgg_model_ep_' + str(epoch) + '.dat')

    # Performing final evaluation
    model.eval()
    test = test_ranking(model, testing_path, code_list, image_features, brands, gpu)
    model.train()
    persist_model(model, model_path)
    print 'Final Result: ', \
        'MedR:', test[0], \
        'AUC:', test[1], \
        'cAUC:', test[2], \
        'NDCG@10:', test[3], \
        'NDCG@50:', test[4]
    return
def train(args, model, processor, tokenizer, device, n_gpu):
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    data, num_examples = features(args, processor, "train", tokenizer)
    data = TensorDataset(*data)
    if args.local_rank == -1:
        sampler = RandomSampler(data)
    else:
        sampler = DistributedSampler(data)
    data_loader = DataLoader(data, sampler=sampler, batch_size=args.train_batch_size)

    step_size = args.gradient_accumulation_steps * args.num_train_epochs
    num_train_optimization_steps = len(data_loader) // step_size

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex to use "
                              "distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                             t_total=num_train_optimization_steps)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", num_examples)
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    model.train()
    loss_fct = MarginRankingLoss(margin=args.margin)
    ckpt_num = 0
    eval_results_history = []
    best = 0.
    best_props = {}
    eval_result = None
    no_improvement = 0
    t = time.time()
    try:
        for num_epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            if no_improvement > args.tolerance:
                logger.info("No improvement in last %d evaluations, early stopping")
                logger.info("epoch: {} | nb_tr_steps: {} | global_step: {} | tr_loss: {}".format(
                    num_epoch, nb_tr_steps, global_step, tr_loss))
            for step, batch in enumerate(tqdm(data_loader, desc="Iteration")):
                print(nb_tr_steps)
                batch = tuple(t.to(device) for t in batch)
                input_ids, segment_ids, mask_ids = batch
                # <question, +ve doc> pairs
                input_ids_qp, segment_ids_qp, input_mask_qp = \
                    input_ids[:, 0, :], segment_ids[:, 0, :], mask_ids[:, 0, :]
                # <question, -ve doc> pairs
                input_ids_qn, segment_ids_qn, input_mask_qn = \
                    input_ids[:, 1, :], segment_ids[:, 1, :], mask_ids[:, 1, :]
                pos_scores = model(input_ids_qp, segment_ids_qp, input_mask_qp)
                neg_scores = model(input_ids_qn, segment_ids_qn, input_mask_qn)
                # y all 1s to indicate positive should be higher
                y = torch.ones(len(pos_scores)).float().to(device)
                loss = loss_fct(pos_scores, neg_scores, y)
                if nb_tr_steps % 10 == 0 and nb_tr_steps != 0:
                    logger.info("+ve scores : %r" % pos_scores)
                    logger.info("-ve scores : %r" % neg_scores)
                    logger.info("Train step loss : %0.5f" % loss.item())
                    if global_step > 0:
                        logger.info("Train total loss : %0.5f" % (tr_loss / global_step))
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles
                        # this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)
                if nb_tr_steps % config.eval_every_step == 0 and nb_tr_steps != 0:
                    eval_result = eval(args, model, processor, tokenizer, device,
                                       tr_loss, global_step)
                    if eval_result["f1"] >= best:
                        save(model,
                             "%s_%0.3f_%0.3f_%0.3f" % (args.model_name,
                                                       eval_result["precision"],
                                                       eval_result["recall"],
                                                       eval_result["f1"]),
                             args, tokenizer, ckpt_num)
                        best = eval_result["f1"]
                        best_props["num_epoch"] = num_epoch
                        best_props["nb_tr_steps"] = nb_tr_steps
                        best_props["tr_loss"] = tr_loss / global_step
                        best_props["ckpt_num"] = ckpt_num
                        best_props["global_step"] = global_step
                        best_props["eval_result"] = eval_result
                        with open(os.path.join(config.output_dir, "best.json"), "w") as wf:
                            json.dump(best_props, wf, indent=2)
                        # make predictions with best model
                        for i in range(1, 6):
                            predict(args, model, processor, tokenizer, device, i)
                        no_improvement = 0
                    else:
                        no_improvement += 1
                    ckpt_num += 1
                    eval_results_history.append((ckpt_num, eval_result))
    except KeyboardInterrupt:
        logger.info("Training interrupted!")
        if eval_result is not None:
            save(model,
                 "%s_%0.3f_%0.3f_%0.3f_interrupted" % (args.model_name,
                                                       eval_result["precision"],
                                                       eval_result["recall"],
                                                       eval_result["f1"]),
                 args, tokenizer, ckpt_num)

    t = time.time() - t
    logger.info("Training took %0.3f seconds" % t)
    loss = tr_loss / global_step
    logger.info("Final training loss %0.5f" % loss)
    logger.info("Best F1-score on eval set : %0.3f" % best)
    logger.info("***** Eval best props *****")
    for key in sorted(best_props.keys()):
        if key != "eval_result":
            logger.info("  %s = %s", key, str(best_props[key]))
        else:
            for eval_key in sorted(best_props[key].keys()):
                logger.info("  %s = %s", eval_key, str(best_props[key][eval_key]))
    with open(os.path.join(config.output_dir, "eval_results_history.pkl"), "wb") as wf:
        pickle.dump(eval_results_history, wf)