Example #1
    def __init__(self, config, tasks):
        super(Inference, self).__init__()
        self.config = config
        self.tasks = tasks
        if config.branching_encoder:
            utils.log("Build Branching Bert Encoder")
            self.encoder = BranchingBertModel.from_pretrained(
                config.bert_model,
                encoder_structure=config.branching_structure)
        else:
            utils.log("Build {}:{} Encoder".format(config.encoder_type,
                                                   config.bert_model))
            self.encoder = get_encoder(config.encoder_type).from_pretrained(
                config.bert_model, output_attentions=config.output_attentions)

        utils.log("Build Task Modules")
        self.tasks_modules = nn.ModuleDict()
        for task in tasks:
            if task.has_module:
                self.tasks_modules.update([(task.name, task.get_module())])
        self.task_dict = dict([(task.name, task) for task in self.tasks])
        self.dummy_input = torch.rand(1, 10, requires_grad=True)

        # self.encoder = HighwayLSTM(num_layers=3, input_size=300, hidden_size=200, layer_dropout=0.2)
        # self.word_embedding = nn.Embedding(self.config.external_vocab_size, self.config.external_vocab_embed_size)
        # self.word_embedding.weight.data.copy_(torch.from_numpy(np.load(config.external_embeddings)))
        # print("Loading embedding from {}".format(config.external_embeddings))

        self.loss_max_margin = MarginRankingLoss(margin=config.max_margin)  # margin-based ranking objective
        self.distance = nn.PairwiseDistance(p=1)  # L1 pairwise distance
Example #2
 def __init__(
     self,
     device: bool,
     margin=None,
     mining=False,
 ) -> None:
     self._margin = margin  # margin should be small
     self._mining = mining
     if margin is not None:
         '''
         MarginRankingLoss(x1, x2, y) = max(0, -y*(x1-x2) + margin)
         if y=1
             max(0, -x_neg + x_pos + margin)
         '''
         super(TripletLoss, self).__init__(
             MarginRankingLoss(margin=margin),
             device,
         )
     else:
         '''
         SoftMarginLoss(x, y) = sum( log(1+exp(-y_i*x_i)) )
         '''
         super(TripletLoss, self).__init__(
             SoftMarginLoss(),
             device,
         )
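
A minimal standalone sketch of the y=1 case spelled out in the docstring above, applying MarginRankingLoss directly to anchor-negative and anchor-positive distances (the tensors, dimensions, and margin below are illustrative, not taken from this class):

import torch
from torch.nn import MarginRankingLoss, PairwiseDistance

dist = PairwiseDistance(p=2)
anchor, positive, negative = (torch.randn(4, 128) for _ in range(3))
d_pos = dist(anchor, positive)     # x_pos: distance to the positive sample
d_neg = dist(anchor, negative)     # x_neg: distance to the negative sample
y = torch.ones_like(d_pos)         # y = 1: the first argument (d_neg) should be the larger one
loss = MarginRankingLoss(margin=0.2)(d_neg, d_pos, y)
# per pair: max(0, -(d_neg - d_pos) + 0.2) = max(0, d_pos - d_neg + 0.2), then averaged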
Example #3
    def __init__(self, interface, learning_rate=3e-4, batch_size=32, margin=10, num_samples=100, user_embedding_dim=10,
                 item_embedding_dim=10, user_meta_dim=15, item_meta_dim=15, meta_meta_dim=30, dense_1_dim=32,
                 dense_2_dim=15, dropout=0.5):
        self.interface = interface

        self.margin = margin
        self.learning_rate = learning_rate

        self.user_embedding_dim = user_embedding_dim
        self.item_embedding_dim = item_embedding_dim
        self.user_meta_dim = user_meta_dim
        self.item_meta_dim = item_meta_dim
        self.meta_meta_dim = meta_meta_dim
        self.dense_1_dim = dense_1_dim
        self.dense_2_dim = dense_2_dim
        self.dropout = dropout
        self.network = SiameseNetwork(interface, user_embedding_dim=self.user_embedding_dim,
                                      item_embedding_dim=item_embedding_dim, user_meta_dim=user_meta_dim,
                                      item_meta_dim=item_meta_dim, meta_meta_dim=meta_meta_dim, dense_1_dim=dense_1_dim,
                                      dense_2_dim=dense_2_dim, dropout=dropout)
        self.dataset = DataGenerator(interface.state_history, interface.rewards_history, interface.action_history)
        self.batch_size = batch_size
        self.num_samples = num_samples

        self.loss = MarginRankingLoss(margin=margin, reduction='none')
        self.optimizer = Adam(self.network.parameters(), lr=learning_rate)
Example #4
 def reset(self, n):
     self.network = SiameseNetwork(self.interface, user_embedding_dim=self.user_embedding_dim,
                                   item_embedding_dim=self.item_embedding_dim, user_meta_dim=self.user_meta_dim,
                                   item_meta_dim=self.item_meta_dim, meta_meta_dim=self.meta_meta_dim,
                                   dense_1_dim=self.dense_1_dim, dense_2_dim=self.dense_2_dim, dropout=self.dropout)
     self.dataset = DataGenerator(self.interface.state_history, self.interface.rewards_history,
                                  self.interface.action_history)
     self.loss = MarginRankingLoss(margin=self.margin, reduction='none')
     self.optimizer = Adam(self.network.parameters(), lr=self.learning_rate)
     self.train(n)
Example #5
 def calculate_hinge_loss(fine_log_probs, other_log_probs):
     loss_fct = MarginRankingLoss(margin=1.609)
     length = len(other_log_probs)
     temp_tensor = []
     for i in range(length):
         temp_tensor.append(fine_log_probs)
     temp_tensor = torch.cat(temp_tensor, dim=0)
     other_log_probs = torch.cat(other_log_probs, dim=0)
     y_vec = torch.ones(length).to(device)
     loss = loss_fct(temp_tensor, other_log_probs, y_vec)
     return loss
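
The loop above only repeats fine_log_probs once per negative, so the same loss can be written without the Python loop; a minimal sketch, assuming (as the loop implies) that fine_log_probs is a 1-element tensor and other_log_probs is a list of 1-element tensors:

import torch
from torch.nn import MarginRankingLoss

def calculate_hinge_loss_vectorized(fine_log_probs, other_log_probs):
    loss_fct = MarginRankingLoss(margin=1.609)
    others = torch.cat(other_log_probs, dim=0)
    fines = fine_log_probs.expand_as(others)  # broadcast the single positive score
    y = torch.ones_like(others)               # +1: fine_log_probs should score higher
    return loss_fct(fines, others, y)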
Example #6
    def __init__(self, word_embeddings: TextFieldEmbedder, vocab: Vocabulary,
                 loss: str, hinge_margin: float) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings

        self.out = torch.nn.Linear(
            in_features=word_embeddings.get_output_dim(), out_features=1)
        self.accuracy = BooleanAccuracy()
        self.loss_name = loss
        if loss == 'hinge':
            self.loss = MarginRankingLoss(margin=hinge_margin,
                                          reduction='mean')
        else:
            self.loss = BCEWithLogitsLoss(reduction='mean')
        self.sigmoid = torch.nn.Sigmoid()
Example #7
def setup(args):

    # Logger
    logger = Logger(args.name, ['loss', 'val_loss', 'MR', 'MRR', 'h@10'])

    # Loss function
    criterion = MarginRankingLoss(args.margin, reduction='sum')

    # Batch loader
    loader = BatchLoader(train,
                         bernoulli_p,
                         goldens,
                         all_ents,
                         all_sources,
                         batch_size=args.batch_size,
                         neg_ratio=args.neg_ratio)
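    # NOTE: train, bernoulli_p, goldens, all_ents, all_sources and the returned optimizer are
    # assumed to come from the enclosing scope; their construction is not shown in this excerpt.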

    return logger, criterion, optimizer, loader
Example #8
 def __init__(self, margin):
     super().__init__()
     self.loss = MarginRankingLoss(margin=margin, reduction='sum')
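
For context, a minimal sketch of how a sum-reduced MarginRankingLoss like this one is typically invoked on batched positive/negative scores (the names and shapes below are illustrative):

import torch
from torch.nn import MarginRankingLoss

loss_fn = MarginRankingLoss(margin=1.0, reduction='sum')
pos_scores = torch.randn(8, requires_grad=True)
neg_scores = torch.randn(8, requires_grad=True)
target = torch.ones(8)  # +1: pos_scores should exceed neg_scores by at least the margin
loss = loss_fn(pos_scores, neg_scores, target)  # sum of max(0, neg - pos + 1.0) over the batch
loss.backward()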
Example #9
def train(args, bert_field, model):

    Dataset = Data(args)

    # datasets
    train_rawdata = Dataset.load('train')
    valid_rawdata = Dataset.load('valid')

    (train_rawdata_questions, train_rawdata_gold, train_rawdata_neg) = train_rawdata
    (valid_rawdata_questions, valid_rawdata_gold, valid_rawdata_neg) = valid_rawdata
    train_dataset_question = Dataset.numericalize(bert_field, train_rawdata_questions)
    train_dataset_gold = Dataset.numericalize(bert_field, train_rawdata_gold)
    train_dataset_negs = []
    for one_neg in train_rawdata_neg:
        train_dataset_neg = Dataset.numericalize(bert_field, one_neg) # train_dataset_neg is a tuple(subwords, lens, mask)
        train_dataset_negs.append(train_dataset_neg)
    print('train data loaded!')

    if args.neg_fix:
    # batchlize
        # sample_train_dataset_negs = train_neg_sample(train_dataset_negs, args.neg_size)
        # train_data = train_batchlize(train_dataset_question, train_dataset_gold, sample_train_dataset_negs, args.batch_size, args.neg_size, syntax_embed=train_syntax_embed, hidden_embed=args.syntax_hidden_embed) 

        # print("train data batchlized............")
        sample_train_dataset_negs = train_neg_sample(train_dataset_negs, args.neg_size)
        train_data = train_batchlize(train_dataset_question, train_dataset_gold, sample_train_dataset_negs, args.batch_size, args.neg_size)
        print("train data batchlized............")

    valid_dataset_question = Dataset.numericalize(bert_field, valid_rawdata_questions)
    valid_dataset_gold = Dataset.numericalize(bert_field, valid_rawdata_gold)
    valid_dataset_negs = []
    for index, one_neg in enumerate(valid_rawdata_neg):
        if not one_neg:
            print('no neg paths', index)
        valid_dataset_neg = Dataset.numericalize(bert_field, one_neg)
        valid_dataset_negs.append(valid_dataset_neg)

    valid_dataset = (valid_dataset_question, valid_dataset_gold, valid_dataset_negs)
    print('valid data loaded!')

    # num of train steps
    print('train examples',len(train_rawdata_questions))
    num_train_steps = int(
            len(train_rawdata_questions) / args.batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]        
    
    optimizer = BertAdam(optimizer_grouped_parameters,
                            lr=args.learning_rate,
                            warmup=args.warmup_proportion,
                            t_total=num_train_steps)
    
    # loss function
    criterion = MarginRankingLoss(margin=args.margin)

    # train params
    patience = args.patience
    num_train_epochs = args.num_train_epochs
    iters_left = patience
    best_precision = 0
    num_not_improved = 0
    global_step = 0

    logger.info('\nstart training:%s'%datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print("start training!")

    # train and evaluate
    for epoch in range(args.num_train_epochs):
        
        # batchlize
        if not args.neg_fix:
            sample_train_dataset_negs = train_neg_sample(train_dataset_negs, args.neg_size)
            train_data = train_batchlize(train_dataset_question, train_dataset_gold, sample_train_dataset_negs, args.batch_size, args.neg_size)
            print("train data batchlized............")

        train_right = 0
        train_total = 0
        # logging
        print('start time')
        start_time = datetime.now()
        logger.info('\nstart training:%s'%datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print(start_time)

        model.train()
        optimizer.zero_grad()
        loss_epoch = 0  # total loss for this epoch
        (batches_train_question, batches_train_gold, batches_train_negs) = train_data
        for step,(batch_train_question, batch_train_gold, batch_train_negs) in enumerate(zip(batches_train_question,batches_train_gold, batches_train_negs)):
            batch_train_question = (t.cuda() for t in batch_train_question)
            batch_train_gold = (t.cuda() for t in batch_train_gold)
            batch_train_negs = (t.cuda() for t in batch_train_negs)
            scores = model(batch_train_question, batch_train_gold, batch_train_negs)
            (pos_score, neg_scores) = scores
            pos_score = pos_score.expand_as(neg_scores).reshape(-1)
            neg_scores = neg_scores.reshape(-1)
            assert len(pos_score) == len(neg_scores)
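            # targets of +1: the gold path should score at least `margin` above every sampled negative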
            ones = torch.ones(pos_score.shape)
            if not args.no_cuda:
                ones = ones.cuda()
            loss = criterion(pos_score, neg_scores, ones)
            
            # evaluate train
            result = (torch.sum(pos_score.reshape(-1, args.neg_size) > neg_scores.reshape(-1, args.neg_size),-1) == args.neg_size).cpu()

            train_right += torch.sum(result).item()
            train_total += len(result)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            loss_epoch += loss
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
        
        # logging
        end_time = datetime.now()
        logger.info('\ntrain epoch %d time span:%s'%(epoch, end_time-start_time))
        print('train loss', loss_epoch.item())
        logger.info('train loss:%f'%loss_epoch.item())
        print('train result', train_right, train_total, 1.0*train_right/train_total)
        logger.info(('train result', train_right, train_total, 1.0*train_right/train_total))

        # evaluate
        right, total, precision = evaluate(args, model, valid_dataset, valid_rawdata, epoch)
        # right, total, precision = 0, 0, 0.0

        # logging
        print('valid result', right, total, precision)
        print('epoch time')
        print(datetime.now())
        print('*'*20)
        logger.info("epoch:%d\t"%epoch+"dev_Accuracy-----------------------%d/%d=%f\n"%(right, total, precision))
        end_time = datetime.now()
        logger.info('dev epoch %d time span:%s'%(epoch,end_time-start_time))
        
        if precision > best_precision:
            best_precision = precision
            iters_left = patience
            print("epoch %d saved\n"%epoch)
            logger.info("epoch %d saved\n"%epoch)
            # Save a trained model
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            iters_left -= 1
            if iters_left == 0:
                break
    logger.info('finish training!')
    print('finish training!')
Example #10
    def forward(self,
                input_ids=None,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,
                semantic_labels=None,
                custom_hyperparameters=None):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[
            1] if input_ids is not None else inputs_embeds.shape[1]

        lambda_1, lambda_2, margin_1, margin_2 = custom_hyperparameters

        flat_input_ids = input_ids.view(
            -1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(
            -1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(
            -1,
            token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(
            -1,
            attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (inputs_embeds.view(-1, inputs_embeds.size(-2),
                                                 inputs_embeds.size(-1))
                              if inputs_embeds is not None else None)

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

            if semantic_labels is not None:
                loss_fct1 = MarginRankingLoss(margin=margin_1)
                loss_fct2 = MarginRankingLoss(margin=margin_2)

                semantic_type_p = (1 + semantic_labels).true_divide(2)
                semantic_type_p_total = semantic_type_p.sum(1)
                semantic_type_n = (1 - semantic_labels).true_divide(2)
                semantic_type_n_total = semantic_type_n.sum(1)

                scores_t = reshaped_logits[[i for i in range(len(labels))],
                                           labels] * semantic_type_p_total

                scores_p = (reshaped_logits * semantic_type_p).sum(1)

                scores_n, _ = (reshaped_logits * semantic_type_n).max(dim=1)
                scores_n *= semantic_type_p_total
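                # The two ranking terms below: loss_fct1 ranks the gold choice's logit (scaled by the
                # number of positive choices) above the summed positive-choice logits by margin_1;
                # loss_fct2 ranks that sum above the best negative-choice logit (same scaling) by margin_2.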

                gold = torch.ones(len(semantic_labels))
                if torch.cuda.is_available():
                    gold = gold.cuda()
                scores1 = loss_fct1(scores_t, scores_p, gold)
                scores2 = loss_fct2(scores_p, scores_n, gold)
                loss = loss + lambda_1 * scores1 + lambda_2 * scores2

        if not return_dict:
            output = (reshaped_logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
Example #11
    def train_func(self):
        step = 0
        plm_lr = self.args.plm_learning_rate
        rerank_lr = self.args.rank_learning_rate
        model = load_rerank_model(self.args)
        true_score_func = get_score_func(model, 'true', inference=False)
        false_score_func = get_score_func(model, 'false', inference=False)
        if torch.cuda.is_available():
            model.cuda()
        loss_fct = MarginRankingLoss(margin=1, reduction='mean')

        if self.args.separate_learning_rate:
            params = [(k, v) for k, v in model.named_parameters()
                      if v.requires_grad]
            non_bert_params = {
                'params':
                [v for k, v in params if not k.startswith('plm_model.')]
            }
            bert_params = {
                'params': [v for k, v in params if k.startswith('plm_model.')],
                'lr': plm_lr
            }
            # optimizer = torch.optim.Adam([bert_params, non_bert_params], lr=rerank_lr)
            optimizer = AdamW([non_bert_params, bert_params], lr=rerank_lr)
        else:
            optimizer = AdamW(model.parameters(), plm_lr)
        scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=self.args.scheduler_step,
            gamma=self.args.scheduler_gamma)
        accumulate_step = 0

        for epoch in range(1, self.args.epoch + 1):
            for batch in self.train_loader:
                model.train()
                true_scores = true_score_func(batch)
                false_scores = false_score_func(batch)
                # y all 1s to indicate positive should be higher
                y = torch.ones(len(true_scores)).float()
                if torch.cuda.is_available():
                    y = y.cuda()

                loss = loss_fct(true_scores, false_scores, y)
                loss.backward()
                self.writer.add_scalar('loss', loss, step)
                accumulate_step += 1

                # torch.nn.utils.clip_grad_value_(model.parameters(), 0.01)
                stop_scheduler_step = self.args.scheduler_step * 8
                if accumulate_step % self.args.gradient_accumulate_step == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    # if self.args.scheduler_lr and step <= stop_scheduler_step:
                    if self.args.scheduler_lr:  # and step <= stop_scheduler_step:
                        scheduler.step()
                    accumulate_step = 0

                step += 1
                if step % self.args.save_model_step == 0:
                    model_basename = self.args.dest_base_dir + self.args.exp_name
                    model_basename += '_epoch_{}_step_{}'.format(epoch, step)
                    torch.save(model.state_dict(), model_basename + '.model')
                    write_json(model_basename + '.json', vars(self.args))
                    map_top3 = self.evaluate(model, 5, model_basename)
                    self.writer.add_scalar('map@3', map_top3, step)
                    self.logger.info('step {} map@3 {:.4f}'.format(
                        step, map_top3))
Example #12
def train(gpu=None):
    logs = {
    'train':tensorboard_logger.Logger(tb_path + "/train"),
    'prs':tensorboard_logger.Logger(tb_path + "/prs"),
    'spr':tensorboard_logger.Logger(tb_path + "/sp"),
    'r2':tensorboard_logger.Logger(tb_path + "/r2"),
    }

    db = Dataset(training_path, testing_path, post_map_path, feature_path, aux_path, attr_path, settings['min_images'])
    print('Training Attributes:', db.attr_names)

    model = neural_net(num_attributes=len(db.attr_inds), aux_size=len(db.aux_list))

    if resume_train is None:
        start_train = 0
    else:
        epochs_str = [el.split('_')[-1].split('.')[0] for el in glob.glob('log/' + resume_train + "/*.dat")]
        if 'model' in epochs_str:
            epochs_str.remove('model')
        last_epoch = np.max([int(el) for el in epochs_str])
        # last_epoch = np.max([int(el.split('_')[-1][0]) for el in glob.glob('log/' + resume_train + "/*.dat")])
        resume_path = 'log/' + resume_train + "/vgg_model_ep_" + str(last_epoch) + ".dat"
        start_train = last_epoch + 1
        if gpu is not None:
            model.load_state_dict(torch.load(resume_path, map_location='cuda:' + str(gpu)))
        else:
            model.load_state_dict(torch.load(resume_path, map_location=lambda gpu, loc: gpu))

    # Initializing PyTorch Dataloader
    dataloader = DataLoader(db, batch_size=settings['batch_size'], shuffle=True, num_workers=4)

    mr_loss = MarginRankingLoss(margin=0.3).to(gpu)

    optimizer = optim.Adadelta(model.parameters(), lr=settings['lr'], weight_decay=1e-5)

    model = model.to(gpu)

    step = 0
    for epoch in range(start_train, settings['num_epochs']):
        print('Epoch', epoch)
        pbar = tqdm(total=db.__len__())
        for i_batch, sample_batched in enumerate(dataloader):
            optimizer.zero_grad()
            image_1 = sample_batched['image_1'].type(torch.FloatTensor)
            image_2 = sample_batched['image_2'].type(torch.FloatTensor)

            aux_1 = sample_batched['label_1'].type(torch.FloatTensor).to(gpu)
            aux_2 = sample_batched['label_2'].type(torch.FloatTensor).to(gpu)

            gt = (aux_1 > aux_2).type(torch.FloatTensor)
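            # NOTE: MarginRankingLoss expects targets of +1/-1; with this 0/1 target, pairs where
            # aux_1 <= aux_2 contribute a constant `margin` term with zero gradient.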

            reg_loss_1 = torch.zeros(image_1.shape[0], dtype=torch.float32)
            reg_loss_2 = torch.zeros(image_1.shape[0], dtype=torch.float32)
            ranking_loss = torch.zeros(image_1.shape[0], dtype=torch.float32)

            if gpu is not None:
                image_1 = image_1.to(gpu)
                image_2 = image_2.to(gpu)
                aux_1 = aux_1.to(gpu)
                aux_2 = aux_2.to(gpu)
                gt = gt.to(gpu)
                reg_loss_1 = reg_loss_1.to(gpu)
                reg_loss_2 = reg_loss_2.to(gpu)
                ranking_loss = ranking_loss.to(gpu)

            out_1 = model(image_1)
            out_2 = model(image_2)

            for i in range(len(db.attr_inds)):  # avg over attributes
                ranking_loss += mr_loss(out_1[i], out_2[i], gt[:, i])
            ranking_loss = ranking_loss / len(db.attr_inds)

            if fixed_std:
                p = [torch.distributions.normal.Normal(aux_1[:, i], 0.1) for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_1[i].mean(1).squeeze(), out_1[i].std(1).squeeze()) for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_1 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_1 = reg_loss_1 / len(db.attr_inds)
                p = [torch.distributions.normal.Normal(aux_2[:, i], 0.1) for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_2[i].mean(1).squeeze(), out_2[i].std(1).squeeze()) for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_2 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_2 = reg_loss_2 / len(db.attr_inds)
            else:
                p = [torch.distributions.normal.Normal(aux_1[:, i], model.aux_stds[sample_batched['aux_1'], i]) for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_1[i].mean(1).squeeze(), out_1[i].std(1).squeeze()) for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_1 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_1 = reg_loss_1 / len(db.attr_inds)
                p = [torch.distributions.normal.Normal(aux_2[:, i], model.aux_stds[sample_batched['aux_2'], i]) for i in range(len(db.attr_inds))]
                q = [torch.distributions.normal.Normal(out_2[i].mean(1).squeeze(), out_2[i].std(1).squeeze()) for i in range(len(db.attr_inds))]
                for i in range(len(db.attr_inds)):  # avg over attributes
                    reg_loss_2 += torch.distributions.kl.kl_divergence(p[i], q[i])
                reg_loss_2 = reg_loss_2 / len(db.attr_inds)

            ranking_loss = ranking_loss.mean()  # avg over batch
            reg_loss = reg_loss_1.mean() + reg_loss_2.mean()  # avg over batch

            loss = reg_loss + ranking_loss

            step += 1
            logs['train'].log_value('loss', loss.item(), step)

            loss.backward()
            optimizer.step()

            _loss = loss.item()
            pbar.update(image_1.shape[0])

        pbar.close()

        if epoch % 50 == 0:
            model.eval()
            test(model, db, gpu, logs=logs, step=step)
            model.train()
            persist_model(model, experiment_folder + '/vgg_model_ep_' + str(epoch) + '.dat')

    # Performing final evaluation
    model.eval()
    test(model, db, gpu)
    persist_model(model, model_path)
    return
Example #13
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0

    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)

    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # calc max_seq_len using quest dataset
    # max_seq_len = QUESTDataset(
    #     df=trn_df,
    #     mode='train',
    #     tokens=[],
    #     augment=[],
    #     pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
    # ).MAX_SEQUENCE_LENGTH
    # max_seq_len = 9458
    # max_seq_len = 1504
    max_seq_len = 512

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ', logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        # fobj = MSELoss()
        pair_fobj = MarginRankingLoss()
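        # pairwise ranking objective (default margin=0.0), passed to train_one_epoch alongside fobj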
        model = BertModelForBinaryMultiLabelClassifier(num_labels=len(LABEL_COL),
                                                       pretrained_model_name_or_path=MODEL_PRETRAIN,
                                                       # cat_num=5,
                                                       token_size=len(
                                                           trn_dataset.tokenizer),
                                                       MAX_SEQUENCE_LENGTH=max_seq_len,
                                                       )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(
                model, fobj, optimizer, trn_loader, pair_fobj)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [trn_loss, ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [val_loss, ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [val_metric, ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [val_metric_raws, ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
Example #14
    def forward(
        self,
        input_ids,
        attention_mask,
        valid_ids,
        active_mask,
        valid_output,
        labels=None,
        chunk_labels=None,
        chunk_mask=None,
    ):
        """
        active_mask : mention_mask for ngrams = torch.LongTensor([[1,2,1,3,4,5,4], [1,2,3,0,4,4,0]])
        labels : for ngrams labels = torch.LongTensor([[1,-1,-1,1,-1], [1,-1,-1,1,0]])
        """
        # --------------------------------------------------------------------------------
        # Bert Embedding Outputs
        outputs = self.distilbert(input_ids=input_ids,
                                  attention_mask=attention_mask)

        sequence_output = outputs[0]

        # --------------------------------------------------------------------------------
        # Valid Outputs : get first token vector
        batch_size = sequence_output.size(0)
        for i in range(batch_size):
            valid_num = sum(valid_ids[i]).item()

            vectors = sequence_output[i][valid_ids[i] == 1]
            valid_output[i, :valid_num].copy_(vectors)

        # --------------------------------------------------------------------------------
        # Dropout
        sequence_output = self.dropout(valid_output)

        # --------------------------------------------------------------------------------
        # CNN Outputs
        cnn_outputs = self.cnn2gram(
            sequence_output)  # shape = (batch_size, max_gram_num, 512)

        # --------------------------------------------------------------------------------
        # Classifier 512 to 1
        classifier_scores = self.classifier(
            cnn_outputs)  # shape = (batch_size, max_gram_num, 1)
        classifier_scores = classifier_scores.squeeze(
            -1)  # shape = (batch_size, max_gram_num)

        classifier_scores = classifier_scores.unsqueeze(1).expand(
            active_mask.size()
        )  # shape = (batch_size, max_diff_ngram_num, max_gram_num)
        classifier_scores = classifier_scores.masked_fill(mask=active_mask,
                                                          value=-float("inf"))

        # --------------------------------------------------------------------------------
        # Merge TF : # shape = (batch_size * max_diff_ngram_num * max_gram_num) to (batch_size * max_diff_ngram_num)
        total_scores, indices = torch.max(classifier_scores, dim=-1)

        # --------------------------------------------------------------------------------
        # --------------------------------------------------------------------------------
        # Total Loss Compute
        if labels is not None and chunk_labels is not None:
            # *************************************************************************************
            # *************************************************************************************
            # [1] Chunk Loss
            Chunk_Loss_Fct = CrossEntropyLoss(reduction="mean")

            active_chunk_loss = chunk_mask.view(-1) != -1
            chunk_logits = self.chunk_classifier(
                cnn_outputs)  # shape = (batch_size * num_gram, 2)
            active_chunk_logits = chunk_logits.view(
                -1, self.num_labels)[active_chunk_loss]

            active_chunk_label_loss = chunk_labels.view(-1) != -1
            active_chunk_labels = chunk_labels.view(
                -1)[active_chunk_label_loss]

            chunk_loss = Chunk_Loss_Fct(active_chunk_logits,
                                        active_chunk_labels)

            # *************************************************************************************
            # *************************************************************************************
            # [2] Rank Loss
            Rank_Loss_Fct = MarginRankingLoss(margin=1, reduction="mean")

            device = torch.device("cuda", total_scores.get_device())
            flag = torch.FloatTensor([1]).to(device)

            rank_losses = []
            for i in range(batch_size):

                score = total_scores[i]
                label = labels[i]

                true_score = score[label == 1]
                neg_score = score[label == -1]
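                # unsqueeze to (num_pos, 1) and (1, num_neg): broadcasting forms every
                # positive/negative pair, so each true n-gram is ranked above each negative one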
                rank_losses.append(
                    Rank_Loss_Fct(true_score.unsqueeze(-1),
                                  neg_score.unsqueeze(0), flag))

            rank_loss = torch.mean(torch.stack(rank_losses))
            # *************************************************************************************
            # *************************************************************************************
            # [3] Total Loss
            tot_loss = rank_loss + chunk_loss
            return tot_loss

        else:
            return total_scores  # shape = (batch_size * max_differ_gram_num)
Example #15
def train(args):
    # random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # gpu
    if not args.no_cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
        print('Using GPU %s' % args.gpu)

    # word vector
    corpus = Corpus()
    vocab, embed = corpus.load_embed(args.fn_embed)
    print("finished loading external word embedding, the shape is:")
    print(embed.shape)

    # model
    model_dict = {'lstm_comparing':BiLSTM_Encoding_Comparing, 'char_lstm_comparing':Char_BiLSTM_Encoding_Comparing}
    print("current model is", args.model)
    model_name = model_dict[args.model]
    if not args.no_cuda:
        embed = embed.cuda()
    model = model_name(args, embed)
    if not args.no_cuda:
        model.cuda()
    print(model)

    train_questions_raw, train_golds_raw, train_negs_raw = corpus.load_data(args.fn_train, 'train')
    valid_questions_raw, valid_golds_raw, valid_negs_raw = corpus.load_data(args.fn_valid, 'valid')

    train_questions = corpus.numericalize(train_questions_raw, args.input_mode)
    
    train_golds = corpus.numericalize(train_golds_raw, args.input_mode)
    train_negs = []
    for line in train_negs_raw:
        train_negs.append(corpus.numericalize(line, args.input_mode))
    
    # from pdb import set_trace
    # set_trace()
    if isinstance(train_questions, tuple):
        print("train data loaded!%d questions totally"%len(train_questions[0]))
    else:
        print("train data loaded!%d questions totally"%len(train_questions))

    valid_questions = corpus.numericalize(valid_questions_raw, args.input_mode)
    valid_golds = corpus.numericalize(valid_golds_raw, args.input_mode)
    valid_negs = []
    for index, line in enumerate(valid_negs_raw):
        valid_negs.append(corpus.numericalize(line, args.input_mode))
    
    if isinstance(valid_questions, tuple):
        print("valid data loaded!%d questions totally"%len(valid_questions[0]))
    else:
        print("valid data loaded!%d questions totally"%len(valid_questions))
    
    valid_dataset = (valid_questions, valid_golds, valid_negs)

    print("character vocab size:", corpus.len_char_dict())
    
    # dump vocab
    corpus.dump_vocab(args.vocab_word, mode='word')
    corpus.dump_vocab(args.vocab_char, mode='char')

    # training settings
    optimizer_dict = {"adam":Adam}
    optimizer_name = optimizer_dict[args.optimizer]
    print("choose optimizer:%s"%args.optimizer)
    optimizer = optimizer_name(model.parameters(), lr = args.learning_rate)
    
    criterion = MarginRankingLoss(margin=args.margin)
    
    patience = args.patience
    num_train_epochs = args.num_train_epochs
    iters_left = patience
    best_precision = 0
    num_not_improved = 0
    global_step = 0

    logger.info('\nstart training:%s'%datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print("start training!")

    for epoch in range(args.num_train_epochs):
        # batchlize
        sample_train_negs = train_neg_sample(train_negs, args.neg_size, mode=args.input_mode)
        sample_train = (train_questions, train_golds, sample_train_negs)
        train_batches = train_batchlize(sample_train, args.batch_size, mode=args.input_mode)
        print("train data batchlized............")
        
        # 
        train_right = 0
        train_total = 0
        # logging
        print('start time')
        start_time = datetime.now()
        logger.info('\nstart training:%s'%datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print(start_time)

        model.train()
        optimizer.zero_grad()
        loss_epoch = 0  # total loss for this epoch
        for step, batch in enumerate(train_batches):
            # if not args.no_cuda:
            #     batch = (t.cuda() for t in batch)
            question_batch, gold_batch, negs_batch = batch
            pos_score, neg_scores = model(question_batch, gold_batch, negs_batch)
            
            pos_score = pos_score.expand_as(neg_scores).reshape(-1)
            neg_scores = neg_scores.reshape(-1)
            assert pos_score.shape == neg_scores.shape
            ones = torch.ones(pos_score.shape)
            if not args.no_cuda:
                ones = ones.cuda()
            loss = criterion(pos_score, neg_scores, ones)
            
            # evaluate train
            result = (torch.sum(pos_score.view(-1, args.neg_size) > neg_scores.view(-1, args.neg_size),-1) == args.neg_size)

            train_right += torch.sum(result).item()
            train_total += len(result)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_epoch += loss

        # logging
        end_time = datetime.now()
        logger.info('\ntrain epoch %d time span:%s'%(epoch, end_time-start_time))
        print('train loss', loss_epoch.item())
        logger.info('train loss:%f'%loss_epoch.item())
        print('train result', train_right, train_total, 1.0*train_right/train_total)
        logger.info(('train result', train_right, train_total, 1.0*train_right/train_total))

        # eval
        right, total, precision = evaluate_char(args, model, valid_dataset)

        # print
        print('valid result', right, total, precision)
        print('epoch time')
        print(datetime.now())
        print('*'*20)
        logger.info("epoch:%d\t"%epoch+"dev_Accuracy-----------------------%d/%d=%f\n"%(right, total, precision))
        end_time = datetime.now()
        logger.info('dev epoch %d time span:%s'%(epoch,end_time-start_time))
        
        if precision > best_precision:
            best_precision = precision
            iters_left = patience
            print("epoch %d saved\n"%epoch)
            logger.info("epoch %d saved\n"%epoch)
            # Save a trained model
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(args.output_dir, "best_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            iters_left -= 1
            if iters_left == 0:
                break
    logger.info('finish training!')
    print('finish training!')
Example #16
    def forward(self,
                input_ids,
                attention_mask,
                valid_ids,
                active_mask,
                valid_output,
                labels=None):
        """
        active_mask : mention_mask for ngrams = torch.LongTensor([[1,2,1,3,4,5,4], [1,2,3,0,4,4,0]])
        labels : for ngrams labels = torch.LongTensor([[1,-1,-1,1,-1], [1,-1,-1,1,0]])
        """
        # --------------------------------------------------------------------------------
        # Bert Embedding Outputs
        outputs = self.roberta(input_ids=input_ids,
                               attention_mask=attention_mask)

        sequence_output = outputs[0]

        # --------------------------------------------------------------------------------
        # Valid Outputs : get first token vector
        batch_size = sequence_output.size(0)
        for i in range(batch_size):
            valid_num = sum(valid_ids[i]).item()
            vectors = sequence_output[i][valid_ids[i] == 1]
            valid_output[i, :valid_num].copy_(vectors)

        # --------------------------------------------------------------------------------
        # Dropout
        sequence_output = self.dropout(valid_output)

        # --------------------------------------------------------------------------------
        # CNN Outputs
        cnn_outputs = self.cnn2gram(
            sequence_output)  # shape = (batch_size, max_gram_num, 512)

        # --------------------------------------------------------------------------------
        # Classifier 512 to 1
        classifier_scores = self.classifier(
            cnn_outputs)  # shape = (batch_size, max_gram_num, 1)
        classifier_scores = classifier_scores.squeeze(-1)

        classifier_scores = classifier_scores.unsqueeze(1).expand(
            active_mask.size()
        )  # shape = (batch_size, max_diff_ngram_num, max_gram_num)
        classifier_scores = classifier_scores.masked_fill(
            mask=active_mask.bool(), value=-float('inf'))  # bool mask (byte masks are deprecated)

        # --------------------------------------------------------------------------------
        # Merge TF : # shape = (batch_size * max_diff_ngram_num * max_gram_num) to (batch_size * max_diff_ngram_num)
        total_scores, indices = torch.max(
            classifier_scores,
            dim=-1)  # shape = (batch_size * max_diff_ngram_num)

        # --------------------------------------------------------------------------------
        # Loss Compute
        if labels is not None:
            Rank_Loss_Fct = MarginRankingLoss(margin=1, reduction='mean')

            device = torch.device("cuda", total_scores.get_device())
            flag = torch.FloatTensor([1]).to(device)

            rank_losses = []
            for i in range(batch_size):

                score = total_scores[i]
                label = labels[i]

                true_score = score[label == 1]
                neg_score = score[label == -1]
                rank_losses.append(
                    Rank_Loss_Fct(true_score.unsqueeze(-1),
                                  neg_score.unsqueeze(0), flag))

            rank_loss = torch.mean(torch.stack(rank_losses))
            return rank_loss

        else:
            return total_scores  # shape = (batch_size * max_differ_gram_num)
Example #17
	model = FeedForward(len(rel2id), len(ent2id), dim=config['embedding_dim'])
elif args.model == 'ffs':
	model = FeedForward_Source(len(rel2id), len(ent2id), len(src2id), dim=config['embedding_dim'])
elif args.model == 'hyte':
	model = HyTE(len(rel2id), len(ent2id), len(src2id), dim=config['embedding_dim'], norm=config['norm'], margin=config['margin'], l2reg=config['l2reg'])

# model.to(device)

# Logger
if args.mode.startswith('train'):
	logger = Logger(config['name'], ['loss', 'val_loss', 'MR', 'MRR', 'h@10'])
else:
	logger = None

# Loss function
criterion = MarginRankingLoss(config['margin'], reduction='sum')

# Batch loader
loader = BatchLoader(train, bernoulli_p, goldens, all_ents, all_sources, batch_size=config['batch_size'], neg_ratio=config['neg_ratio'])

# =========================================
# Initialize OPTIMIZER
# =========================================
if config['optim']== 'adam':
	optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['l2reg'])
elif config['optim'] == 'adagrad':
	optimizer = optim.Adagrad(model.parameters(), lr=config['learning_rate'], weight_decay=config['l2reg'])
else:
	optimizer = optim.SGD(model.parameters(), lr=config['learning_rate'], weight_decay=config['l2reg'])

trainer = Trainer(model, train, val, test, optimizer, criterion, logger, loader, config)
Example #18
 def __init__(self, margin=1.0):
     super().__init__()
     self.loss = MarginRankingLoss(margin=margin)
     self.margin = margin
Example #19
import torch
from torch.nn import MarginRankingLoss

criterion = MarginRankingLoss(margin=0.0,
                              size_average=None,  # deprecated; use `reduction` instead
                              reduce=None,        # deprecated; use `reduction` instead
                              reduction='mean')

x1 = torch.randn(32, 3)  # random inputs; torch.Tensor(32, 3) would contain uninitialized memory
print(x1)
x2 = torch.randn(32, 3)
y = torch.ones([32, 3])  # target of +1 everywhere: x1 should rank above x2

loss = criterion(x1, x2, y)
print(loss)
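
For reference, continuing the snippet above, the same value computed directly from the elementwise definition max(0, -y * (x1 - x2) + margin), averaged as reduction='mean' does:

manual = torch.clamp(-y * (x1 - x2) + 0.0, min=0.0).mean()
print(torch.allclose(loss, manual))  # True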
Example #20
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')
    # raw_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  # + additional_tokens

        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df, raw_pseudo_df2, opt_pseudo_df2, half_opt_pseudo_df2], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        # fobj = BCEWithLogitsLoss()
        # fobj = FocalLossKaggle(gamma=2)
        fobj = MarginRankingLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            cat_last_layer_num=1,
            do_ratio=0.2,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            histories['trn_loss'].setdefault(fold, []).append(trn_loss)
            histories['val_loss'].setdefault(fold, []).append(val_loss)
            histories['val_metric'].setdefault(fold, []).append(val_metric)
            histories['val_metric_raws'].setdefault(fold, []).append(
                val_metric_raws)

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
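The train_one_epoch and test helpers used above are not shown in this example, so the call into fobj is not visible here. As a minimal, self-contained sketch of the standard call pattern for torch.nn.MarginRankingLoss (an illustration of the API, not the author's train_one_epoch), the loss takes two score tensors plus a +1/-1 target saying which of the two should score higher:

import torch
from torch.nn import MarginRankingLoss

fobj = MarginRankingLoss()                       # default margin=0.0
pos_scores = torch.randn(8, requires_grad=True)  # scores that should rank higher
neg_scores = torch.randn(8, requires_grad=True)  # scores that should rank lower
target = torch.ones(8)                           # +1: first argument should be larger

loss = fobj(pos_scores, neg_scores, target)      # mean of the per-pair hinge terms
loss.backward()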
Example #21
0
def train(gpu=None):

    # Loading post image features
    with open(post_map_path, 'rb') as f:
        code_list = pickle.load(f)
    image_features = np.load(feature_path)

    brands = build_brand_list(data_path + 'brand_list.csv')
    brand_list = brands['username'].tolist()

    model = VggModel(len(brand_list))

    # Initializing PyTorch Dataloader
    db = BrandDataset(training_path, code_list, image_features, brand_list)
    dataloader = DataLoader(db, batch_size=256, shuffle=True, num_workers=0)

    loss_function = MarginRankingLoss(margin=0.3)
    if gpu is not None:
        model.cuda(gpu)
        loss_function.cuda(gpu)

    optimizer_rel = optim.Adadelta(model.parameters(), lr=1)

    for epoch in range(20):
        for i_batch, sample_batched in enumerate(dataloader):

            model.zero_grad()
            image_pos = Variable(sample_batched['image_p'])
            image_neg = Variable(sample_batched['image_n'])
            brand = Variable(sample_batched['brand'])
            ones = Variable(torch.ones(image_pos.size()[0], 1))

            if gpu is not None:
                # .cuda() is not in-place; reassign so the tensors actually
                # move to the GPU
                image_pos = image_pos.cuda(gpu)
                image_neg = image_neg.cuda(gpu)
                brand = brand.cuda(gpu)
                ones = ones.cuda(gpu)

            # Forwarding the network for positive and negative samples
            out_pos = model({'image': image_pos, 'brand': brand})
            out_neg = model({'image': image_neg, 'brand': brand})

            loss = loss_function(out_pos, out_neg, ones)
            loss.backward()
            optimizer_rel.step()

            # Computing evaluation metrics on testing/validation set
            if (i_batch % eval_freq == 0) and (i_batch > 0):
                model.eval()
                test = test_ranking(model, testing_path, code_list,
                                    image_features, brands, gpu)
                model.train()

                persist_model(model, experiment_folder + '/vgg_model.dat')
                print('Epoch:', epoch, 'batch', i_batch,
                      'Tr_Loss:', loss.item(),
                      'Testing MedR:', test[0],
                      'Testing AUC:', test[1],
                      'Testing cAUC:', test[2],
                      'Testing NDCG@10:', test[3],
                      'Testing NDCG@50:', test[4])
            else:
                print('Epoch:', epoch, 'batch', i_batch,
                      'Tr_Loss:', loss.item())
        persist_model(
            model, experiment_folder + '/vgg_model_ep_' + str(epoch) + '.dat')

    # Performing final evaluation
    model.eval()
    test = test_ranking(model, testing_path, code_list, image_features, brands,
                        gpu)
    model.train()
    persist_model(model, model_path)
    print('Final Result:',
          'MedR:', test[0],
          'AUC:', test[1],
          'cAUC:', test[2],
          'NDCG@10:', test[3],
          'NDCG@50:', test[4])
    return
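Example #21 passes a target of all ones, so MarginRankingLoss(margin=0.3) reduces to a plain hinge on the score difference between the positive and negative image. A small sanity-check sketch of that equivalence (illustrative values only, not taken from the example):

import torch
from torch.nn import MarginRankingLoss

margin = 0.3
out_pos = torch.tensor([0.9, 0.2, 0.6])
out_neg = torch.tensor([0.1, 0.5, 0.6])
ones = torch.ones_like(out_pos)

builtin = MarginRankingLoss(margin=margin)(out_pos, out_neg, ones)
manual = torch.clamp(margin - (out_pos - out_neg), min=0).mean()
assert torch.allclose(builtin, manual)  # both average max(0, margin - (pos - neg))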
Example #22
0
def train(args, model, processor, tokenizer, device, n_gpu):
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    data, num_examples = features(args, processor, "train", tokenizer)
    data = TensorDataset(*data)

    if args.local_rank == -1:
        sampler = RandomSampler(data)
    else:
        sampler = DistributedSampler(data)

    data_loader = DataLoader(data,
                             sampler=sampler,
                             batch_size=args.train_batch_size)
    steps_per_epoch = len(data_loader) // args.gradient_accumulation_steps
    num_train_optimization_steps = steps_per_epoch * args.num_train_epochs

    # Prepare optimizer

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex to use "
                              "distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", num_examples)
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    model.train()
    loss_fct = MarginRankingLoss(margin=args.margin)
    ckpt_num = 0
    eval_results_history = []
    best = 0.
    best_props = {}
    eval_result = None
    no_improvement = 0
    t = time.time()

    try:
        for num_epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            if no_improvement > args.tolerance:
                logger.info(
                    "No improvement in last %d evaluations, early stopping",
                    args.tolerance)
                logger.info(
                    "epoch: {} | nb_tr_steps: {} | global_step: {} | tr_loss: {}"
                    .format(num_epoch, nb_tr_steps, global_step, tr_loss))
                break  # stop training once patience is exhausted

            for step, batch in enumerate(tqdm(data_loader, desc="Iteration")):
                print(nb_tr_steps)
                batch = tuple(t.to(device) for t in batch)
                input_ids, segment_ids, mask_ids = batch

                # <question, +ve doc> pairs
                input_ids_qp, segment_ids_qp, input_mask_qp = \
                    input_ids[:, 0, :], segment_ids[:, 0, :], mask_ids[:, 0, :]
                # <question, -ve doc> pairs
                input_ids_qn, segment_ids_qn, input_mask_qn = \
                    input_ids[:, 1, :], segment_ids[:, 1, :], mask_ids[:, 1, :]

                pos_scores = model(input_ids_qp, segment_ids_qp, input_mask_qp)
                neg_scores = model(input_ids_qn, segment_ids_qn, input_mask_qn)

                # y all 1s to indicate positive should be higher
                y = torch.ones(len(pos_scores)).float().to(device)
                loss = loss_fct(pos_scores, neg_scores, y)
                if nb_tr_steps % 10 == 0 and nb_tr_steps != 0:
                    logger.info("+ve scores : %r" % pos_scores)
                    logger.info("-ve scores : %r" % neg_scores)
                    logger.info("Train step loss : %0.5f" % loss.item())
                    if global_step > 0:
                        logger.info("Train total loss : %0.5f" %
                                    (tr_loss / global_step))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles
                        # this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr',
                                             optimizer.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

                if nb_tr_steps % config.eval_every_step == 0 and nb_tr_steps != 0:
                    eval_result = eval(args, model, processor, tokenizer,
                                       device, tr_loss, global_step)
                    if eval_result["f1"] >= best:
                        save(
                            model, "%s_%0.3f_%0.3f_%0.3f" %
                            (args.model_name, eval_result["precision"],
                             eval_result["recall"], eval_result["f1"]), args,
                            tokenizer, ckpt_num)
                        best = eval_result["f1"]
                        best_props["num_epoch"] = num_epoch
                        best_props["nb_tr_steps"] = nb_tr_steps
                        best_props["tr_loss"] = tr_loss / global_step
                        best_props["ckpt_num"] = ckpt_num
                        best_props["global_step"] = global_step
                        best_props["eval_result"] = eval_result
                        with open(os.path.join(config.output_dir, "best.json"),
                                  "w") as wf:
                            json.dump(best_props, wf, indent=2)

                        # make predictions with best model
                        for i in range(1, 6):
                            predict(args, model, processor, tokenizer, device,
                                    i)
                        no_improvement = 0
                    else:
                        no_improvement += 1

                    ckpt_num += 1
                    eval_results_history.append((ckpt_num, eval_result))

    except KeyboardInterrupt:
        logger.info("Training interrupted!")
        if eval_result is not None:
            save(
                model, "%s_%0.3f_%0.3f_%0.3f_interrupted" %
                (args.model_name, eval_result["precision"],
                 eval_result["recall"], eval_result["f1"]), args, tokenizer,
                ckpt_num)

    t = time.time() - t
    logger.info("Training took %0.3f seconds" % t)
    loss = tr_loss / global_step
    logger.info("Final training loss %0.5f" % loss)
    logger.info("Best F1-score on eval set : %0.3f" % best)
    logger.info("***** Eval best props *****")
    for key in sorted(best_props.keys()):
        if key != "eval_result":
            logger.info("  %s = %s", key, str(best_props[key]))
        else:
            for eval_key in sorted(best_props[key].keys()):
                logger.info("  %s = %s", eval_key,
                            str(best_props[key][eval_key]))

    with open(os.path.join(config.output_dir, "eval_results_history.pkl"),
              "wb") as wf:
        pickle.dump(eval_results_history, wf)
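Example #22 splits the model parameters into weight-decay and no-decay groups for BertAdam. The same grouping works with stock optimizers too; below is a minimal sketch using torch.optim.AdamW instead of the BertAdam/FusedAdam setup above (the Toy model and its parameter names are made up for illustration):

import torch
from torch import nn

class Toy(nn.Module):
    """Tiny stand-in model with HF-style 'LayerNorm' parameter naming."""

    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(16, 16)
        self.LayerNorm = nn.LayerNorm(16)
        self.out = nn.Linear(16, 1)

model = Toy()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=3e-5)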