Example #1
def init_model(config):
    cfg, cfg_data, cfg_model, cfg_optim = read_config(config)

    device, n_gpu = utils.get_device()
    utils.set_seeds(cfg.seed, n_gpu)

    train_batch_size = int(cfg_optim.train_batch_size /
                           cfg_optim.gradient_accumulation_steps)
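    # For illustration (assumed numbers, not from the original config): with
    # train_batch_size=32 and gradient_accumulation_steps=4, each forward/backward
    # pass runs on 8 samples and gradients accumulate over 4 steps, so the
    # effective batch size is still 32.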

    processor = get_class(cfg.task.lower())

    processor.get_train_examples(cfg.data_dir)

    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(cfg.bert_model,
                                              do_lower_case=cfg.do_lower_case)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        cfg.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1),
        num_labels=len(label_list))

    model.to(device)

    # Load the fine-tuned weights, mapping them to CPU when no GPU is available
    if not torch.cuda.is_available():
        model.load_state_dict(
            torch.load(cfg.model_save_pth, map_location='cpu')['state_dict'])
    else:
        model.load_state_dict(torch.load(cfg.model_save_pth)['state_dict'])

    return model, processor, cfg_optim, label_list, tokenizer, device
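
All of the examples on this page call a project-local `set_seeds` helper rather than a library function. For reference, here is a minimal sketch of what such a helper typically does; the exact signature and the `n_gpu` argument are assumptions, and the real implementations in these repositories may differ.

import random

import numpy as np
import torch


def set_seeds(seed, n_gpu=0):
    # Seed every RNG the training code touches so that runs are reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        # Also seed all visible CUDA devices when running on GPU(s).
        torch.cuda.manual_seed_all(seed)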
Example #2
    def train(self):
        """
        Full training logic
        """
        t0 = time()

        for epoch in range(self.start_epoch, self.epochs):
            set_seeds(self.config['seeds'], epoch)

            # Train and Valid losses
            train_loss = self._train_epoch(epoch)
            valid_loss, accuracy, iou, _ = self._valid_epoch(epoch)

            time_elapsed = time() - t0
            print("Epoch %1d completed after %4d secs" % (epoch, time_elapsed))

            self._save_checkpoint(epoch,
                                  train_loss,
                                  valid_loss,
                                  accuracy,
                                  iou,
                                  save_last=True)

            if self.consecutive_stale >= self.consecutive_stale_break:
                break

        mlflow.log_metric('Loss', self.best_loss)
        mlflow.log_metric('IoU', self.best_iou)
        mlflow.log_metric('Accuracy', self.best_accuracy)
        mlflow.log_metric('Epoch', epoch)

        mlflow.end_run()

        return self.best_iou
Example #3
    def train(self):
        """
        Full training logic
        """

        t0 = time()

        for iteration in range(self.start_iter, self.nb_iters):
            set_seeds(self.config['seeds'], iteration)

            # Train and Valid losses
            L_semi, L_seg, L_disc = self._train_iter(iteration)

            iter_time = time() - t0

            if iteration % self.save_period == 0:
                train_loss = (L_semi, L_seg, L_disc)
                valid_loss, accuracy, iou, _ = self._valid_iter(iteration)

                # Log training losses together with an estimated time remaining
                self.logger.info(
                    '> [{}/{} ({:.0f}%), {:.2f}s] Semi_L: {:.6f} - Seg_L: {:.6f} - D_L: {:.6f}'
                    .format(
                        iteration, self.nb_iters,
                        100.0 * iteration / self.nb_iters,
                        iter_time * (self.nb_iters - iteration) /
                        (iteration - self.start_iter + 1), L_semi, L_seg,
                        L_disc))

                self.logger.info(
                    '> [Valid Loss: {:.6f} - Accuracy: {:.2f} - IoU: {:.3f}]'.
                    format(valid_loss, accuracy, iou))

                # Save the checkpoints.
                time_elapsed = time() - t0
                print("Iteration %1d completed after %4d secs" %
                      (iteration, time_elapsed))
                self._save_checkpoint(iteration,
                                      train_loss,
                                      valid_loss,
                                      accuracy,
                                      iou,
                                      save_last=True)

        mlflow.log_metric('Loss', self.best_loss)
        mlflow.log_metric('IoU', self.best_iou)
        mlflow.log_metric('Accuracy', self.best_accuracy)

        mlflow.end_run()

        return self.best_iou
Example #4
    def predict(self):
        t0 = time()

        set_seeds(self.config['seeds'])

        for batch_id, sample in enumerate(self.predict_loader):
            image = sample['image']
            image = image.to(self.device)

            name = sample['name']
            initial_size = sample['initial_size']

            prediction = self.make_prediction(image)

            self.save_prediction(prediction, name, initial_size, image)

        time_elapsed = time() - t0
        print("Prediction completed after %4d secs" % (time_elapsed))
Example #5
def main():
    # Load Configuration
    model_cfg = configuration.model.from_json(cfg.model_cfg)        # BERT_cfg
    set_seeds(cfg.seed)

    # Load Data & Create Criterion
    #data = load_data(cfg)

    #if cfg.uda_mode or cfg.mixmatch_mode:
    #    data_iter = [data.sup_data_iter(), data.unsup_data_iter()] if cfg.mode=='train' \
    #        else [data.sup_data_iter(), data.unsup_data_iter(), data.eval_data_iter()]  # train_eval
    #else:
    #    data_iter = [data.sup_data_iter()]

    # my own implementation
    dataset = DataSet(cfg)
    train_dataset, val_dataset, unsup_dataset = dataset.get_dataset()

    # Create the DataLoaders for our training and validation sets.
    train_dataloader = DataLoader(
                train_dataset,  # The training samples.
                sampler = RandomSampler(train_dataset), # Select batches randomly
                batch_size = cfg.train_batch_size # Trains with this batch size.
            )

    validation_dataloader = DataLoader(
                val_dataset, # The validation samples.
                sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
                batch_size = cfg.eval_batch_size # Evaluate with this batch size.
            )

    unsup_dataloader = None
    if unsup_dataset:
        unsup_dataloader = DataLoader(
            unsup_dataset,
            sampler = RandomSampler(unsup_dataset),
            batch_size = cfg.train_batch_size
        )

    if cfg.uda_mode or cfg.mixmatch_mode:
        data_iter = [train_dataloader, unsup_dataloader, validation_dataloader] 
    else:
        data_iter = [train_dataloader, validation_dataloader]

    ema_optimizer = None
    ema_model = None

    if cfg.model == "custom":
        model = models.Classifier(model_cfg, NUM_LABELS[cfg.task])
    elif cfg.model == "bert":
        model = BertForSequenceClassificationCustom.from_pretrained(
            "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
            num_labels = NUM_LABELS[cfg.task],
            output_attentions = False, # Whether the model returns attentions weights.
            output_hidden_states = False, # Whether the model returns all hidden-states.
        )


    if cfg.uda_mode:
        if cfg.unsup_criterion == 'KL':
            unsup_criterion = nn.KLDivLoss(reduction='none')
        else:
            unsup_criterion = nn.MSELoss(reduction='none')
        sup_criterion = nn.CrossEntropyLoss(reduction='none')
        optimizer = optim.optim4GPU(cfg, model)
    elif cfg.mixmatch_mode:
        train_criterion = SemiLoss()
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
        ema_model = models.Classifier(model_cfg,  NUM_LABELS[cfg.task])
        for param in ema_model.parameters():
            param.detach_()
        ema_optimizer= WeightEMA(cfg, model, ema_model, alpha=cfg.ema_decay)
    else:
        sup_criterion = nn.CrossEntropyLoss(reduction='none')
        optimizer = optim.optim4GPU(cfg, model)
    
    # Create trainer
    trainer = train.Trainer(cfg, model, data_iter, optimizer, get_device(), ema_model, ema_optimizer)

    # loss functions
    def get_sup_loss(model, sup_batch, unsup_batch, global_step):
        # batch
        input_ids, segment_ids, input_mask, og_label_ids, num_tokens = sup_batch

        # convert label ids to hot vectors
        sup_size = input_ids.size(0)
        label_ids = torch.zeros(sup_size, 2).scatter_(1, og_label_ids.cpu().view(-1,1), 1)
        label_ids = label_ids.cuda(non_blocking=True)

        # sup mixup
        sup_l = np.random.beta(cfg.alpha, cfg.alpha)
        sup_l = max(sup_l, 1-sup_l)
        sup_idx = torch.randperm(sup_size)

        if cfg.sup_mixup and 'word' in cfg.sup_mixup:
            if cfg.simple_pad:
                simple_pad(input_ids, input_mask, num_tokens)
                c_input_ids = None
            else:
                input_ids, c_input_ids = pad_for_word_mixup(
                    input_ids, input_mask, num_tokens, sup_idx
                )
        else:
            c_input_ids = None

        # sup loss
        hidden = model(
            input_ids=input_ids, 
            segment_ids=segment_ids, 
            input_mask=input_mask,
            output_h=True,
            mixup=cfg.sup_mixup,
            shuffle_idx=sup_idx,
            clone_ids=c_input_ids,
            l=sup_l,
            manifold_mixup=cfg.manifold_mixup,
            simple_pad=cfg.simple_pad,
            no_grad_clone=cfg.no_grad_clone
        )
        logits = model(input_h=hidden)

        if cfg.sup_mixup:
            label_ids = mixup_op(label_ids, sup_l, sup_idx)

        sup_loss = -torch.sum(F.log_softmax(logits, dim=1) * label_ids, dim=1)

        if cfg.tsa and cfg.tsa != "none":
            tsa_thresh = get_tsa_thresh(cfg.tsa, global_step, cfg.total_steps, start=1./logits.shape[-1], end=1)
            larger_than_threshold = torch.exp(-sup_loss) > tsa_thresh   # prob = exp(log_prob), prob > tsa_threshold
            # larger_than_threshold = torch.sum(  F.softmax(pred[:sup_size]) * torch.eye(num_labels)[sup_label_ids]  , dim=-1) > tsa_threshold
            loss_mask = torch.ones_like(og_label_ids, dtype=torch.float32) * (1 - larger_than_threshold.type(torch.float32))
            sup_loss = torch.sum(sup_loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), torch_device_one())
        else:
            sup_loss = torch.mean(sup_loss)

        return sup_loss, sup_loss, sup_loss, sup_loss


    def get_loss_ict(model, sup_batch, unsup_batch, global_step):
        # batch
        input_ids, segment_ids, input_mask, og_label_ids, num_tokens = sup_batch
        ori_input_ids, ori_segment_ids, ori_input_mask, \
        aug_input_ids, aug_segment_ids, aug_input_mask, \
        ori_num_tokens, aug_num_tokens = unsup_batch

        # convert label ids to hot vectors
        sup_size = input_ids.size(0)
        label_ids = torch.zeros(sup_size, 2).scatter_(1, og_label_ids.cpu().view(-1,1), 1)
        label_ids = label_ids.cuda(non_blocking=True)

        # sup mixup
        sup_l = np.random.beta(cfg.alpha, cfg.alpha)
        sup_l = max(sup_l, 1-sup_l)
        sup_idx = torch.randperm(sup_size)

        if cfg.sup_mixup and 'word' in cfg.sup_mixup:
            if cfg.simple_pad:
                simple_pad(input_ids, input_mask, num_tokens)
                c_input_ids = None
            else:
                input_ids, c_input_ids = pad_for_word_mixup(
                    input_ids, input_mask, num_tokens, sup_idx
                )
        else:
            c_input_ids = None

        # sup loss
        if cfg.model == "bert":
            logits = model(
                input_ids=input_ids,
                c_input_ids=c_input_ids,
                attention_mask=input_mask,
                mixup=cfg.sup_mixup,
                shuffle_idx=sup_idx,
                l=sup_l,
                manifold_mixup = cfg.manifold_mixup,
                no_pretrained_pool=cfg.no_pretrained_pool
            )
        else:
            hidden = model(
                input_ids=input_ids, 
                segment_ids=segment_ids, 
                input_mask=input_mask,
                output_h=True,
                mixup=cfg.sup_mixup,
                shuffle_idx=sup_idx,
                clone_ids=c_input_ids,
                l=sup_l,
                manifold_mixup=cfg.manifold_mixup,
                simple_pad=cfg.simple_pad,
                no_grad_clone=cfg.no_grad_clone
            )
            logits = model(input_h=hidden)

        if cfg.sup_mixup:
            label_ids = mixup_op(label_ids, sup_l, sup_idx)

        sup_loss = -torch.sum(F.log_softmax(logits, dim=1) * label_ids, dim=1)

        if cfg.tsa and cfg.tsa != "none":
            tsa_thresh = get_tsa_thresh(cfg.tsa, global_step, cfg.total_steps, start=1./logits.shape[-1], end=1)
            larger_than_threshold = torch.exp(-sup_loss) > tsa_thresh   # prob = exp(log_prob), prob > tsa_threshold
            # larger_than_threshold = torch.sum(  F.softmax(pred[:sup_size]) * torch.eye(num_labels)[sup_label_ids]  , dim=-1) > tsa_threshold
            loss_mask = torch.ones_like(og_label_ids, dtype=torch.float32) * (1 - larger_than_threshold.type(torch.float32))
            sup_loss = torch.sum(sup_loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), torch_device_one())
        else:
            sup_loss = torch.mean(sup_loss)

        if cfg.no_unsup_loss:
            return sup_loss, sup_loss, sup_loss, sup_loss

        # unsup loss
        with torch.no_grad():
            if cfg.model == "bert":
                ori_logits = model(
                    input_ids = ori_input_ids,
                    attention_mask = ori_input_mask,
                    no_pretrained_pool=cfg.no_pretrained_pool
                )
            else:
                ori_logits = model(ori_input_ids, ori_segment_ids, ori_input_mask)
            ori_prob   = F.softmax(ori_logits, dim=-1)    # KLdiv target


        # mixup
        l = np.random.beta(cfg.alpha, cfg.alpha)
        l = max(l, 1-l)
        # Use the unsupervised batch size; `hidden` is only defined on the non-BERT path above
        idx = torch.randperm(ori_input_ids.size(0))

        
        if cfg.mixup and 'word' in cfg.mixup:
            ori_input_ids, c_ori_input_ids = pad_for_word_mixup(
                ori_input_ids, ori_input_mask, ori_num_tokens, idx
            )
        else:
            c_ori_input_ids = None

        
        #for i in range(0, batch_size):
        #    new_mask = ori_input_mask[i]
        #    new_ids = ori_input_ids[i]
        #    old_ids = c_ori_input_ids[i]
        #    pdb.set_trace()
        if cfg.model == "bert":
            logits = model(
                input_ids=ori_input_ids,
                c_input_ids=c_ori_input_ids,
                attention_mask=ori_input_mask,
                mixup=cfg.mixup,
                shuffle_idx=idx,
                l=l,
                manifold_mixup = cfg.manifold_mixup,
                no_pretrained_pool=cfg.no_pretrained_pool
            )
        else:
            hidden = model(
                input_ids=ori_input_ids, 
                segment_ids=ori_segment_ids, 
                input_mask=ori_input_mask,
                output_h=True,
                mixup=cfg.mixup,
                shuffle_idx=idx,
                clone_ids=c_ori_input_ids,
                l=l,
                manifold_mixup=cfg.manifold_mixup,
                simple_pad=cfg.simple_pad,
                no_grad_clone=cfg.no_grad_clone
            )
            logits = model(input_h=hidden)

        if cfg.mixup:
            ori_prob = mixup_op(ori_prob, l, idx)

        probs_u = torch.softmax(logits, dim=1)
        unsup_loss = torch.mean((probs_u - ori_prob)**2)

        w = cfg.uda_coeff * sigmoid_rampup(global_step, cfg.consistency_rampup_ends - cfg.consistency_rampup_starts)
        final_loss = sup_loss + w*unsup_loss
        return final_loss, sup_loss, unsup_loss, w*unsup_loss

    # evaluation
    def get_acc(model, batch):
        # input_ids, segment_ids, input_mask, label_id, sentence = batch
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)

        result = (label_pred == label_id).float()
        accuracy = result.mean()
        # output_dump.logs(sentence, label_pred, label_id)    # output dump

        return accuracy, result

    if cfg.mode == 'train':
        trainer.train(get_loss, None, cfg.model_file, cfg.pretrain_file)

    if cfg.mode == 'train_eval':
        if cfg.mixmatch_mode:
            trainer.train(get_mixmatch_loss_short, get_acc, cfg.model_file, cfg.pretrain_file)
        elif cfg.uda_test_mode:
            trainer.train(get_sup_loss, get_acc, cfg.model_file, cfg.pretrain_file)
        elif cfg.uda_test_mode_two:
            trainer.train(get_loss_ict, get_acc, cfg.model_file, cfg.pretrain_file)
        else:
            trainer.train(get_sup_loss, get_acc, cfg.model_file, cfg.pretrain_file)

    if cfg.mode == 'eval':
        results = trainer.eval(get_acc, cfg.model_file, None)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy :' , total_accuracy)
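
Example #5 relies on two helpers that are not shown on this page: `mixup_op`, which interpolates a batch with a shuffled copy of itself, and `get_tsa_thresh`, which anneals the confidence threshold used to mask easy supervised examples (Training Signal Annealing, as in the UDA paper). The sketches below assume the standard mixup and UDA formulations; the actual names, signatures, and schedules in the source repository may differ.

import torch


def mixup_op(x, l, idx):
    # Standard mixup: blend each row with a shuffled partner row.
    # l is drawn from Beta(alpha, alpha) and clamped so that l >= 0.5.
    return l * x + (1 - l) * x[idx]


def get_tsa_thresh(schedule, global_step, total_steps, start, end):
    # Training Signal Annealing: grow the threshold from `start`
    # (1 / num_classes) towards `end` (1.0) as training progresses.
    t = torch.tensor(global_step / float(total_steps))
    if schedule == 'linear_schedule':
        alpha = t
    elif schedule == 'exp_schedule':
        alpha = torch.exp((t - 1) * 5)   # stays small early in training
    elif schedule == 'log_schedule':
        alpha = 1 - torch.exp(-t * 5)    # ramps up quickly
    else:
        alpha = torch.tensor(1.0)
    return alpha * (end - start) + start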
Example #6
def main(cfg, model_cfg):
    # Load Configuration
    cfg = configuration.params.from_json(cfg)                   # Train or Eval cfg
    model_cfg = configuration.model.from_json(model_cfg)        # BERT_cfg
    set_seeds(cfg.seed)

    # Load Data & Create Criterion
    data = load_data(cfg)
    if cfg.uda_mode:
        unsup_criterion = nn.KLDivLoss(reduction='none')
        data_iter = [data.sup_data_iter(), data.unsup_data_iter()] if cfg.mode=='train' \
            else [data.sup_data_iter(), data.unsup_data_iter(), data.eval_data_iter()]  # train_eval
    else:
        data_iter = [data.sup_data_iter()]
    sup_criterion = nn.CrossEntropyLoss(reduction='none')
    
    # Load Model
    model = models.Classifier(model_cfg, len(data.TaskDataset.labels))

    # Create trainer
    trainer = train.Trainer(cfg, model, data_iter, optim.optim4GPU(cfg, model), get_device())

    # Training
    def get_loss(model, sup_batch, unsup_batch, global_step):

        # logits -> prob(softmax) -> log_prob(log_softmax)

        # batch
        input_ids, segment_ids, input_mask, label_ids = sup_batch
        if unsup_batch:
            ori_input_ids, ori_segment_ids, ori_input_mask, \
            aug_input_ids, aug_segment_ids, aug_input_mask  = unsup_batch

            input_ids = torch.cat((input_ids, aug_input_ids), dim=0)
            segment_ids = torch.cat((segment_ids, aug_segment_ids), dim=0)
            input_mask = torch.cat((input_mask, aug_input_mask), dim=0)
            
        # logits
        logits = model(input_ids, segment_ids, input_mask)

        # sup loss
        sup_size = label_ids.shape[0]            
        sup_loss = sup_criterion(logits[:sup_size], label_ids)  # shape : train_batch_size
        if cfg.tsa:
            tsa_thresh = get_tsa_thresh(cfg.tsa, global_step, cfg.total_steps, start=1./logits.shape[-1], end=1)
            larger_than_threshold = torch.exp(-sup_loss) > tsa_thresh   # prob = exp(log_prob), prob > tsa_threshold
            # larger_than_threshold = torch.sum(  F.softmax(pred[:sup_size]) * torch.eye(num_labels)[sup_label_ids]  , dim=-1) > tsa_threshold
            loss_mask = torch.ones_like(label_ids, dtype=torch.float32) * (1 - larger_than_threshold.type(torch.float32))
            sup_loss = torch.sum(sup_loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), torch_device_one())
        else:
            sup_loss = torch.mean(sup_loss)

        # unsup loss
        if unsup_batch:
            # ori
            with torch.no_grad():
                ori_logits = model(ori_input_ids, ori_segment_ids, ori_input_mask)
                ori_prob   = F.softmax(ori_logits, dim=-1)    # KLdiv target
                # ori_log_prob = F.log_softmax(ori_logits, dim=-1)

                # confidence-based masking
                if cfg.uda_confidence_thresh != -1:
                    unsup_loss_mask = torch.max(ori_prob, dim=-1)[0] > cfg.uda_confidence_thresh
                    unsup_loss_mask = unsup_loss_mask.type(torch.float32)
                else:
                    unsup_loss_mask = torch.ones(len(logits) - sup_size, dtype=torch.float32)
                unsup_loss_mask = unsup_loss_mask.to(_get_device())
                    
            # aug
            # softmax temperature controlling
            uda_softmax_temp = cfg.uda_softmax_temp if cfg.uda_softmax_temp > 0 else 1.
            aug_log_prob = F.log_softmax(logits[sup_size:] / uda_softmax_temp, dim=-1)

            # KLdiv loss
            """
                nn.KLDivLoss (kl_div)
                input : log_prob (log_softmax)
                target : prob    (softmax)
                https://pytorch.org/docs/stable/nn.html

                unsup_loss is divied by number of unsup_loss_mask
                it is different from the google UDA official
                The official unsup_loss is divided by total
                https://github.com/google-research/uda/blob/master/text/uda.py#L175
            """
            unsup_loss = torch.sum(unsup_criterion(aug_log_prob, ori_prob), dim=-1)
            unsup_loss = torch.sum(unsup_loss * unsup_loss_mask, dim=-1) / torch.max(torch.sum(unsup_loss_mask, dim=-1), torch_device_one())
            final_loss = sup_loss + cfg.uda_coeff*unsup_loss

            return final_loss, sup_loss, unsup_loss
        return sup_loss, None, None

    # evaluation
    def get_acc(model, batch):
        # input_ids, segment_ids, input_mask, label_id, sentence = batch
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)

        result = (label_pred == label_id).float()
        accuracy = result.mean()
        # output_dump.logs(sentence, label_pred, label_id)    # output dump

        return accuracy, result

    if cfg.mode == 'train':
        trainer.train(get_loss, None, cfg.model_file, cfg.pretrain_file)

    if cfg.mode == 'train_eval':
        trainer.train(get_loss, get_acc, cfg.model_file, cfg.pretrain_file)

    if cfg.mode == 'eval':
        results = trainer.eval(get_acc, cfg.model_file, None)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy :' , total_accuracy)
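
The docstring in the example above highlights the convention that is easy to get wrong with `nn.KLDivLoss`: the input must already be log-probabilities while the target is plain probabilities. A small self-contained check of that convention, summing the per-class terms with `reduction='none'` the same way the UDA loss above does:

import torch
import torch.nn as nn
import torch.nn.functional as F

kl = nn.KLDivLoss(reduction='none')

student_logits = torch.randn(4, 3)   # e.g. logits for the augmented examples
teacher_logits = torch.randn(4, 3)   # e.g. logits for the original examples

log_prob = F.log_softmax(student_logits, dim=-1)   # input: log-probabilities
prob = F.softmax(teacher_logits, dim=-1)           # target: probabilities

# Per-example KL divergence, shape (4,): sum over the class dimension.
per_example_kl = torch.sum(kl(log_prob, prob), dim=-1)
print(per_example_kl.shape)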
Example #7
else:
    torch.cuda.set_device(args.local_rank)
    args.device = torch.device("cuda", args.local_rank)
    args.n_gpu = 1

    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')

# Log GPU information
logger.add_text('info', f"args: {args}")

# Modify batch size if accumulating gradients
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

# Reproducibility
utils.set_seeds(args.seed, multi_gpu=args.n_gpu > 0)

# Build dataloaders
tokenizer = tokenization.FullTokenizer(args.vocab,
                                       do_lower_case=args.do_lower_case)
tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))
pipeline = [
    PipelineForPretrain(
        max_pred=20,  # what is this?
        mask_prob=0.15,  # actually this does nothing
        vocab_words=list(tokenizer.vocab.keys()),  # 
        indexer=tokenizer.convert_tokens_to_ids,
        max_len=args.max_seq_length)
]
dataloader = SentencePairDataLoader(args.text_file,
                                    batch_size=args.train_batch_size,
Example #8
    pr_auc = cal_pr_auc(total_scores, total_labels)
    acc = cal_accuracy(total_scores, total_labels)
    pre = cal_precision(total_scores, total_labels)
    rec = cal_recall(total_scores, total_labels)
    far = cal_false_alarm(total_scores, total_labels)
    spe = cal_specific(total_scores, total_labels)
    rmse = cal_rmse(total_scores, total_labels)
    gap = cal_score_gap(total_scores, total_labels)
    gm = cal_geometric_mean(total_scores, total_labels)
    mcc = cal_MCC(total_scores, total_labels)
    sen = cal_sensitivity(total_scores, total_labels)
    f = cal_f_measure(total_scores, total_labels)
    pauc = cal_pAUC(total_scores, total_labels)
    fnr = cal_false_neg(total_scores, total_labels)
    print('AUC\t {}\tPR_AUC\t{}\tpAUC\t{}'.format(auc, pr_auc, pauc))
    print('FAR\t{}\tFNR\t{}\tGM\t{}'.format(far, fnr, gm))
    print('Precision\t{}\tRecall\t{}'.format(pre, rec))
    print('Acc\t{}\tMCC\t{}'.format(acc, mcc))
    print('Sen\t{}\tSpe\t{}'.format(sen, spe))
    print('Gap\t{}\tRMSE\t{}'.format(gap, rmse))
    print('F\t{}'.format(f))
    return auc


if __name__ == '__main__':
    args = parse_args()
    set_seeds(args.seed)
    show_params(args)
    train_AR_Net(args)
    show_params(args)
Example #9
def train(config):
    cfg, cfg_data, cfg_model, cfg_optim = read_config(config)

    device, n_gpu = utils.get_device()
    utils.set_seeds(cfg.seed, n_gpu)

    train_batch_size = int(cfg_optim.train_batch_size /
                           cfg_optim.gradient_accumulation_steps)

    processor = get_class(cfg.task.lower())

    tokenizer = BertTokenizer.from_pretrained(cfg.bert_model,
                                              do_lower_case=cfg.do_lower_case)

    train_examples = None
    num_train_steps = None
    if cfg.do_train:
        train_examples = processor.get_train_examples(cfg_data.data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size /
            cfg_optim.gradient_accumulation_steps * cfg_optim.num_train_epochs)

    label_list = processor.get_labels()
    # Prepare model
    print(PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1))
    model = BertForSequenceClassification.from_pretrained(
        cfg.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1),
        num_labels=len(label_list))

    model.to(device)

    # Prepare optimizer
    if cfg_optim.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())

    # Apply weight decay to every parameter except biases and the LayerNorm
    # gamma/beta weights.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0},
    ]
    t_total = num_train_steps

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=cfg_optim.learning_rate,
                         warmup=cfg_optim.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if cfg.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      cfg_optim.max_seq_length,
                                                      tokenizer,
                                                      show_exp=False)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        train_dataloader = convert_features_to_tensors(train_features,
                                                       train_batch_size)

        model.train()
        best_score = 0
        flags = 0
        for _ in trange(int(cfg_optim.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if cfg_optim.fp16 and cfg_optim.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * cfg_optim.loss_scale
                if cfg_optim.gradient_accumulation_steps > 1:
                    loss = loss / cfg_optim.gradient_accumulation_steps
                loss.backward()

                if (step + 1) % cfg_optim.gradient_accumulation_steps == 0:
                    if cfg_optim.optimize_on_cpu:
                        if cfg_optim.fp16 and cfg_optim.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / cfg_optim.loss_scale
                        is_nan = utils.set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            cfg_optim.loss_scale = cfg_optim.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        utils.copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()

            f1 = evaluate(model, processor, cfg_optim, label_list, tokenizer,
                          device)
            if f1 > best_score:
                best_score = f1
                print('*f1 score = {}'.format(f1))
                flags = 0
                checkpoint = {'state_dict': model.state_dict()}
                torch.save(checkpoint, cfg_optim.model_save_pth)
            else:
                print('f1 score = {}'.format(f1))
                flags += 1
                if flags >= 6:
                    break

    # Reload the best checkpoint saved during training before evaluating on the test set
    model.load_state_dict(torch.load(cfg_optim.model_save_pth)['state_dict'])
    test(model, processor, cfg_optim, label_list, tokenizer, device)
Example #10
experiment_ranges = {}
experiment_ranges["loss_control_penalty"] = [0.7]
experiment_ranges["dense_layer_neurons"] = [1024]

alpha = 0.9  # Only relevant for RMSProp, smoothing constant for weight update
base_learningRate = 5e-6
experiment_ranges["learningRate"] = [1e-4]
experiment_ranges["weight_decay"] = [5e-6]

data_path = os.path.dirname(os.getcwd()) + "/data/"
log_dir = os.path.dirname(os.getcwd()) + "/logs/"

random_seed = 1905

# Sets all seeds to the chosen number
set_seeds(random_seed)
# If we want to keep TensorBoard logs
if record_run:
    # Creates the logging directory
    os.makedirs(log_dir, exist_ok=True)
    # Creates this experiment's log directory
    experiment_log_dir = log_dir + "/" + create_exp_name(experiment_ranges)
    os.makedirs(experiment_log_dir, exist_ok=True)
    f = open(experiment_log_dir + "/experiment_ranges.txt", "w")
    f.write(str(experiment_ranges))
    f.close()

# ## DATA LOADING ##
#
# We apply several transformations to both MNIST and SVHN
#
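
The transforms referred to in the comment above are not included in this snippet. Below is a minimal sketch of a typical torchvision setup for MNIST and SVHN, reusing the `data_path` defined earlier; the resize, grayscale-to-RGB conversion, and normalization statistics are illustrative assumptions, not the experiment's actual values.

import torchvision
import torchvision.transforms as transforms

# Illustrative transforms: bring MNIST to SVHN's 32x32 RGB format and
# normalize both datasets to roughly [-1, 1].
mnist_transform = transforms.Compose([
    transforms.Resize(32),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
svhn_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

mnist_dataset = torchvision.datasets.MNIST(data_path, train=True, download=True,
                                           transform=mnist_transform)
svhn_dataset = torchvision.datasets.SVHN(data_path, split='train', download=True,
                                         transform=svhn_transform)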