Example #1
def compute_importance(model, parallel_model, updater, dataloaders, loss_type="l2"):
    """Mimic the depoloyment setup where the model is applied on some samples and those are used to update the importance params
       Uses the L2norm of the function output. This is what we MAS uses as default
    """
    # model.eval() is deliberately not called: the model stays in training mode so gradients are tracked
    # train_loss_fct = DataParallelCriterion(CrossEntropyLoss(ignore_index=FILL_VAL), args.device_ids)

    softmax = torch.nn.Softmax(dim=-1)
    if loss_type == "l2":
        loss_fct = DataParallelCriterion(torch.nn.MSELoss(reduction='mean'), args.device_ids)
    elif loss_type == "l1":
        loss_fct = DataParallelCriterion(torch.nn.L1Loss(reduction='mean'), args.device_ids)
    elif loss_type == "ewc":
        CELoss = CrossEntropyLoss(ignore_index=FILL_VAL, reduction='mean', weight=TOKEN_WEIGHT)
        loss_fct = DataParallelCriterion(CELoss, args.device_ids)

    # Iterate over data.
    for dataloader in dataloaders:
        for cq, len_cq, cqa, len_cqa, Y, _, _ in dataloader:
            # get the inputs
            n_inputs = sum(len(_cq) for _cq in cq)
            for i in range(len(cqa)):
                cq[i] = (cq[i].to(args.device_ids[i]),)
                len_cq[i] = len_cq[i].to(args.device_ids[i])
                cqa[i] = (cqa[i].to(args.device_ids[i]),)
                len_cqa[i] = len_cqa[i].to(args.device_ids[i])
                Y[i] = Y[i].to(args.device_ids[i])

            # zero the parameter gradients
            updater.zero_grad()

            # forward
            if loss_type != "ewc":
                logits = parallel_model(cq)
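                # keep only each sequence's logits at its last context token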
                logits = [logit[range(len(logit)), len_cq[i]-1, :] for i, logit in enumerate(logits)]
                #logits = [softmax(logit, dim=-1) for logit in logits]
                target_zeros = [torch.zeros(logit.size()).to(args.device_ids[i]) for i, logit in enumerate(logits)]
                logits = [softmax(logit) for logit in logits]

                if loss_type == "l2":
                    targets = loss_fct(logits, target_zeros)
                elif loss_type == "l1":
                    targets = loss_fct(logits, target_zeros)
            else:
                targets, _ = get_losses(parallel_model, cqa, Y, None, None, loss_fct)

            targets /= n_inputs 

            #compute the gradients
            targets.backward()

            #update the parameters importance
            updater.step(model.reg_params, n_inputs)
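
For reference, the pattern all of these examples share: DataParallelModel keeps each replica's outputs on its own GPU (returning a list with one entry per device), and DataParallelCriterion scatters the targets, computes the loss on each device, and reduces the result. A minimal sketch, assuming the commonly used standalone parallel.py module provides both wrappers (the import path and two visible GPUs are assumptions):

import torch
import torch.nn as nn
from parallel import DataParallelModel, DataParallelCriterion  # assumed module path

model = nn.Linear(10, 2).cuda()
parallel_model = DataParallelModel(model)                      # outputs stay on each GPU
parallel_loss = DataParallelCriterion(nn.CrossEntropyLoss())

x = torch.randn(8, 10).cuda()
y = torch.randint(0, 2, (8,)).cuda()
outputs = parallel_model(x)        # list of per-GPU output tensors
loss = parallel_loss(outputs, y)   # scatter targets, per-GPU loss, reduce
loss.backward()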
Example #2
def createModels(args, userNum, itemNum):
    if args.model == 'SPUIGACF':
        model = SPUIGACF(userNum,
                         itemNum,
                         embedSize=args.embedSize,
                         layers=args.layers,
                         droprate=args.droprate).cuda()
    elif args.model == 'SPUIMultiGACF':
        model = SPUIMultiGACF(userNum,
                              itemNum,
                              embedSize=args.embedSize,
                              layers=args.layers,
                              droprate=args.droprate).cuda()
    elif args.model == 'SPUIGAGPCF':
        model = SPUIGAGPCF(userNum,
                           itemNum,
                           adj,  # adjacency matrix, expected from the enclosing scope
                           embedSize=args.embedSize,
                           layers=args.layers,
                           droprate=args.droprate).cuda()
    else:
        raise ValueError('unknown model: {}'.format(args.model))

    if args.train_mode == 'PairSampling':
        lossfn = BPRLoss()
        if args.parallel:
            model = DataParallelModel(model)
            lossfn = DataParallelCriterion2(lossfn)
    elif args.train_mode == 'NegSampling':
        lossfn = BCEWithLogitsLoss()
        if args.parallel:
            model = DataParallelModel(model)  # parallelize the model
            lossfn = DataParallelCriterion(lossfn)  # parallelize the loss function
    else:
        raise ValueError('unknown train_mode: {}'.format(args.train_mode))
    optim = Adam(model.parameters(),
                 lr=args.lr,
                 weight_decay=args.weight_decay)
    return model, lossfn, optim
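
A hypothetical call site for createModels; users, items, and labels below are placeholders, and the call signature of the returned model is an assumption:

model, lossfn, optim = createModels(args, userNum, itemNum)
outputs = model(users, items)   # with args.parallel: a list with one tensor per GPU
loss = lossfn(outputs, labels)  # per-GPU BCEWithLogitsLoss, reduced to a scalar
loss.backward()
optim.step()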
Example #3
    def __init__(self,
                 model,
                 mask_prob: float = 0.15,
                 clip: int = 1,
                 optimizer=None):
        self.model = model
        self.clip = clip
        self.optimizer = optimizer

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self.mask_prob = mask_prob
        self.criterion = nn.NLLLoss(
            ignore_index=model.text_processor.pad_token_id())

        num_gpu = torch.cuda.device_count()
        if num_gpu > 1:
            print("Let's use", num_gpu, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)

        self.best_dev_loss = float("inf")
        self.best_train_loss = float("inf")
        self.last_train_loss = float("inf")
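
A minimal illustration (not from the example above) of why the criterion is built with ignore_index: positions holding the pad token id contribute nothing to the NLL loss.

import torch
import torch.nn as nn

pad_id = 0
criterion = nn.NLLLoss(ignore_index=pad_id)
log_probs = torch.log_softmax(torch.randn(3, 5), dim=-1)
targets = torch.tensor([2, pad_id, 4])  # the padded middle position is skipped
loss = criterion(log_probs, targets)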
Example #4
    def __init__(self, cfg: Namespace, data: Dataset):
        """
        Args:
            cfg:  configuration
            data:  train dataset
        """
        self.cfg = cfg
        self.train, self.valid = data.split(0.8)
        RATING_FIELD.build_vocab(self.train)

        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')  # pylint: disable=no-member
        self.batch_size = cfg.batch_size
        if torch.cuda.is_available():
            self.batch_size *= torch.cuda.device_count()

        self.trn_itr = BucketIterator(
            self.train,
            device=self.device,
            batch_size=self.batch_size,
            shuffle=True,
            train=True,
            sort_within_batch=True,
            sort_key=lambda exam: -len(exam.comment_text))
        self.vld_itr = BucketIterator(
            self.valid,
            device=self.device,
            batch_size=self.batch_size,
            shuffle=False,
            train=False,
            sort_within_batch=True,
            sort_key=lambda exam: -len(exam.comment_text))
        self.log_step = 1000
        if len(self.vld_itr) < 100:
            self.log_step = 10
        elif len(self.vld_itr) < 1000:
            self.log_step = 100

        bert_path = cfg.bert_path if cfg.bert_path else 'bert-base-cased'
        self.model = BertForSequenceClassification.from_pretrained(
            bert_path, num_labels=2)
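        # counter class imbalance: weight the positive class by the negative/positive ratio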
        pos_weight = (
            len([exam for exam in self.train.examples if exam.target < 0.5]) /
            len([exam for exam in self.train.examples if exam.target >= 0.5]))
        pos_wgt_tensor = torch.tensor([1.0, pos_weight], device=self.device)  # pylint: disable=not-callable
        self.criterion = nn.CrossEntropyLoss(weight=pos_wgt_tensor)
        if torch.cuda.is_available():
            self.model = DataParallelModel(self.model.cuda())
            self.criterion = DataParallelCriterion(self.criterion)
        self.optimizer = optim.Adam(self.model.parameters(), cfg.learning_rate)
Example #5
    def CrossEntropyLoss(self, logit, target):
        n, c, h, w = logit.size()  # only n is used, for optional batch averaging
        criterion = nn.CrossEntropyLoss(weight=self.weight, ignore_index=self.ignore_index,
                                        size_average=self.size_average)

        if self.cuda and len(self.args.gpu_ids) > 1:
            criterion = DataParallelCriterion(criterion)
        elif self.cuda:
            criterion = criterion.cuda()
        
        loss = criterion(logit, target.long())

        if self.batch_average:
            loss /= n

        return loss
Example #6
def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaders = create_datasets(num_workers=32, batch_size=600)
    # info = pd.read_csv("./flower_data/train.csv")[["image","label"]]
    # class_weights = torch.tensor(1.0/info.groupby(["label"]).count().values.astype(np.float32))
    # del info
    models_ensemble = [
                    # {"name":"vgg", "model":models.vgg16_bn(pretrained=True)},
                    {"name":"resnet", "model":models.resnet50(pretrained=True)}, 
                    # {"name":"densenet", "model":models.densenet121(pretrained=True) },
                    {"name":"resnet", "model":models.resnet101(pretrained=True) },
                    ]

    # model = Ensemble(models_ensemble, name="star_ensemble")
    model = load_checkpoint("ensemble_iso_star_5118.pt")

    ft, cl = model.get_parameters()
    # model = nn.DataParallel(model)
    model = DataParallelModel(model)
    model = model.to(device)
    weight = torch.from_numpy(weight_train[0]).to(device)  # weight_train is expected from the enclosing scope
    criterion = nn.NLLLoss(weight)
    criterion = DataParallelCriterion(criterion)
  
    optimizers = [ optim.Adam(ft, lr=5e-4), optim.Adam(cl, lr=5e-3)]
    # # print("")
    # # print('-' * 40)
    # # print("lr = {} bs= {}".format(lr,bs) )
    # # print('-' * 40)

    # # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_schedulers = [lr_scheduler.StepLR(optimizers[0], step_size = 1, gamma = 0.995),
                        lr_scheduler.StepLR(optimizers[1], step_size = 1, gamma = 0.992) ]


    train_args = [model, criterion, optimizers, exp_lr_schedulers, device]

    model = train_model(*train_args, loaders, num_epochs=100)
Example #7
segm_model = DataParallelModel(segm_model)
print("Let's use", torch.cuda.device_count(), "GPUs!")
segm_model.to(device)
'''if use_cuda:
    segm_model.cuda()
seg_model = nn.DataParallel(seg_model)'''

mul_transf = [
    transforms.Resize(size=(img_size, img_size)),
    transforms.ToTensor()
]
#optimizer = optim.SGD(segm_model.parameters(), lr=lr_rate, momentum=momentum)
optimizer = optim.Adam(segm_model.parameters(), lr=0.0001)
#criterion = nn.BCEWithLogitsLoss().cuda() if use_cuda else nn.BCEWithLogitsLoss()
criterion = nn.BCEWithLogitsLoss()
criterion = DataParallelCriterion(criterion)
criterion.to(device)

scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                           milestones=milestones,
                                           gamma=gamma)

train_loader, valid_loader = CellTrainValidLoader(
    data_transform=transforms.Compose(mul_transf),
    batch_sz=batch_size,
    workers=2)

dict_loaders = {"train": train_loader, "valid": valid_loader}


def train_model(cust_model,
Example #8
def main():
    parser = setup_parser()
    args = parser.parse_args()

    processors = {
        'stsb': StsbProcessor,
        'mednli': MednliProcessor,
        'medsts': MedstsProcessor
    }

    output_modes = {
        'mnli': 'classification',
        'stsb': 'regression',
        'mednli': 'classification',
        'medsts': 'regression'
    }

    bert_types = {
        'discharge':
        '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_disch_100000',
        'all':
        '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_all_notes_150000',
        'base_uncased': 'bert-base-uncased',
        'base_cased': 'bert-base-cased'
    }

    ##################################################################################################
    ################################### SETUP DATA, DEVICE, MODEL ####################################
    ##################################################################################################
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device('cuda' if torch.cuda.is_available()
                              and not args.no_cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        #Initialize the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: {}".format(task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels(output_mode)
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    ##################################################################################################
    ########################################### OPTIMIZER ############################################
    ##################################################################################################

    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        if args.discriminative_finetuning:
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1.', 'layer.2', 'layer.3', 'layer.4', 'layer.5', \
            'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.01, 'lr': args.learning_rate},

                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
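            # The groups above implement ULMFiT-style discriminative fine-tuning:
            # counting two-layer groups down from the output, group k trains at
            # learning_rate / 2.6**k, so e.g. with lr = 2e-5 the lowest layers
            # (group1) get 2e-5 / 2.6**5 ≈ 1.7e-7 while the top group keeps the full rate.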
        else:
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    ##################################################################################################
    ############################################# TRAIN ##############################################
    ##################################################################################################
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.float)

        all_pids = np.array([f.pid for f in eval_features])

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     drop_last=True)
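        # drop_last=True keeps every eval batch full so it can be split evenly
        # across the GPUs by DataParallelModel; note that the training loop below
        # sits inside this do_eval branch because it evaluates on eval_dataloader
        # after every epoch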

        model.train()
        epoch_metric = {}
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [
                        logits[i].view(-1, num_labels)
                        for i in range(len(logits))
                    ]
                    loss = loss_fct(logits, label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  # average over per-GPU losses
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        #modify lr with special warm up BERT uses
                        #if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            with torch.no_grad():
                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                preds = []

                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        logits = model(input_ids,
                                       segment_ids,
                                       input_mask,
                                       labels=None)

                    if output_mode == 'classification':
                        # loss_fct = CrossEntropyLoss()
                        # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                        loss_fct = CrossEntropyLoss()
                        loss_fct = DataParallelCriterion(loss_fct)
                        logits = [
                            logits[i].view(-1, num_labels)
                            for i in range(len(logits))
                        ]
                        tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
                    elif output_mode == 'regression':
                        # loss_fct = MSELoss()
                        # tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

                        loss_fct = MSELoss()
                        loss_fct = DataParallelCriterion(loss_fct)
                        logits = [
                            logits[i].view(-1) for i in range(len(logits))
                        ]
                        tmp_eval_loss = loss_fct(logits, label_ids.view(-1))

                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1
                    logits = parallel.gather(logits, target_device='cuda:0')
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(preds[0],
                                             logits.detach().cpu().numpy(),
                                             axis=0)
                eval_loss = eval_loss / nb_eval_steps
                preds = preds[0]
                if output_mode == 'classification':
                    preds = np.argmax(preds, axis=1)
                elif output_mode == 'regression':
                    preds = np.squeeze(preds)

                all_label_ids = all_label_ids[:preds.shape[0]]
                all_pids = all_pids[:preds.shape[0]]
                errors = generate_errors(preds, all_label_ids.numpy(),
                                         all_pids)

                result = compute_metrics(task_name, preds,
                                         all_label_ids.numpy())

                loss = tr_loss / global_step if args.do_train else None

                result['eval_loss'] = eval_loss
                result['global_step'] = global_step
                result['loss'] = loss
                logger.info('***** Eval Results *****')
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))

                epoch_metric[epoch] = result[
                    'pearson'] if output_mode == 'regression' else result['acc']

        output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
        with open(output_eval_file, 'w') as writer:
            logger.info('***** Eval Results *****')
            # for key in sorted(result.keys()):
            #     logger.info("  %s = %s", key, str(result[key]))
            #     writer.write("%s = %s\n" % (key, str(result[key])))
            # writer.write("{}     {}\n".format("epoch","pearson"))
            for key in sorted(epoch_metric.keys()):
                writer.write("{}\t{}\t{}\t{}\n".format(key,
                                                       str(epoch_metric[key]),
                                                       args.learning_rate,
                                                       args.train_batch_size))

        errors.to_csv('errors.txt', sep='\t', index=False)

    ##################################################################################################
    ########################################## SAVE & RELOAD #########################################
    ##################################################################################################
    if args.do_train:
        #Save a trained model, config, and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  #only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    model.to(device)
Example #9
def train(config):
    net = BertForMaskedLM.from_pretrained(config.model)
    lossFunc = KLDivLoss(config)

    if torch.cuda.is_available():
        net = net.cuda()
        lossFunc = lossFunc.cuda()

        if config.dataParallel:
            net = DataParallelModel(net)
            lossFunc = DataParallelCriterion(lossFunc)

    options = optionsLoader(LOG, config.optionFrames, disp=False)
    Tokenizer = BertTokenizer.from_pretrained(config.model)
    prepareFunc = prepare_data

    trainSet = Dataset('train', config.batch_size,
                       lambda x: len(x[0]) + len(x[1]), prepareFunc, Tokenizer,
                       options['dataset'], LOG, 'train')
    validSet = Dataset('valid', config.batch_size,
                       lambda x: len(x[0]) + len(x[1]), prepareFunc, Tokenizer,
                       options['dataset'], LOG, 'valid')

    print(len(trainSet))

    Q = []
    best_vloss = 1e99
    counter = 0
    lRate = config.lRate

    prob_src = config.prob_src
    prob_tgt = config.prob_tgt

    num_train_optimization_steps = len(trainSet) * options['training']['stopConditions']['max_epoch']
    param_optimizer = list(net.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
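    # standard BERT recipe: weight matrices get 0.01 weight decay, while biases
    # and LayerNorm parameters are excluded from decay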
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=lRate,
                         e=1e-9,
                         t_total=num_train_optimization_steps,
                         warmup=0.0)

    for epoch_idx in range(options['training']['stopConditions']['max_epoch']):
        total_seen = 0
        total_similar = 0
        total_unseen = 0
        total_source = 0

        trainSet.setConfig(config, prob_src, prob_tgt)
        trainLoader = data.DataLoader(dataset=trainSet,
                                      batch_size=1,
                                      shuffle=True,
                                      num_workers=config.dataLoader_workers,
                                      pin_memory=True)

        validSet.setConfig(config, 0.0, prob_tgt)
        validLoader = data.DataLoader(dataset=validSet,
                                      batch_size=1,
                                      shuffle=False,
                                      num_workers=config.dataLoader_workers,
                                      pin_memory=True)

        for batch_idx, batch_data in enumerate(trainLoader):
            if (batch_idx + 1) % 10000 == 0:
                gc.collect()
            start_time = time.time()

            net.train()

            inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data

            inputs = inputs[0].cuda()
            positions = positions[0].cuda()
            token_types = token_types[0].cuda()
            labels = labels[0].cuda()
            masks = masks[0].cuda()
            total_seen += batch_seen
            total_similar += batch_similar
            total_unseen += batch_unseen
            total_source += batch_source

            n_token = int((labels.data != 0).data.sum())
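            # count of non-padding target tokens, used to normalize the masked-LM loss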

            predicts = net(inputs, positions, token_types, masks)
            loss = lossFunc(predicts, labels, n_token).sum()

            Q.append(float(loss))
            if len(Q) > 200:
                Q.pop(0)
            loss_avg = sum(Q) / len(Q)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

            LOG.log(
                'Epoch %2d, Batch %6d, Loss %9.6f, Average Loss %9.6f, Time %9.6f'
                % (epoch_idx + 1, batch_idx + 1, loss, loss_avg,
                   time.time() - start_time))

            # Checkpoints
            idx = epoch_idx * trainSet.__len__() + batch_idx + 1
            if (idx >= options['training']['checkingPoints']['checkMin']) and (
                    idx % options['training']['checkingPoints']['checkFreq']
                    == 0):
                if config.do_eval:
                    vloss = 0
                    total_tokens = 0
                    for bid, batch_data in enumerate(validLoader):
                        inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data

                        inputs = inputs[0].cuda()
                        positions = positions[0].cuda()
                        token_types = token_types[0].cuda()
                        labels = labels[0].cuda()
                        masks = masks[0].cuda()

                        n_token = int((labels.data != config.PAD).data.sum())

                        with torch.no_grad():
                            net.eval()
                            predicts = net(inputs, positions, token_types,
                                           masks)
                            vloss += float(lossFunc(predicts, labels).sum())

                        total_tokens += n_token

                    vloss /= total_tokens
                    is_best = vloss < best_vloss
                    best_vloss = min(vloss, best_vloss)
                    LOG.log(
                        'CheckPoint: Validation Loss %11.8f, Best Loss %11.8f'
                        % (vloss, best_vloss))

                    if is_best:
                        LOG.log('Best Model Updated')
                        save_check_point(
                            {
                                'epoch': epoch_idx + 1,
                                'batch': batch_idx + 1,
                                'options': options,
                                'config': config,
                                'state_dict': net.state_dict(),
                                'best_vloss': best_vloss
                            },
                            is_best,
                            path=config.save_path,
                            fileName='latest.pth.tar')
                        counter = 0
                    else:
                        counter += options['training']['checkingPoints'][
                            'checkFreq']
                        if counter >= options['training']['stopConditions'][
                                'rateReduce_bound']:
                            counter = 0
                            for param_group in optimizer.param_groups:
                                old_lr = param_group['lr']
                                param_group['lr'] *= 0.55
                                new_lr = param_group['lr']
                            LOG.log(
                                'Reduce Learning Rate from %11.8f to %11.8f' %
                                (old_lr, new_lr))
                        LOG.log('Current Counter = %d' % (counter))

                else:
                    save_check_point(
                        {
                            'epoch': epoch_idx + 1,
                            'batch': batch_idx + 1,
                            'options': options,
                            'config': config,
                            'state_dict': net.state_dict(),
                            'best_vloss': 1e99
                        },
                        False,
                        path=config.save_path,
                        fileName='checkpoint_Epoch' + str(epoch_idx + 1) +
                        '_Batch' + str(batch_idx + 1) + '.pth.tar')
                    LOG.log('CheckPoint Saved!')

        if options['training']['checkingPoints']['everyEpoch']:
            save_check_point(
                {
                    'epoch': epoch_idx + 1,
                    'batch': batch_idx + 1,
                    'options': options,
                    'config': config,
                    'state_dict': net.state_dict(),
                    'best_vloss': 1e99
                },
                False,
                path=config.save_path,
                fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '.pth.tar')

        LOG.log('Epoch Finished.')
        LOG.log(
            'Total Seen: %d, Total Unseen: %d, Total Similar: %d, Total Source: %d.'
            % (total_seen, total_unseen, total_similar, total_source))
        gc.collect()
Example #10
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)

    dh_model.to(device)

    dh_model = DataParallelModel(dh_model)
    criterion_lm = DataParallelCriterion(criterion_lm)
    criterion_clf = DataParallelCriterion(criterion_clf)

    n_updates = 0
    n_epochs = 0
    if submit:
        path = os.path.join(save_dir, desc, 'best_params_para_selector')
        torch.save(dh_model.state_dict(), make_path(path))
    best_score = 0
    for i in range(args.n_iter):
        if i == 0:
            log_msmarco()
        print("running epoch", i)
        run_epoch()
        n_epochs += 1
        # log(save_dir, desc)
Example #11
def do_eval(model, logger, output_dir, device, tr_loss, nb_tr_steps,
            global_step, processor, label_list, tokenizer, eval_dataloader,
            error_analysis_dict, output_mode, i, task):

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    preds = []
    all_label_ids = []
    all_input_ids = []
    for input_ids, input_mask, segment_ids, label_ids in tqdm(
            eval_dataloader, desc='Evaluating'):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, i, output_mode)

            if output_mode == 'classification':
                loss_fct = CrossEntropyLoss()
                loss_fct = DataParallelCriterion(loss_fct)
                logits = [
                    logits[i].view(-1, logits[0].size(1))
                    for i in range(len(logits))
                ]
                tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
            else:
                loss_fct = MSELoss()
                loss_fct = DataParallelCriterion(loss_fct)
                logits = [logits[i].view(-1) for i in range(len(logits))]
                tmp_eval_loss = loss_fct(logits, label_ids.view(-1))

            logits = gather(logits, target_device='cuda:0')
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0],
                                     logits.detach().cpu().numpy(),
                                     axis=0)
            if len(all_label_ids) == 0:
                all_label_ids.append(label_ids.detach().cpu().numpy())
            else:
                all_label_ids[0] = np.append(all_label_ids[0],
                                             label_ids.detach().cpu().numpy(),
                                             axis=0)
            if len(all_input_ids) == 0:
                all_input_ids.append(input_ids.detach().cpu().numpy())
            else:
                all_input_ids[0] = np.append(all_input_ids[0],
                                             input_ids.detach().cpu().numpy(),
                                             axis=0)

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    preds = preds[0]
    all_label_ids = all_label_ids[0]
    all_input_ids = all_input_ids[0]
    all_pids = error_analysis_dict['pids'][:len(preds)]
    all_text_a = error_analysis_dict['text_a'][:len(preds)]
    all_text_b = error_analysis_dict['text_b'][:len(preds)]

    all_textpair_tokenized = [
        ' '.join(tokenizer.convert_ids_to_tokens(ids)) for ids in all_input_ids
    ]

    assert len(preds) == len(all_label_ids) == len(all_input_ids) == len(
        all_pids) == len(all_text_a) == len(all_text_b) == len(
            all_textpair_tokenized)

    all_textpair_tokenized = [
        tp.replace('[PAD]', '').strip() for tp in all_textpair_tokenized
    ]

    if output_mode == 'classification':
        preds = np.argmax(preds, axis=1)
        preds_rounded = preds
        eval_accuracy = accuracy(preds, all_label_ids)
    else:
        preds = np.squeeze(preds)
        preds_rounded = np.round(preds * 4) / 4
        eval_accuracy = pearsonr(preds, all_label_ids)[0]
    errors = generate_errors(preds, preds_rounded, all_label_ids, all_pids,
                             all_text_a, all_text_b, all_textpair_tokenized)
    if i == 0:
        errors.to_csv(os.path.join(output_dir, 'error_table.csv'),
                      sep=',',
                      index=False)

    result = {
        'task name': task,
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'global_step': global_step,
        'loss': tr_loss / nb_tr_steps
    }

    output_eval_file = os.path.join(output_dir, 'eval_results.txt')
    with open(output_eval_file, 'w') as writer:
        logger.info('***** Eval Results *****')
        for key in sorted(result.keys()):
            logger.info('   %s = %s', key, str(result[key]))
            # writer.write('{} = {}\n'.format(key, str(result[key])))

    return eval_accuracy
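
For reference, a minimal sketch (assuming two visible GPUs) of what the gather() call above does: it concatenates the list of per-GPU outputs produced by DataParallelModel onto a single device so the predictions can be handled as one tensor.

import torch
from torch.nn.parallel import gather

per_gpu_logits = [torch.randn(4, 2, device='cuda:0'),
                  torch.randn(4, 2, device='cuda:1')]
merged = gather(per_gpu_logits, target_device='cuda:0')  # shape (8, 2) on cuda:0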
Example #12
        # tar = target.contiguous().view(-1)
        # out = output.contiguous().view(target.size(0),-1)

        target = tar.contiguous().view(-1)
        output = out[:tar.size(0)]
        normalize = output.size(0) * output.size(1)
        output = output.contiguous().view(target.size(0), -1)
        loss = self.NLL(output, target) / normalize

        return loss


if not eval_model:
    criterion = NLLLoss(ignore_index=PAD)
    parallel_model = DataParallelModel(model)  # Encapsulate the model
    parallel_loss = DataParallelCriterion(criterion)

# In[5]:

# ---------------------------

# def merge_res(res):
#     ((inds1, log_probs1, enc_out1),(inds2, log_probs2, enc_out2)) = res
#     inds = T.cat([inds1, inds2], dim = 0).cpu()
#     enc_out = T.cat([enc_out1, enc_out2], dim = 0).cpu()
#     if type(log_probs1) != list:
#         log_probs = T.cat([log_probs1, log_probs2], dim = 0)
#         return inds, log_probs, enc_out
#     else:
#         return inds, _, enc_out
Example #13
def main(args):
    init(args)
    #Args setup:
    save_dir = os.path.join(args.output_dir, args.experiment_name,
                            "checkpoints")
    save_dir_local = "checkpoints_local"
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_dir_local, exist_ok=True)

    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    p = args.p
    n_ctx = args.n_ctx
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    #Text Encoder
    if args.use_offline_gpt2:
        text_encoder = GPT2Tokenizer.from_pretrained('./gpt2model')
    elif args.debug_mode:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2-medium')

    text_encoder.add_special_tokens({
        'bos_token':
        '_start_',
        'cls_token':
        '_classify_',
        'eos_token':
        '_end_',
        'additional_special_tokens':
        ['_kw_', '_endkw_', '_t_', '_i_', '_b_', '_c_']
    })

    vocab = len(text_encoder)

    print("Loading dataset...")
    if args.use_model == "base":
        train_loader = get_paragraph_input_loader(
            os.path.join(data_dir, "train_encoded.jsonl"),
            args.n_batch,
            text_encoder,
            num_workers=3,
            shuffle=True,
            gen_len=gen_len,
            n_ctx=n_ctx,
            include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat,
            max_size=args.max_ex,
            include_kw=not args.exclude_kw,
            dim=args.n_embd,
            debug_mode=args.debug_mode)

        val_loader = get_paragraph_input_loader(
            os.path.join(data_dir, "val_encoded.jsonl"),
            n_gpu,
            text_encoder,
            num_workers=0,
            shuffle=False,
            gen_len=gen_len,
            n_ctx=n_ctx,
            include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat,
            max_size=args.num_val_examples,
            include_kw=not args.exclude_kw,
            dim=args.n_embd,
            debug_mode=args.debug_mode)

        print("Train length: {}, Validation length: {}".format(
            len(train_loader), len(val_loader)))
        doc_model = GPT2BaseModel(args,
                                  vocab=vocab,
                                  n_ctx=n_ctx,
                                  gen_len=gen_len,
                                  lastidx=text_encoder.eos_token_id,
                                  includeprev=args.use_neighbor_feat,
                                  use_offline_gpt2=args.use_offline_gpt2)

    elif args.use_model == "plotmachines":
        train_loader = get_paragraph_memory_input_loader(
            os.path.join(data_dir, "train_encoded.jsonl"),
            args.n_batch,
            text_encoder,
            num_workers=3,
            shuffle=True,
            gen_len=gen_len,
            n_ctx=n_ctx,
            include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat,
            max_size=args.max_ex,
            include_kw=not args.exclude_kw,
            memsize=args.memstatesize,
            dim=args.n_embd,
            use_kwmem=True,
            debug_mode=args.debug_mode)

        val_loader = get_paragraph_memory_input_loader(
            os.path.join(data_dir, "val_encoded.jsonl"),
            n_gpu,
            text_encoder,
            num_workers=0,
            shuffle=False,
            gen_len=gen_len,
            n_ctx=n_ctx,
            include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat,
            max_size=args.num_val_examples,
            include_kw=not args.exclude_kw,
            memsize=args.memstatesize,
            dim=args.n_embd,
            use_kwmem=True,
            debug_mode=args.debug_mode)

        print("Train length: {}, Validation length: {}".format(
            len(train_loader), len(val_loader)))
        doc_model = PlotMachinesModel(args,
                                      vocab=vocab,
                                      n_ctx=n_ctx,
                                      gen_len=gen_len,
                                      lastidx=text_encoder.eos_token_id,
                                      includeprev=args.use_neighbor_feat,
                                      use_offline_gpt2=args.use_offline_gpt2)

    n_updates_total = (len(train_loader) //
                       args.accum_iter) * (args.num_epochs)

    if args.debug_mode:
        print_model_params(log_dir, doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")

    model_opt = AdamW(filter(lambda p: p.requires_grad,
                             doc_model.parameters()),
                      lr=args.lr,
                      betas=(args.b1, args.b2),
                      eps=args.e)

    lm_loss = ParagraphLoss(criterion, n_ctx=n_ctx, gen_len=gen_len)

    print("Loading Model")
    doc_model.to(device)
    if n_gpu > 1:
        doc_model = DataParallelModel(doc_model)
        lm_loss = DataParallelCriterion(lm_loss)
    print("Parallelized")

    bestloss = -1
    start_iter, running_loss = 1, 0
    prevloss = 1000

    start_iter, running_loss = load_checkpoint(args.checkpoint, doc_model,
                                               model_opt)
    for i in range(args.num_epochs):
        start_iter, running_loss, bestloss, updates, val_loss1 = run_epoch(
            bestloss,
            start_iter,
            running_loss,
            doc_model,
            lm_loss,
            model_opt,
            train_loader,
            val_loader,
            train_log_interval,
            val_log_interval,
            device,
            beam,
            gen_len,
            k,
            p,
            decoding_strategy,
            accum_iter,
            "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs),
            save_dir,
            logger,
            text_encoder,
            show_progress=args.show_progress,
            my_local_dir=save_dir_local)
        print("VAL LOSS: ", str(val_loss1))
        if val_loss1 > prevloss or math.isnan(val_loss1):
            break
        prevloss = val_loss1

    print('Done training...')
    print('Evaluating on validation with best checkpoint...')

    bestcheck = os.path.join(save_dir, "checkpoint_best.pt")
    checkpoint = torch.load(bestcheck, map_location='cpu')
    state_dict = checkpoint["state_dict"]
    if state_dict.get('module.pos_emb_mask') is None and doc_model.state_dict(
    ).get('module.pos_emb_mask') is not None:
        state_dict['module.pos_emb_mask'] = doc_model.state_dict().get(
            'module.pos_emb_mask')
    doc_model.load_state_dict(state_dict)
    evaluate_doc_model(doc_model, val_loader, text_encoder, device, beam,
                       gen_len, k, p, args.decoding_strategy,
                       os.path.join(save_dir, 'valeval.log'), 'gen', 'tgt',
                       gen_len, [], args)
Example #14
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch, encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder, num_workers=0, shuffle=False, max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
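    # GPT-style detail: position indices are extra rows in the same embedding
    # matrix as the tokens, so the effective vocabulary adds n_ctx positional
    # slots on top of the special tokens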
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)

    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)

    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special, path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(
            start_iter, running_loss, dh_model, lm_loss, model_opt,
            train_loader, val_loader, train_log_interval, val_log_interval,
            device, beam, gen_len, k, decoding_strategy, accum_iter,
            "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat),
            save_dir, logger, text_encoder,
            show_progress=args.show_progress, summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(
            start_iter, running_loss, dh_model, summary_loss, model_opt,
            train_loader, val_loader, train_log_interval, val_log_interval,
            device, beam, gen_len, k, decoding_strategy, accum_iter,
            "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft),
            save_dir, logger, text_encoder,
            show_progress=args.show_progress)
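
All of these training loops share one pattern: DataParallelModel scatters the batch and returns a list with one output tensor per GPU (it does not gather them), while DataParallelCriterion scatters the target, computes the loss on each replica, and returns the reduced result. A minimal sketch of that round trip, assuming the same `parallel` module these examples import, at least one CUDA device, and a toy model with random data:

import torch
import torch.nn as nn
from parallel import DataParallelModel, DataParallelCriterion  # assumed available

model = nn.Linear(16, 4).cuda()
parallel_model = DataParallelModel(model)
parallel_loss = DataParallelCriterion(nn.CrossEntropyLoss())

x = torch.randn(32, 16).cuda()
y = torch.randint(0, 4, (32,)).cuda()

outputs = parallel_model(x)        # with >1 GPU: a list of per-device outputs
loss = parallel_loss(outputs, y)   # targets are scattered internally
loss.backward()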
Beispiel #15
0
def main():
    parser = setup_parser()
    args = parser.parse_args()
    logger.info('@@@@@ START @@@@@')
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    n_gpu = torch.cuda.device_count()
    logger.info('device %s n_gpu %d', device, n_gpu)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    if args.tasks == 'all':
        task_names = ['medsts', 'mednli']
        data_dirs = ['MEDSTS', 'MEDNLI']
    elif args.tasks == 'single':
        task_names = ['medsts', 'mednli']
        data_dirs = ['MEDSTS', 'MEDNLI']
        task_names = [task_names[int(args.target_task_id)]]
        data_dirs = [data_dirs[int(args.target_task_id)]]
    if args.k_fold:
        target_data_dir = data_dirs[args.target_task_id]
        k_fold_data_dir = target_data_dir + '/k_fold_{}'.format(args.k)
        data_dirs[args.target_task_id] = k_fold_data_dir
    # if args.add_medsts_c:
    # 	assert args.k_fold==True
    # 	task_names.append('medsts_c')
    # 	data_dirs.append('MEDSTS_c')
    # 	k_fold_data_dir = data_dirs[-1] + '/k_fold_{}'.format(args.k)
    # 	data_dirs[-1] = k_fold_data_dir

    if task_names[0] not in processors:
        raise ValueError('Task not found: {}'.format(task_names[0]))

    processor_list = [processors[task_name]() for task_name in task_names]
    label_list = [processor.get_labels() for processor in processor_list]

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_tasks = len(task_names)

    if args.do_train:
        train_examples = [
            processor.get_train_examples(args.data_dir + data_dir)
            for processor, data_dir in zip(processor_list, data_dirs)
        ]
        num_train_steps = int(
            len(train_examples[0]) / args.train_batch_size *
            args.num_train_epochs)
        if args.tasks == 'all':
            total_tr = args.tr_factor * num_tasks * int(args.num_train_epochs)
        else:
            total_tr = int(0.5 * num_train_steps)

    if args.tasks == 'all':
        steps_per_epoch = args.gradient_accumulation_steps * args.tr_factor * num_tasks
    else:
        steps_per_epoch = int(num_train_steps / (2. * args.num_train_epochs))
    bert_config.num_tasks = num_tasks
    bert_config.hidden_size_aug = int(args.h_aug)

    model = BertForMultiTask(bert_config,
                             [len(labels) for labels in label_list])

    if args.init_checkpoint is not None:
        if args.multi:
            load_checkpoint_mult(args.init_checkpoint, model, args.same,
                                 args.tasks)
        else:
            model.bert.load_state_dict(
                torch.load(args.init_checkpoint, map_location='cpu'))

    if args.freeze:
        for n, p in model.bert.named_parameters():
            if 'aug' in n or 'classifier' in n or 'mult' in n or 'gamma' in n or 'beta' in n:
                continue
            p.requires_grad = False

    model.to(device)
    if n_gpu > 1:
        model = DataParallelModel(model)

    group_size = 2

    optimizer_parameters = get_param_groups(model, args, group_size)
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=total_tr)

    if args.do_eval:
        eval_loaders = []
        error_analysis_dicts = []
        for i, task in enumerate(task_names):
            eval_examples = processor_list[i].get_dev_examples(args.data_dir +
                                                               data_dirs[i])
            eval_features = convert_examples_to_features(
                eval_examples, label_list[i], args.max_seq_length, tokenizer,
                output_modes[task])
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_pids = [int(f.pid) for f in eval_examples]
            all_text_a = [f.text_a for f in eval_examples]
            all_text_b = [f.text_b for f in eval_examples]
            error_data = {
                'pids': all_pids,
                'text_a': all_text_a,
                'text_b': all_text_b
            }

            if output_modes[task] == 'classification':
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.float32)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            eval_sampler = SequentialSampler(eval_data)
            eval_loaders.append(
                DataLoader(eval_data,
                           sampler=eval_sampler,
                           batch_size=args.eval_batch_size,
                           drop_last=True))
            error_analysis_dicts.append(error_data)

    global_step = 0
    if args.do_train:
        loaders = []
        logger.info(' Num tasks = {}'.format(len(train_examples)))
        for i, task in enumerate(task_names):
            train_features = convert_examples_to_features(
                train_examples[i], label_list[i], args.max_seq_length,
                tokenizer, output_modes[task])
            logger.info('********* Training data for {}'.format(task))
            logger.info('   Data size = {}'.format(len(train_features)))

            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            if output_modes[task] == 'classification':
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.long)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float32)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids)
            train_sampler = RandomSampler(train_data)
            loaders.append(
                iter(
                    DataLoader(train_data,
                               sampler=train_sampler,
                               batch_size=args.train_batch_size,
                               drop_last=True)))
        total_params = sum(p.numel() for p in model.parameters())
        logger.info(' Num param = {}'.format(total_params))
        loaders = [cycle(it) for it in loaders]

        model.train()
        best_target_score = 0.
        task_id = 0
        all_ev_acc = []

        for epoch in trange(int(args.num_train_epochs), desc='Epoch'):
            if args.sample == 'anneal':
                probs = [len(dataset) for dataset in train_examples]
                probs = anneal(probs,
                               epoch,
                               args.num_train_epochs,
                               anneal_factor=0.8,
                               target_task_id=0,
                               weight=5)

            tr_loss = [0. for i in range(num_tasks)]
            nb_tr_examples, nb_tr_steps = 0, 0

            ## DEBUG
            # steps_per_epoch = 5

            for step in trange(steps_per_epoch, desc='Steps'):
                if step % args.gradient_accumulation_steps == 0:
                    task_id = np.random.choice(len(probs), p=probs)
                    output_mode = output_modes[task_names[task_id]]
                batch = next(loaders[task_id])
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids, segment_ids, input_mask, task_id,
                               output_mode)

                if output_mode == 'classification':
                    loss_fct = CrossEntropyLoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [
                        logits[i].view(-1, logits[0].size(-1))
                        for i in range(len(logits))
                    ]
                    loss = loss_fct(logits, label_ids.view(-1))
                else:
                    loss_fct = MSELoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss[task_id] += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    model.zero_grad()
                    global_step += 1

                #this is where you'd calculate training acc

            ev_acc = []
            for i, task in enumerate(task_names):
                acc = do_eval(model, logger, args.output_dir, device,
                              tr_loss[i], nb_tr_steps, global_step,
                              processor_list[i], label_list[i], tokenizer,
                              eval_loaders[i], error_analysis_dicts[i],
                              output_modes[task], i, task)
                ev_acc.append(acc)
            all_ev_acc.append(ev_acc)
            # logger.info('Average acc: {}'.format(np.mean(ev_acc)))
            if ev_acc[args.target_task_id] > best_target_score:
                best_target_score = ev_acc[args.target_task_id]
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                torch.save(model_to_save.state_dict(), output_model_file)
                bert_config.to_json_file(output_config_file)
                tokenizer.save_vocabulary(args.output_dir)

                ##TODO: this is where you should add error analysis to get best version

            logger.info('Best target acc: {}'.format(best_target_score))

        output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
        with open(output_eval_file, 'w') as writer:
            logger.info('******** Eval Results ********')
            for n, acc in enumerate(all_ev_acc):
                logger.info('   {} = {}\n'.format(n, acc))
                writer.write('{} \t {}\n'.format(n, acc))
Beispiel #16
0
    def __init__(self,
                 model,
                 vocab_size,
                 train_dataloader,
                 test_dataloader=None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10,
                 include_next=False,
                 include_vision=True,
                 total_epochs=1):
        """
        :param model: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        n_gpu = torch.cuda.device_count()
        print("device", device, "n_gpu", n_gpu)

        # Initialize the BERT Language Model, with BERT model
        self.model = model.to(self.device)
        self.bert = self.model.bert
        self.padding_idx = 0
        self.include_next = include_next
        self.include_vision = include_vision

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            #self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
            self.model = DataParallelModel(self.model,
                                           device_ids=range(
                                               torch.cuda.device_count()))

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = optim.Adamax(self.model.parameters(),
                                  lr=lr,
                                  betas=betas,
                                  weight_decay=weight_decay)
        if self.model.__class__.__name__ in [
                'DataParallel', 'DataParallelModel'
        ]:
            self.optim_schedule = ScheduledOptim(
                self.optim,
                self.model.module.bert.transformer_hidden_size,
                n_warmup_steps=warmup_steps)
        else:
            self.optim_schedule = ScheduledOptim(
                self.optim,
                self.model.bert.transformer_hidden_size,
                n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            #self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
            self.criterion = DataParallelCriterion(
                self.criterion, device_ids=range(torch.cuda.device_count()))

        self.log_freq = log_freq
        self.total_iters = total_epochs * len(train_dataloader)

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))
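
The hidden size handed to ScheduledOptim suggests the usual "Noam" warmup schedule from the original Transformer paper; a minimal sketch of that learning-rate rule, assuming this is indeed the schedule being wrapped:

def noam_lr(step: int, d_model: int, n_warmup_steps: int) -> float:
    # Linear warmup followed by inverse-square-root decay, scaled by
    # d_model ** -0.5 (Vaswani et al., 2017).
    step = max(step, 1)
    return (d_model ** -0.5) * min(step ** -0.5,
                                   step * n_warmup_steps ** -1.5)

# e.g. noam_lr(step=100, d_model=768, n_warmup_steps=10000)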
Beispiel #17
0
    def __init__(self, metric_dict: Dict):
        super(ParallelMetricSet, self).__init__(metric_dict)
        self.metrics = {
            k: DataParallelCriterion(v)
            for k, v in metric_dict.items()
        }
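
Because each metric is wrapped exactly like a loss, the set can consume the scattered per-GPU output list that a DataParallelModel produces. A hypothetical usage sketch (the MetricSet base class is not shown in this example, so a trivial stand-in is assumed):

from typing import Dict
import torch.nn as nn

class MetricSet:
    """Stand-in for the base class assumed by ParallelMetricSet."""
    def __init__(self, metric_dict: Dict):
        self.metrics = metric_dict

# metric_set = ParallelMetricSet({'nll': nn.NLLLoss(), 'mse': nn.MSELoss()})
# outputs = parallel_model(batch)              # list of per-GPU tensors
# nll = metric_set.metrics['nll'](outputs, y)  # reduced like a parallel loss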
Beispiel #18
0
def train(task_ids, model):
    tasks = [args.tasks[task_id] for task_id in task_ids]

    logger.info("start to train { task: %s, seq train type: %s }" %
                (tasks, args.seq_train_type))
    model_dir = get_model_dir(tasks)
    make_dir(model_dir)

    #train_dataset = [(TASK_DICT[t]["train"] if not args.seq_distil else TASK_DICT[t]["train"].replace("train", "distil")) for t in tasks]
    train_dataset = [
        swap_name(TASK_DICT[t]["train"], args.seq_distil, args.ref1)
        for t in tasks
    ]
    train_extra_data = []
    if "lll" in args.seq_train_type and task_ids[0] > 0 and not args.skip_tasks:
        prev_task = args.tasks[task_ids[0] - 1]
        with torch.no_grad():
            create_extra_data(tasks[0], prev_task, model, train_extra_data)
    elif "gem" in args.seq_train_type and task_ids[0] > 0:
        get_real_data(tasks[0], train_extra_data, accum=False, encode=True)
        args.memory_data.append(train_extra_data)
        train_extra_data = []
    logger.info('extra training data size: {}'.format(len(train_extra_data)))

    if not model:
        # which_model_to_load = model_dir if os.path.isfile(os.path.join(model_dir, FINAL_SAVE_NAME)) else args.model_name
        model = MODEL_CLASS.from_pretrained(args.model_name).cuda()
        model.resize_token_embeddings(len(TOKENIZER))
        if not args.fp32:
            model = FP16_Module(model)

    gen_token = get_gen_token(tasks[0])
    TOKENIZER.add_tokens([gen_token])
    TOKENIZER.save_pretrained(model_dir)
    SPECIAL_TOKENS[tasks[0]] = gen_token
    SPECIAL_TOKEN_IDS[tasks[0]] = TOKENIZER.convert_tokens_to_ids(gen_token)
    logger.info('gen token = {} , gen token id = {}'.format(
        gen_token, SPECIAL_TOKEN_IDS[tasks[0]]))
    MODEL_CONFIG.vocab_size = len(TOKENIZER)
    MODEL_CONFIG.to_json_file(os.path.join(model_dir, CONFIG_NAME))
    global TOKENS_WEIGHT
    if len(TOKENIZER) != TOKENS_WEIGHT.shape[0]:
        TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda()))

    if args.skip_tasks and len(tasks) == 1:
        logger.info("*********** skip task: {} ***********".format(tasks[0]))
        if tasks[0] in args.skip_tasks:
            if len(args.skip_tasks) == 1:
                model_dir = get_model_dir(tasks)
                model_path = os.path.join(model_dir, FINAL_SAVE_NAME)
                config_path = os.path.join(model_dir, CONFIG_NAME)
                model_config = CONFIG_CLASS.from_json_file(config_path)
                model = MODEL_CLASS(model_config).cuda()
                state_dict = torch.load(model_path)
                model.load_state_dict(state_dict)
                if not args.fp32:
                    model = FP16_Module(model)
                if args.seq_train_type in REG_TYPE_KEYS:
                    logger.info("calulating reg_params ...")
                    train_qadata = QADataset(train_dataset, "train",
                                             SPECIAL_TOKEN_IDS[tasks[0]],
                                             train_extra_data)
                    max_train_batch_size = max(
                        len(train_qadata) // args.min_n_steps,
                        args.min_batch_size)
                    train_dataloader = create_dataloader(
                        train_qadata, "train", max_train_batch_size)
                    parallel_model = DataParallelModel(WrapModel(model),
                                                       args.device_ids)
                    regularizer = REG_TYPES[args.seq_train_type](
                        model, parallel_model, [train_dataloader], tasks[0])
                    regularizer.task_start_do()
                    regularizer.task_end_do()
                    torch.save(model.state_dict(),
                               os.path.join(model_dir, FINAL_SAVE_NAME))
                    logger.info("done reg_params!")
            args.skip_tasks.remove(tasks[0])
            return model

    model.resize_token_embeddings(
        len(TOKENIZER) if not args.multitask_specific else len(TOKENIZER) + 4)
    if args.multitask_specific:
        for i in range(4):
            TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda()))
    if args.distil:
        teacher_model = MODEL_CLASS.from_pretrained(args.model_name).cuda()
        teacher_vocab_size = json.load(
            open("models/gpt2/lll/{task}_0.2/{task}/config.json".format(
                task=tasks[0])))['vocab_size']
        teacher_model.resize_token_embeddings(teacher_vocab_size)
        print("load teacher model from {}".format(
            "models/gpt2/lll/{task}_0.2/{task}/model-finish".format(
                task=tasks[0])))
        teacher_model.load_state_dict(
            torch.load("models/gpt2/lll/{task}_0.2/{task}/model-finish".format(
                task=tasks[0])))
        if not args.fp32:
            teacher_model = FP16_Module(teacher_model)
        teacher_model.eval()
        teacher_model = DataParallelModel(WrapModel(teacher_model),
                                          args.device_ids)

    if not args.fp32:  # again because resize_token_embeddings makes embedding layer fp32
        model = FP16_Module(model)

    parallel_model = DataParallelModel(WrapModel(model), args.device_ids)

    train_qadata = QADataset(train_dataset, "train",
                             SPECIAL_TOKEN_IDS[tasks[0]], train_extra_data)
    max_train_batch_size = max(
        len(train_qadata) // args.min_n_steps, args.min_batch_size)
    train_dataloader = create_dataloader(train_qadata, "train",
                                         max_train_batch_size)
    if not args.unbound and args.seq_train_type not in [
            "multitask", "multilm"
    ]:
        #n_train_epochs = TASK_DICT[tasks[0]]["n_train_epochs"]
        n_train_epochs = args.n_train_epochs[tasks[0]]
    else:
        n_train_epochs = args.n_train_epochs['_'.join(tasks)]
    n_train_optimization_steps = len(train_qadata) * n_train_epochs
    logger.info(
        'len of train dataset: {} , max train batch size {} , num of opt steps: {}'
        .format(len(train_qadata), max_train_batch_size,
                n_train_optimization_steps))

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if "gem" in args.seq_train_type:
        model.task_id = task_ids[0]
        if not hasattr(model, "grad_dims"):
            model.grad_dims = []
            for param in model.parameters():
                model.grad_dims.append(param.data.numel())
        if not hasattr(model, "grads"):
            model.grads = torch.zeros(sum(model.grad_dims), len(args.tasks))
            model.grads = model.grads.cuda()

    if args.seq_train_type in REG_TYPE_KEYS:
        optimizer = Weight_Regularized_AdamW(optimizer_grouped_parameters,
                                             lr=args.learning_rate,
                                             eps=args.adam_epsilon)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    if not args.fp32:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=None,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={
                                       'scale_window': 100,
                                       'min_scale': 1,
                                       'delayed_shift': 2
                                   })

    scheduler = AnnealingLR(optimizer,
                            start_lr=args.learning_rate,
                            warmup_iter=int(args.n_warmup_ratio *
                                            len(train_qadata)),
                            num_iters=int(n_train_optimization_steps),
                            decay_style=args.decay_style)
    train_loss_fct = DataParallelCriterion(
        CrossEntropyLoss(ignore_index=FILL_VAL, weight=TOKENS_WEIGHT),
        args.device_ids)
    if args.distil:
        kd_loss_fct = DataParallelCriterion(
            nn.KLDivLoss(reduction="batchmean"), args.device_ids)

    if args.seq_train_type in REG_TYPE_KEYS:
        copy_train_dataloader = create_dataloader(train_qadata, "train",
                                                  max_train_batch_size)
        prev_task = args.tasks[task_ids[0] - 1]
        regularizer = REG_TYPES[args.seq_train_type](model, parallel_model,
                                                     [copy_train_dataloader],
                                                     tasks[0], prev_task)
        regularizer.task_start_do()

    tot_n_steps = 0
    train_once = TrainStep(model, optimizer, scheduler)
    if "gem" in args.seq_train_type and task_ids[0] != 0:
        gem_step = GEMStep(model, parallel_model, train_loss_fct, optimizer)
    model.train()
    for ep in range(n_train_epochs):
        cum_loss, cum_qa_loss, cum_lm_loss, cur_n_inputs = 0, 0, 0, 0
        for n_steps, (_, _, cqa, _, Y, gen_X, gen_Y,
                      is_extra) in enumerate(train_dataloader):

            n_inputs = sum(_cqa.shape[0] for _cqa in cqa)
            if args.multitask_specific:
                for i in range(len(is_extra)):
                    gen_X[i][:, 0] += is_extra[i]
                    is_extra[i] = is_extra[i] * 0

            for i in range(len(cqa)):
                cqa[i] = (cqa[i].to(args.device_ids[i]), )
                Y[i] = Y[i].to(args.device_ids[i])
                gen_X[i] = (gen_X[i].to(args.device_ids[i]), )
                gen_Y[i] = gen_Y[i].to(args.device_ids[i])
                is_extra[i] = is_extra[i].to(args.device_ids[i])

            if args.distil:
                losses = get_distil_losses(teacher_model,
                                           parallel_model,
                                           cqa,
                                           Y,
                                           gen_X,
                                           gen_Y,
                                           is_extra,
                                           kd_loss_fct,
                                           train_loss_fct,
                                           args.temperature_kd,
                                           pad_idx=FILL_VAL)
            else:
                losses = get_losses(parallel_model, cqa, Y, gen_X, gen_Y,
                                    train_loss_fct)
            loss = sum(losses)
            if "gem" in args.seq_train_type and task_ids[0] != 0:
                gem_step(task_ids[0])
            train_once(loss, n_inputs)

            qa_loss = losses[0].item() * n_inputs
            lm_loss = losses[1].item() * n_inputs
            cum_loss += (qa_loss + lm_loss)
            cum_qa_loss += qa_loss
            cum_lm_loss += lm_loss
            cur_n_inputs += n_inputs

            if (n_steps + 1) % args.logging_steps == 0:
                logger.info(
                    'progress {:.3f} , lr {:.1E} , loss {:.3f} , qa loss {:.3f} , lm loss {:.3f} , avg batch size {:.1f}'
                    .format(ep + cur_n_inputs / len(train_qadata),
                            scheduler.get_lr(), cum_loss / cur_n_inputs,
                            cum_qa_loss / cur_n_inputs,
                            cum_lm_loss / cur_n_inputs,
                            cur_n_inputs / (n_steps + 1)))

        torch.save(model.state_dict(),
                   os.path.join(model_dir, SAVE_NAME + str(ep + 1)))
        tot_n_steps += (n_steps + 1)
        logger.info(
            'epoch {}/{} done , tot steps {} , lr {:.1E} , loss {:.2f} , qa loss {:.2f} , lm loss {:.2f} , avg batch size {:.1f}'
            .format(ep + 1, n_train_epochs, tot_n_steps, scheduler.get_lr(),
                    cum_loss / cur_n_inputs, cum_qa_loss / cur_n_inputs,
                    cum_lm_loss / cur_n_inputs, cur_n_inputs / (n_steps + 1)))

    # task end do for reg
    if args.seq_train_type in REG_TYPE_KEYS:
        regularizer.task_end_do()
    torch.save(model.state_dict(), os.path.join(model_dir, FINAL_SAVE_NAME))

    return model
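
One detail above that is easy to miss: TOKENS_WEIGHT is the per-class weight vector of the CrossEntropyLoss, so it has to grow in lockstep with the tokenizer. A minimal standalone illustration of that invariant (toy sizes, CPU only):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 5
tokens_weight = torch.ones(vocab_size)

# Adding one special token means one more logit column, so the weight
# vector gets a matching entry appended:
vocab_size += 1
tokens_weight = torch.cat((tokens_weight, torch.ones(1)))

loss_fct = CrossEntropyLoss(weight=tokens_weight)
logits = torch.randn(3, vocab_size)
targets = torch.tensor([0, 2, 5])
print(loss_fct(logits, targets))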
Beispiel #19
0
    def __init__(self,
                 model,
                 mask_prob: float = 0.3,
                 clip: int = 1,
                 optimizer=None,
                 beam_width: int = 5,
                 max_len_a: float = 1.1,
                 max_len_b: int = 5,
                 len_penalty_ratio: float = 0.8,
                 nll_loss: bool = False,
                 fp16: bool = False,
                 mm_mode="mixed",
                 rank: int = -1):
        self.model = model

        self.clip = clip
        self.optimizer = optimizer

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_gpu = torch.cuda.device_count()

        self.mask_prob = mask_prob
        if nll_loss:
            self.criterion = nn.NLLLoss(
                ignore_index=model.text_processor.pad_token_id())
        else:
            self.criterion = SmoothedNLLLoss(
                ignore_index=model.text_processor.pad_token_id())

        self.fp16 = False
        self.rank = rank
        if rank >= 0:
            self.device = torch.device('cuda', rank)
            torch.cuda.set_device(self.device)

        self.model = self.model.to(self.device)

        if fp16:
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level="O2")
            self.fp16 = True

        self.generator = BeamDecoder(self.model,
                                     beam_width=beam_width,
                                     max_len_a=max_len_a,
                                     max_len_b=max_len_b,
                                     len_penalty_ratio=len_penalty_ratio)
        if rank >= 0:
            self.model = DistributedDataParallel(self.model,
                                                 device_ids=[self.rank],
                                                 output_device=self.rank,
                                                 find_unused_parameters=True)
            self.generator = DistributedDataParallel(
                self.generator,
                device_ids=[self.rank],
                output_device=self.rank,
                find_unused_parameters=True)
        elif self.num_gpu > 1:
            print("Let's use", self.num_gpu, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)
            self.generator = DataParallelModel(self.generator)

        self.reference = None
        self.best_bleu = -1.0
        self.mm_mode = mm_mode
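
Note the asymmetry in the branches above: under DistributedDataParallel each process computes the loss on its own shard, so the plain criterion suffices, whereas single-process multi-GPU leaves the outputs scattered and therefore wraps the criterion as well. A compact restatement of that decision (a sketch; `rank` and `num_gpu` as in the constructor):

from torch.nn.parallel import DistributedDataParallel
from parallel import DataParallelModel, DataParallelCriterion  # assumed available

def wrap_for_parallel(model, criterion, rank, num_gpu):
    if rank >= 0:
        # DDP: one process per GPU; outputs stay local to each process.
        model = DistributedDataParallel(model, device_ids=[rank],
                                        output_device=rank)
        return model, criterion
    if num_gpu > 1:
        # Single process, many GPUs: outputs come back as a scattered list,
        # so the criterion must be parallel-aware too.
        return DataParallelModel(model), DataParallelCriterion(criterion)
    return model, criterion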
Beispiel #20
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--discriminative_finetuning',
                        action='store_true',
                        help='Whether to use discriminative fine-tuning')

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (
                num_train_optimization_steps //
                torch.distributed.get_world_size())

    # Prepare model
    #############################################################################
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        if args.discriminative_finetuning:
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1', 'layer.2', 'layer.3', 'layer.4', 'layer.5', \
            'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.01, 'lr': args.learning_rate},

                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch

                logits = model(input_ids, segment_ids, input_mask)
                loss_fct = CrossEntropyLoss(ignore_index=-1)
                loss_fct = DataParallelCriterion(loss_fct)
                logits = [
                    logits[i].view(-1, model.module.config.vocab_size)
                    for i in range(len(logits))
                ]
                loss = loss_fct(logits, lm_label_ids.view(-1))

                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)
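
The long optimizer_grouped_parameters list above implements discriminative fine-tuning: every pair of BERT layers gets the base learning rate divided by one more factor of 2.6, once with weight decay and once without. An equivalent, more compact construction (a sketch assuming the same layer naming and decay factor; the catch-all groups for parameters outside the twelve encoder layers are omitted for brevity):

def discriminative_param_groups(model, base_lr, decay_factor=2.6):
    """Layers 10-11 get base_lr, layers 8-9 get base_lr / 2.6, and so on
    down to layers 0-1 at base_lr / 2.6 ** 5."""
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    layer_pairs = [('layer.{}'.format(2 * i), 'layer.{}.'.format(2 * i + 1))
                   for i in range(6)]
    param_groups = []
    for i, names in enumerate(layer_pairs):
        lr = base_lr / decay_factor ** (5 - i)
        for wd, keep_decay in ((0.01, True), (0.0, False)):
            param_groups.append({
                'params': [
                    p for n, p in model.named_parameters()
                    if any(g in n for g in names)
                    and (not any(nd in n for nd in no_decay)) == keep_decay
                ],
                'weight_decay': wd,
                'lr': lr,
            })
    return param_groups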
Beispiel #21
0
    if config.model_type=='LSTM':
        model = LSTMLM(input_size=len(vocab),
                       embedding_size=config.embedding_size,
                       hidden_size=config.hidden_size,
                       output_size=len(vocab),
                       n_layers=config.n_layers,
                       dropout_p=config.dropout_p)
    elif config.model_type=='BiLSTM':
        model = BiLSTMLM(input_size=len(vocab),
                         embedding_size=config.embedding_size,
                         hidden_size=config.hidden_size,
                         output_size=len(vocab),
                         n_layers=config.n_layers,
                         dropout_p=config.dropout_p)
        
    loss_fn = nn.NLLLoss(ignore_index=vocab.stoi[vocab.pad_token])
    optimizer = optim.Adam(model.parameters())
    
    if config.cuda:
        if config.multi_gpu:
            from parallel import DataParallelModel, DataParallelCriterion
            model = DataParallelModel(model).cuda()
            loss_fn = DataParallelCriterion(loss_fn).cuda()
        else:
            model = model.cuda()
            loss_fn = loss_fn.cuda()
    print('=========MODEL=========\n', model)

    # Train
    for epoch in range(1, config.epochs+1):
        train()
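
The train() body is not shown here, but with a parallel criterion the step has to handle the model returning a list of per-GPU log-probability tensors. A hedged sketch of what one step could look like (the names x, y and the output shape (batch, seq_len, vocab) are assumptions):

def train_step(model, loss_fn, optimizer, x, y, vocab_size, multi_gpu):
    optimizer.zero_grad()
    y_hat = model(x)  # multi-GPU: list of (batch_i, seq_len, vocab) tensors
    if multi_gpu:
        # Flatten each replica's output before the parallel criterion,
        # mirroring the logits[i].view(-1, vocab_size) pattern used above.
        y_hat = [out.view(-1, vocab_size) for out in y_hat]
    else:
        y_hat = y_hat.view(-1, vocab_size)
    loss = loss_fn(y_hat, y.view(-1))
    loss.backward()
    optimizer.step()
    return loss.item()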
Beispiel #22
0
def createModels(args, userNum, itemNum, rt):
    if args.model == 'NCF':
        model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8]).cuda()
    elif args.model == 'GCF':
        model = GCF(userNum,
                    itemNum,
                    rt,
                    embedSize=args.embedSize,
                    layers=args.layers).cuda()
    elif args.model == 'GACFV1':
        model = GACFV1(userNum,
                       itemNum,
                       rt,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV2':
        model = GACFV2(userNum,
                       itemNum,
                       rt,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV3':
        model = GACFV3(userNum,
                       itemNum,
                       rt,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV4':
        model = GACFV4(userNum,
                       itemNum,
                       rt,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV5':
        model = GACFV5(userNum,
                       itemNum,
                       rt,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV6':
        model = GACFV6(userNum,
                       itemNum,
                       rt,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    # model = SVD(userNum,itemNum,50).cuda()
    # model = NCF(userNum,itemNum,64,layers=[128,64,32,16,8]).cuda()

    if args.evaluate == 'MSE':
        lossfn = MSELoss()
    elif args.evaluate == 'RANK':
        lossfn = BCEWithLogitsLoss()

    if args.parallel:
        model = DataParallelModel(model)  # parallelize the model
        lossfn = DataParallelCriterion(lossfn)  # parallelize the loss function
    optim = Adam(model.parameters(),
                 lr=args.lr,
                 weight_decay=args.weight_decay)
    return model, lossfn, optim
Beispiel #23
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = t_total // 100

    # Prepare optimizer and schedule (linear warmup and decay)
    optimizer_grouped_parameters = get_param_groups(args, model)
    optimizer = RAdam(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    args.logging_steps = len(train_dataloader) // 1
    args.save_steps = args.logging_steps
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    for epoch in train_iterator:
        args.current_epoch = epoch
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else None,
            }  # XLM and RoBERTa don't use segment_ids
            #   'labels':         batch[3]}
            outputs = model(**inputs)
            outputs = [outputs[i][0] for i in range(len(outputs))]

            loss_fct = CrossEntropyLoss()
            loss_fct = DataParallelCriterion(loss_fct)

            loss = loss_fct(outputs, batch[3])

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
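For reference, a minimal sketch of the DataParallelModel / DataParallelCriterion pairing the loop above relies on (the import path, toy model, and shapes are assumptions; these examples appear to use a PyTorch-Encoding-style parallel module):

import torch
from torch.nn import CrossEntropyLoss
from parallel import DataParallelModel, DataParallelCriterion  # assumed import path

model = DataParallelModel(torch.nn.Linear(16, 4).cuda())   # replicas return per-GPU outputs
criterion = DataParallelCriterion(CrossEntropyLoss())      # scatters targets, gathers the loss

x = torch.randn(8, 16).cuda()
y = torch.randint(0, 4, (8,)).cuda()
outputs = model(x)             # list with one logits tensor per GPU
loss = criterion(outputs, y)   # scalar loss reduced across replicas
loss.backward()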
Beispiel #24
0
def createModels(args, userNum, itemNum, adj):
    if args.model == 'NCF':
        model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8]).cuda()
    elif args.model == 'NMF':
        model = NMF(args.model, userNum, itemNum, 3, args.embedSize,
                    args.droprate).cuda()
    elif args.model == 'NGCFMF':
        model = NGCFMF(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers).cuda()
    elif args.model == 'NGCFMLP':
        model = NGCFMLP(userNum,
                        itemNum,
                        adj,
                        embedSize=args.embedSize,
                        layers=args.layers).cuda()
    elif args.model == 'NGCFMFMLP':
        model = NGCFMFMLP(userNum,
                          itemNum,
                          adj,
                          embedSize=args.embedSize,
                          layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MF':
        model = NGCFMF_concat_MF(userNum,
                                 itemNum,
                                 adj,
                                 embedSize=args.embedSize,
                                 layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MLP':
        model = NGCFMF_concat_MLP(userNum,
                                  itemNum,
                                  adj,
                                  embedSize=args.embedSize,
                                  layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MF':
        model = NGCFMLP_concat_MF(userNum,
                                  itemNum,
                                  adj,
                                  embedSize=args.embedSize,
                                  layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MLP':
        model = NGCFMLP_concat_MLP(userNum,
                                   itemNum,
                                   adj,
                                   embedSize=args.embedSize,
                                   layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MF_MLP':
        model = NGCFMF_concat_MF_MLP(userNum,
                                     itemNum,
                                     adj,
                                     embedSize=args.embedSize,
                                     layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MF_MLP':
        model = NGCFMLP_concat_MF_MLP(userNum,
                                      itemNum,
                                      adj,
                                      embedSize=args.embedSize,
                                      layers=args.layers).cuda()
    elif args.model == 'GACFV1':
        model = GACFV1(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV2':
        model = GACFV2(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFMask':
        model = GACFMask(userNum,
                         itemNum,
                         adj,
                         embedSize=args.embedSize,
                         layers=args.layers,
                         droprate=args.droprate).cuda()
    elif args.model == 'SPGA':
        model = SPGACF(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV3':
        model = GACFV3(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV4':
        model = GACFV4(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV5':
        model = GACFV5(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV6':
        model = GACFV6(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()

    if args.train_mode == 'PairSampling':
        lossfn = BPRLoss()
        if args.parallel:
            model = DataParallelModel(model)
            lossfn = DataParallelCriterion2(lossfn)
    elif args.train_mode == 'NegSampling':
        lossfn = BCEWithLogitsLoss()
        if args.parallel:
            model = DataParallelModel(model)  # parallelize the model
            lossfn = DataParallelCriterion(lossfn)  # parallelize the loss function
    optim = Adam(model.parameters(),
                 lr=args.lr,
                 weight_decay=args.weight_decay)
    return model, lossfn, optim
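A hedged usage sketch for this factory (the dataloader fields and the forward signature model(users, items) are assumptions about these recommenders, not shown in the example):

model, lossfn, optim = createModels(args, userNum, itemNum, adj)
for users, pos_items, neg_items in train_loader:  # PairSampling batches (assumed layout)
    optim.zero_grad()
    pos_scores = model(users.cuda(), pos_items.cuda())
    neg_scores = model(users.cuda(), neg_items.cuda())
    loss = lossfn(pos_scores, neg_scores)  # BPR ranking loss on score pairs
    loss.backward()
    optim.step()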
Beispiel #25
0
def parallelize(self):
    self.parallel = True
    self.model = DataParallelModel(self.model)
    self.criterion = DataParallelCriterion(self.criterion)
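For context, a minimal skeleton of a host class this method could belong to (the class name and constructor are assumptions; only the attributes parallelize() touches are shown):

class Trainer:
    def __init__(self, model, criterion):
        self.model = model
        self.criterion = criterion
        self.parallel = False

    def parallelize(self):
        self.parallel = True
        self.model = DataParallelModel(self.model)
        self.criterion = DataParallelCriterion(self.criterion)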
Beispiel #26
0
def main_tr(args, crossVal):
    dataLoad = ld.LoadData(args.data_dir, args.classes)
    data = dataLoad.processData(crossVal, args.data_name)

    # load the model
    model = net.MiniSeg(args.classes, aux=True)
    # build <savedir>_mod<max_epochs>/<data_name>/<model_name>, creating parents as needed
    saveDir = osp.join(args.savedir + '_mod' + str(args.max_epochs),
                       args.data_name, args.model_name)
    os.makedirs(saveDir, exist_ok=True)

    if args.gpu and torch.cuda.device_count() > 1:
        #model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)
    if args.gpu:
        model = model.cuda()

    total_parameters = sum([np.prod(p.size()) for p in model.parameters()])
    print('Total network parameters: ' + str(total_parameters))

    # define optimization criteria
    weight = torch.from_numpy(
        data['classWeights'])  # convert the numpy array to torch
    if args.gpu:
        weight = weight.cuda()

    criteria = CrossEntropyLoss2d(weight, args.ignore_label)  # class-weighted 2D cross-entropy
    if args.gpu and torch.cuda.device_count() > 1:
        criteria = DataParallelCriterion(criteria)
    if args.gpu:
        criteria = criteria.cuda()

    # compose the data with transforms
    trainDataset_main = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(args.width, args.height),
        myTransforms.RandomCropResize(int(32. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])
    trainDataset_scale1 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 1.5), int(args.height * 1.5)),
        myTransforms.RandomCropResize(int(100. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])

    trainDataset_scale2 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 1.25), int(args.height * 1.25)),
        myTransforms.RandomCropResize(int(100. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])
    trainDataset_scale3 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 0.75), int(args.height * 0.75)),
        myTransforms.RandomCropResize(int(32. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])

    valDataset = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(args.width, args.height),
        myTransforms.ToTensor()
    ])

    # since we are training from scratch, we create data loaders at different scales
    # so that we can generate more augmented data and prevent the network from overfitting
    trainLoader = torch.utils.data.DataLoader(myDataLoader.Dataset(
        data['trainIm'], data['trainAnnot'], transform=trainDataset_main),
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers,
                                              pin_memory=True,
                                              drop_last=True)

    trainLoader_scale1 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'],
                             data['trainAnnot'],
                             transform=trainDataset_scale1),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True)

    trainLoader_scale2 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'],
                             data['trainAnnot'],
                             transform=trainDataset_scale2),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True)
    trainLoader_scale3 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'],
                             data['trainAnnot'],
                             transform=trainDataset_scale3),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True)

    valLoader = torch.utils.data.DataLoader(myDataLoader.Dataset(
        data['valIm'], data['valAnnot'], transform=valDataset),
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            num_workers=args.num_workers,
                                            pin_memory=True)
    max_batches = len(trainLoader) + len(trainLoader_scale1) + len(
        trainLoader_scale2) + len(trainLoader_scale3)

    if args.gpu:
        cudnn.benchmark = True

    start_epoch = 0

    if args.pretrained is not None:
        state_dict = torch.load(args.pretrained)
        # drop the prediction-head ('pred') weights so they are re-initialized
        new_dict = OrderedDict(
            (key, value) for key, value in state_dict.items()
            if 'pred' not in key)
        model.load_state_dict(new_dict, strict=False)
        print('pretrained model loaded')

    if args.resume is not None:
        if osp.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            args.lr = checkpoint['lr']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    log_file = osp.join(saveDir, 'trainValLog_' + args.model_name + '.txt')
    if osp.isfile(log_file):
        logger = open(log_file, 'a')
    else:
        logger = open(log_file, 'w')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write("\n%s\t%s\t\t%s\t%s\t%s\t%s\tlr" %
                     ('CrossVal', 'Epoch', 'Loss(Tr)', 'Loss(val)',
                      'mIOU (tr)', 'mIOU (val)'))
    logger.flush()

    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr, (0.9, 0.999),
                                 eps=1e-08,
                                 weight_decay=1e-4)
    maxmIOU = 0
    maxEpoch = 0
    print(args.model_name + '-CrossVal: ' + str(crossVal + 1))
    for epoch in range(start_epoch, args.max_epochs):
        # train for one epoch
        cur_iter = 0

        train(args, trainLoader_scale1, model, criteria, optimizer, epoch,
              max_batches, cur_iter)
        cur_iter += len(trainLoader_scale1)
        train(args, trainLoader_scale2, model, criteria, optimizer, epoch,
              max_batches, cur_iter)
        cur_iter += len(trainLoader_scale2)
        train(args, trainLoader_scale3, model, criteria, optimizer, epoch,
              max_batches, cur_iter)
        cur_iter += len(trainLoader_scale3)
        lossTr, overall_acc_tr, per_class_acc_tr, per_class_iu_tr, mIOU_tr, lr = \
                train(args, trainLoader, model, criteria, optimizer, epoch, max_batches, cur_iter)

        # evaluate on validation set
        lossVal, overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = \
                val(args, valLoader, model, criteria)

        torch.save(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lossTr': lossTr,
                'lossVal': lossVal,
                'iouTr': mIOU_tr,
                'iouVal': mIOU_val,
                'lr': lr
            },
            osp.join(
                saveDir, 'checkpoint_' + args.model_name + '_crossVal' +
                str(crossVal + 1) + '.pth.tar'))

        # save the model also
        model_file_name = osp.join(
            saveDir, 'model_' + args.model_name + '_crossVal' +
            str(crossVal + 1) + '_' + str(epoch + 1) + '.pth')
        torch.save(model.state_dict(), model_file_name)

        logger.write(
            "\n%d\t\t%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.7f" %
            (crossVal + 1, epoch + 1, lossTr, lossVal, mIOU_tr, mIOU_val, lr))
        logger.flush()
        print("\nEpoch No. %d:\tTrain Loss = %.4f\tVal Loss = %.4f\t mIOU(tr) = %.4f\t mIOU(val) = %.4f\n" \
                % (epoch + 1, lossTr, lossVal, mIOU_tr, mIOU_val))

        if mIOU_val >= maxmIOU:
            maxmIOU = mIOU_val
            maxEpoch = epoch + 1
        torch.cuda.empty_cache()
    logger.flush()
    logger.close()
    return maxEpoch, maxmIOU
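A hedged sketch of the cross-validation driver that would call main_tr (the fold count and the parse_args helper are assumptions):

if __name__ == '__main__':
    args = parse_args()  # hypothetical argparse helper
    for crossVal in range(5):  # fold count is an assumption
        maxEpoch, maxmIOU = main_tr(args, crossVal)
        print('CrossVal %d: best mIOU %.4f at epoch %d' %
              (crossVal + 1, maxmIOU, maxEpoch))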