Example #1
    def __init__(self,
                 model,
                 mask_prob: float = 0.15,
                 clip: int = 1,
                 optimizer=None):
        self.model = model
        self.clip = clip
        self.optimizer = optimizer

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self.mask_prob = mask_prob
        self.criterion = nn.NLLLoss(
            ignore_index=model.text_processor.pad_token_id())

        num_gpu = torch.cuda.device_count()
        if num_gpu > 1:
            print("Let's use", num_gpu, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)

        self.best_dev_loss = float("inf")
        self.best_train_loss = float("inf")
        self.last_train_loss = float("inf")
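The examples on this page share one pattern: wrap the network in DataParallelModel and the loss in DataParallelCriterion, so a forward pass returns one output per GPU and the criterion scatters the targets before reducing the loss. A minimal sketch of the resulting training step, assuming that wrapper pair; the helper below is illustrative and not part of Example #1:

import torch

def train_step(model, criterion, optimizer, inputs, targets, clip=1):
    """One optimization step with the wrapped model/criterion pair."""
    optimizer.zero_grad()
    outputs = model(inputs)              # a list of per-GPU outputs when parallelized
    loss = criterion(outputs, targets)   # targets are scattered, loss is reduced
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    return loss.item()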
Example #2
def createModels(args, userNum, itemNum, adj=None):  # adj: graph adjacency, used only by SPUIGAGPCF
    if args.model == 'SPUIGACF':
        model = SPUIGACF(userNum,
                         itemNum,
                         embedSize=args.embedSize,
                         layers=args.layers,
                         droprate=args.droprate).cuda()
    elif args.model == 'SPUIMultiGACF':
        model = SPUIMultiGACF(userNum,
                              itemNum,
                              embedSize=args.embedSize,
                              layers=args.layers,
                              droprate=args.droprate).cuda()
    elif args.model == 'SPUIGAGPCF':
        model = SPUIGAGPCF(userNum,
                           itemNum,
                           adj,
                           embedSize=args.embedSize,
                           layers=args.layers,
                           droprate=args.droprate).cuda()
    else:
        raise ValueError("Unknown model: {}".format(args.model))

    if args.train_mode == 'PairSampling':
        lossfn = BPRLoss()
        if args.parallel:
            model = DataParallelModel(model)
            lossfn = DataParallelCriterion2(lossfn)
    elif args.train_mode == 'NegSampling':
        lossfn = BCEWithLogitsLoss()
        if args.parallel:
            model = DataParallelModel(model)  # parallelize the model
            lossfn = DataParallelCriterion(lossfn)  # parallelize the loss function
    optim = Adam(model.parameters(),
                 lr=args.lr,
                 weight_decay=args.weight_decay)
    return model, lossfn, optim
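For reference, a hedged invocation of the factory above; the Namespace fields mirror the argparse flags createModels reads and are assumptions about the full script:

from argparse import Namespace

args = Namespace(model='SPUIGACF', train_mode='NegSampling', parallel=True,
                 embedSize=64, layers=[64, 64], droprate=0.1,
                 lr=1e-3, weight_decay=1e-5)
model, lossfn, optimizer = createModels(args, userNum=1000, itemNum=2000)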
Example #3
    def __init__(self, cfg: Namespace, data: Dataset):
        """
        Args:
            cfg:  configuration
            data:  train dataset
        """
        self.cfg = cfg
        self.train, self.valid = data.split(0.8)
        RATING_FIELD.build_vocab(self.train)

        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')  # pylint: disable=no-member
        self.batch_size = cfg.batch_size
        if torch.cuda.is_available():
            self.batch_size *= torch.cuda.device_count()

        self.trn_itr = BucketIterator(
            self.train,
            device=self.device,
            batch_size=self.batch_size,
            shuffle=True,
            train=True,
            sort_within_batch=True,
            sort_key=lambda exam: -len(exam.comment_text))
        self.vld_itr = BucketIterator(
            self.valid,
            device=self.device,
            batch_size=self.batch_size,
            shuffle=False,
            train=False,
            sort_within_batch=True,
            sort_key=lambda exam: -len(exam.comment_text))
        self.log_step = 1000
        if len(self.vld_itr) < 100:
            self.log_step = 10
        elif len(self.vld_itr) < 1000:
            self.log_step = 100

        bert_path = cfg.bert_path if cfg.bert_path else 'bert-base-cased'
        self.model = BertForSequenceClassification.from_pretrained(
            bert_path, num_labels=2)
        pos_weight = (
            len([exam for exam in self.train.examples if exam.target < 0.5]) /
            len([exam for exam in self.train.examples if exam.target >= 0.5]))
        pos_wgt_tensor = torch.tensor([1.0, pos_weight], device=self.device)  # pylint: disable=not-callable
        self.criterion = nn.CrossEntropyLoss(weight=pos_wgt_tensor)
        if torch.cuda.is_available():
            self.model = DataParallelModel(self.model.cuda())
            self.criterion = DataParallelCriterion(self.criterion)
        self.optimizer = optim.Adam(self.model.parameters(), cfg.learning_rate)
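The pos_weight computed above is simply the negative-to-positive ratio of the training targets. A self-contained toy check, with illustrative values only:

targets = [0.0, 0.1, 0.4, 0.9]  # three negatives, one positive
pos_weight = (len([t for t in targets if t < 0.5]) /
              len([t for t in targets if t >= 0.5]))
assert pos_weight == 3.0  # the positive class is up-weighted 3x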
Example #4
    def __init__(self,
                 model,
                 caption_model,
                 mask_prob: float = 0.3,
                 clip: int = 1,
                 optimizer=None,
                 beam_width: int = 5,
                 max_len_a: float = 1.1,
                 max_len_b: int = 5,
                 len_penalty_ratio: float = 0.8,
                 nll_loss: bool = False,
                 fp16: bool = False,
                 mm_mode="mixed"):
        super().__init__(model, mask_prob, clip, optimizer, beam_width,
                         max_len_a, max_len_b, len_penalty_ratio, nll_loss,
                         fp16, mm_mode)
        self.caption_model = caption_model
        self.caption_model.eval()
        self.caption_model = self.caption_model.to(self.device)

        if self.num_gpu == 1 and fp16:
            self.caption_model = amp.initialize(self.caption_model,
                                                opt_level="O2")

        if self.num_gpu > 1:
            print("Let's use", self.num_gpu, "GPUs!")
            self.caption_model = DataParallelModel(self.caption_model)
Example #5
def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaders = create_datasets(num_workers=32, batch_size=600)
    # info = pd.read_csv("./flower_data/train.csv")[["image","label"]]
    # class_weights = torch.tensor(1.0/info.groupby(["label"]).count().values.astype(np.float32))
    # del info
    models_ensemble = [
        # {"name": "vgg", "model": models.vgg16_bn(pretrained=True)},
        {"name": "resnet", "model": models.resnet50(pretrained=True)},
        # {"name": "densenet", "model": models.densenet121(pretrained=True)},
        {"name": "resnet", "model": models.resnet101(pretrained=True)},
    ]

    # model = Ensemble(models_ensemble, name="star_ensemble")
    model = load_checkpoint("ensemble_iso_star_5118.pt")

    ft, cl = model.get_parameters()
    # model = nn.DataParallel(model)
    model = DataParallelModel(model)
    model = model.to(device)
    weight = torch.from_numpy(weight_train[0]).to(device)  # weight_train: precomputed class weights, defined elsewhere
    criterion = nn.NLLLoss(weight)
    criterion = DataParallelCriterion(criterion)
  
    optimizers = [optim.Adam(ft, lr=5e-4), optim.Adam(cl, lr=5e-3)]
    # # print("")
    # # print('-' * 40)
    # # print("lr = {} bs= {}".format(lr,bs) )
    # # print('-' * 40)

    # # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_schedulers = [lr_scheduler.StepLR(optimizers[0], step_size=1, gamma=0.995),
                         lr_scheduler.StepLR(optimizers[1], step_size=1, gamma=0.992)]


    train_args = [model, criterion, optimizers, exp_lr_schedulers, device]

    model = train_model(*train_args, loaders, num_epochs=100)
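The two optimizers above give the pretrained backbone (ft) a smaller learning rate than the fresh classifier head (cl). An equivalent single-optimizer sketch using parameter groups, assuming the same ft/cl split:

import torch.optim as optim

optimizer = optim.Adam([
    {"params": ft, "lr": 5e-4},  # pretrained feature extractor
    {"params": cl, "lr": 5e-3},  # newly initialized classifier head
])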
Example #6
def build_model(options):
    model = Seq2Seq.load(ImageCaptioning,
                         options.model_path,
                         tok_dir=options.tokenizer_path,
                         use_obj=options.obj)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    num_gpu = torch.cuda.device_count()
    generator = BeamDecoder(model,
                            beam_width=options.beam_width,
                            max_len_a=options.max_len_a,
                            max_len_b=options.max_len_b,
                            len_penalty_ratio=options.len_penalty_ratio)
    if options.fp16:
        generator = amp.initialize(generator, opt_level="O2")
    if num_gpu > 1:
        generator = DataParallelModel(generator)
    return generator, model.text_processor
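Because DataParallelModel makes the wrapped BeamDecoder return one result per GPU, callers typically normalize the single- and multi-GPU cases. A hedged consumer sketch; the flattening assumes each replica returns a list of decoded sequences, which is an assumption, not shown in the original:

def run_generator(generator, batch):
    outputs = generator(batch)
    if isinstance(outputs, list) and outputs and isinstance(outputs[0], list):
        # multi-GPU: concatenate the per-replica result lists
        outputs = [seq for per_gpu in outputs for seq in per_gpu]
    return outputs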
Example #7
def main():
    parser = setup_parser()
    args = parser.parse_args()

    processors = {
        'stsb': StsbProcessor,
        'mednli': MednliProcessor,
        'medsts': MedstsProcessor
    }

    output_modes = {
        'mnli': 'classification',
        'stsb': 'regression',
        'mednli': 'classification',
        'medsts': 'regression'
    }

    bert_types = {
        'discharge':
        '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_disch_100000',
        'all':
        '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_all_notes_150000',
        'base_uncased': 'bert-base-uncased',
        'base_cased': 'bert-base-cased'
    }

    ##################################################################################################
    ################################### SETUP DATA, DEVICE, MODEL ####################################
    ##################################################################################################
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device('cuda' if torch.cuda.is_available()
                              and not args.no_cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        # Initialize the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: {}".format(task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels(output_mode)
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    ##################################################################################################
    ########################################### OPTIMIZER ############################################
    ##################################################################################################

    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        if args.discriminative_finetuning:
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1.', 'layer.2', 'layer.3', 'layer.4', 'layer.5', \
            'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.01, 'lr': args.learning_rate},

                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    ##################################################################################################
    ############################################# TRAIN ##############################################
    ##################################################################################################
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.float)

        all_pids = np.array([f.pid for f in eval_features])

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     drop_last=True)

        model.train()
        epoch_metric = {}
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [
                        logits[i].view(-1, num_labels)
                        for i in range(len(logits))
                    ]
                    loss = loss_fct(logits, label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  # average across GPUs
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify lr with the special warmup BERT uses;
                        # if args.fp16 is False, BertAdam handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            with torch.no_grad():
                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                preds = []
                i = 0

                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        logits = model(input_ids,
                                       segment_ids,
                                       input_mask,
                                       labels=None)

                    if output_mode == 'classification':
                        # loss_fct = CrossEntropyLoss()
                        # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                        loss_fct = CrossEntropyLoss()
                        loss_fct = DataParallelCriterion(loss_fct)
                        logits = [
                            logits[i].view(-1, num_labels)
                            for i in range(len(logits))
                        ]
                        tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
                    elif output_mode == 'regression':
                        # loss_fct = MSELoss()
                        # tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

                        loss_fct = MSELoss()
                        loss_fct = DataParallelCriterion(loss_fct)
                        logits = [
                            logits[i].view(-1) for i in range(len(logits))
                        ]
                        tmp_eval_loss = loss_fct(logits, label_ids.view(-1))

                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1
                    logits = parallel.gather(logits, target_device='cuda:0')
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(preds[0],
                                             logits.detach().cpu().numpy(),
                                             axis=0)
                eval_loss = eval_loss / nb_eval_steps
                preds = preds[0]
                if output_mode == 'classification':
                    preds = np.argmax(preds, axis=1)
                elif output_mode == 'regression':
                    preds = np.squeeze(preds)

                all_label_ids = all_label_ids[:preds.shape[0]]
                all_pids = all_pids[:preds.shape[0]]
                errors = generate_errors(preds, all_label_ids.numpy(),
                                         all_pids)

                result = compute_metrics(task_name, preds,
                                         all_label_ids.numpy())

                loss = tr_loss / global_step if args.do_train else None

                result['eval_loss'] = eval_loss
                result['global_step'] = global_step
                result['loss'] = loss
                logger.info('***** Eval Results *****')
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))

                epoch_metric[_] = result[
                    'pearson'] if output_mode == 'regression' else result['acc']

        output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
        with open(output_eval_file, 'w') as writer:
            logger.info('***** Eval Results *****')
            # for key in sorted(result.keys()):
            #     logger.info("  %s = %s", key, str(result[key]))
            #     writer.write("%s = %s\n" % (key, str(result[key])))
            # writer.write("{}     {}\n".format("epoch","pearson"))
            for key in sorted(epoch_metric.keys()):
                writer.write("{}\t{}\t{}\t{}\n".format(key,
                                                       str(epoch_metric[key]),
                                                       args.learning_rate,
                                                       args.train_batch_size))

        errors.to_csv('errors.txt', sep='\t', index=False)

    ##################################################################################################
    ########################################## SAVE & RELOAD #########################################
    ##################################################################################################
    if args.do_train:
        # Save the trained model, config, and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    model.to(device)
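One efficiency note on the training and eval loops above: the DataParallelCriterion wrapper is rebuilt on every batch. Building each wrapper once before the loop is functionally equivalent and cheaper; a hedged sketch, assuming the same imports and parallel utilities as the example:

from torch.nn import CrossEntropyLoss, MSELoss

# construct once, then reuse inside the loop instead of rewrapping per batch
loss_fct_cls = DataParallelCriterion(CrossEntropyLoss())
loss_fct_reg = DataParallelCriterion(MSELoss())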
Example #8
def main():
    epochs = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0

    resume = None
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None
    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True)

    num_words = dataset.words_count

    # set up logger
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words, test_max_len=val_dataset.max_len)
    if resume is not None:
        net.collect_params().load(resume,
                                  allow_missing=True,
                                  ignore_extra=True)
        logger.info("Resumed form checkpoint {}.".format(resume))
    params = net.collect_params()
    for key in params.keys():
        if params[key]._data is not None:
            continue
        if "bias" in key or "mean" in key or "beta" in key:
            params[key].initialize(init=mx.init.Zero())
            logging.info("initialized {} using Zero.".format(key))
        elif "weight" in key:
            params[key].initialize(init=mx.init.Normal())
            logging.info("initialized {} using Normal.".format(key))
        elif "var" in key or "gamma" in key:
            params[key].initialize(init=mx.init.One())
            logging.info("initialized {} using One.".format(key))
        else:
            params[key].initialize(init=mx.init.Normal())
            logging.info("initialized {} using Normal.".format(key))

    net.collect_params().reset_ctx(ctx=ctx_list)
    trainer = mx.gluon.Trainer(
        net.collect_params(),
        'adam',
        {
            'learning_rate': 4e-4,
            'clip_gradient': 5,
            'multi_precision': True
        },
    )
    if trainer_resume is not None:
        trainer.load_states(trainer_resume)
        logger.info(
            "Loaded trainer states from checkpoint {}.".format(trainer_resume))
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))
    # net.hybridize(static_alloc=True, static_shape=True)
    net_parallel = DataParallelModel(net, ctx_list=ctx_list, sync=True)
    for nepoch in range(start_epoch, epochs):
        if nepoch > 15:
            trainer.set_learning_rate(4e-5)
        logger.info("Current lr: {}".format(trainer.learning_rate))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            batch = [mx.gluon.utils.split_and_load(x, ctx_list) for x in batch]
            inputs = [[x[n] for x in batch] for n, _ in enumerate(ctx_list)]
            losses = []
            with ag.record():
                net_parallel.sync = nbatch > 1
                outputs = net_parallel(*inputs)
                for s_batch, s_outputs in zip(inputs, outputs):
                    image, label, label_len = s_batch
                    predictions, alphas = s_outputs
                    ctc_loss = criterion(predictions, label, label_len)
                    loss2 = 1.0 * ((1. - alphas.sum(axis=1))**2).mean()
                    losses.extend([ctc_loss, loss2])
            ag.backward(losses)
            trainer.step(batch_size=batch_size, ignore_stale_grad=True)
            for n, l in enumerate(label_len):
                l = int(l.asscalar())
                la = label[n, 1:l]
                pred = predictions[n, :(l - 1)]
                accu_top3_metric.update(la, pred)
                accu_top1_metric.update(la, pred)
                epoch_bleu.update(la, predictions[n, :])
                batch_bleu.update(la, predictions[n, :])
            ctc_loss_metric.update(None,
                                   preds=nd.sum(ctc_loss) / image.shape[0])
            alpha_metric.update(None, preds=loss2)
            if nbatch % log_interval == 0 and nbatch > 0:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get()) for metric in [
                        epoch_bleu, batch_bleu, accu_top1_metric,
                        accu_top3_metric, ctc_loss_metric, alpha_metric
                    ]
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(nepoch, nbatch,
                           log_interval * batch_size / (time.time() - btic),
                           msg))
                btic = time.time()
                batch_bleu.reset()
                accu_top1_metric.reset()
                accu_top3_metric.reset()
                ctc_loss_metric.reset()
                alpha_metric.reset()

        bleu, acc_top1 = validate(net,
                                  gpu_id=gpu_id,
                                  val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        net.collect_params().save(save_path)
        trainer.save_states(fname=save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))
Example #9
def main(args):
    init(args)
    #Args setup:
    save_dir = os.path.join(args.output_dir, args.experiment_name,
                            "checkpoints")
    save_dir_local = "checkpoints_local"
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_dir_local, exist_ok=True)

    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    p = args.p
    n_ctx = args.n_ctx
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    #Text Encoder
    if args.use_offline_gpt2:
        text_encoder = GPT2Tokenizer.from_pretrained('./gpt2model')
    elif args.debug_mode:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2-medium')

    text_encoder.add_special_tokens({
        'bos_token':
        '_start_',
        'cls_token':
        '_classify_',
        'eos_token':
        '_end_',
        'additional_special_tokens':
        ['_kw_', '_endkw_', '_t_', '_i_', '_b_', '_c_']
    })

    vocab = len(text_encoder)

    print("Loading dataset...")
    if args.use_model == "base":
        train_loader = get_paragraph_input_loader(
            os.path.join(data_dir, "train_encoded.jsonl"),
            args.n_batch,
            text_encoder,
            num_workers=3,
            shuffle=True,
            gen_len=gen_len,
            n_ctx=n_ctx,
            include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat,
            max_size=args.max_ex,
            include_kw=not args.exclude_kw,
            dim=args.n_embd,
            debug_mode=args.debug_mode)

        val_loader = get_paragraph_input_loader(
            os.path.join(data_dir, "val_encoded.jsonl"),
            n_gpu,
            text_encoder,
            num_workers=0,
            shuffle=False,
            gen_len=gen_len,
            n_ctx=n_ctx,
            include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat,
            max_size=args.num_val_examples,
            include_kw=not args.exclude_kw,
            dim=args.n_embd,
            debug_mode=args.debug_mode)

        print("Train length: {}, Validation length: {}".format(
            len(train_loader), len(val_loader)))
        doc_model = GPT2BaseModel(args,
                                  vocab=vocab,
                                  n_ctx=n_ctx,
                                  gen_len=gen_len,
                                  lastidx=text_encoder.eos_token_id,
                                  includeprev=args.use_neighbor_feat,
                                  use_offline_gpt2=args.use_offline_gpt2)

    elif args.use_model == "plotmachines":
        #asli
        train_loader = get_paragraph_memory_input_loader(
            os.path.join(data_dir, "train_encoded.jsonl"),
            args.n_batch,
            text_encoder,
            num_workers=3,
            shuffle=True,
            gen_len=gen_len,
            n_ctx=n_ctx,
            include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat,
            max_size=args.max_ex,
            include_kw=not args.exclude_kw,
            memsize=args.memstatesize,
            dim=args.n_embd,
            use_kwmem=True,
            debug_mode=args.debug_mode)

        val_loader = get_paragraph_memory_input_loader(
            os.path.join(data_dir, "val_encoded.jsonl"),
            n_gpu,
            text_encoder,
            num_workers=0,
            shuffle=False,
            gen_len=gen_len,
            n_ctx=n_ctx,
            include_discourse_type=args.use_discourse,
            include_neigh=args.use_neighbor_feat,
            max_size=args.num_val_examples,
            include_kw=not args.exclude_kw,
            memsize=args.memstatesize,
            dim=args.n_embd,
            use_kwmem=True,
            debug_mode=args.debug_mode)

        print("Train length: {}, Validation length: {}".format(
            len(train_loader), len(val_loader)))
        doc_model = PlotMachinesModel(args,
                                      vocab=vocab,
                                      n_ctx=n_ctx,
                                      gen_len=gen_len,
                                      lastidx=text_encoder.eos_token_id,
                                      includeprev=args.use_neighbor_feat,
                                      use_offline_gpt2=args.use_offline_gpt2)

    n_updates_total = (len(train_loader) //
                       args.accum_iter) * (args.num_epochs)

    if args.debug_mode:
        print_model_params(log_dir, doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")

    model_opt = AdamW(filter(lambda p: p.requires_grad,
                             doc_model.parameters()),
                      lr=args.lr,
                      betas=(args.b1, args.b2),
                      eps=args.e)

    lm_loss = ParagraphLoss(criterion, n_ctx=n_ctx, gen_len=gen_len)

    print("Loading Model")
    doc_model.to(device)
    if n_gpu > 1:
        doc_model = DataParallelModel(doc_model)
        lm_loss = DataParallelCriterion(lm_loss)
    print("Parallelized")

    bestloss = -1
    start_iter, running_loss = 1, 0
    prevloss = 1000

    start_iter, running_loss = load_checkpoint(args.checkpoint, doc_model,
                                               model_opt)
    for i in range(args.num_epochs):
        start_iter, running_loss, bestloss, updates, val_loss1 = run_epoch(
            bestloss,
            start_iter,
            running_loss,
            doc_model,
            lm_loss,
            model_opt,
            train_loader,
            val_loader,
            train_log_interval,
            val_log_interval,
            device,
            beam,
            gen_len,
            k,
            p,
            decoding_strategy,
            accum_iter,
            "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs),
            save_dir,
            logger,
            text_encoder,
            show_progress=args.show_progress,
            my_local_dir=save_dir_local)
        print("VAL LOSS: ", str(val_loss1))
        if val_loss1 > prevloss or math.isnan(val_loss1):
            break
        prevloss = val_loss1

    print('Done training...')
    print('Evaluating on validation with best checkpoint...')

    bestcheck = os.path.join(save_dir, "checkpoint_best.pt")
    checkpoint = torch.load(bestcheck, map_location='cpu')
    state_dict = checkpoint["state_dict"]
    if state_dict.get('module.pos_emb_mask') is None and doc_model.state_dict(
    ).get('module.pos_emb_mask') is not None:
        state_dict['module.pos_emb_mask'] = doc_model.state_dict().get(
            'module.pos_emb_mask')
    doc_model.load_state_dict(state_dict)
    evaluate_doc_model(doc_model, val_loader, text_encoder, device, beam,
                       gen_len, k, p, args.decoding_strategy,
                       os.path.join(save_dir, 'valeval.log'), 'gen', 'tgt',
                       gen_len, [], args)
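The best-checkpoint reload above has to patch 'module.pos_emb_mask' because DataParallelModel prefixes parameter names with 'module.'. A hedged general-purpose helper for that mismatch; the name and approach are illustrative, not from the original repo:

def remap_module_prefix(state_dict, model):
    """Align the 'module.' prefix between a checkpoint and a model."""
    wants = any(k.startswith('module.') for k in model.state_dict())
    has = any(k.startswith('module.') for k in state_dict)
    if wants and not has:
        return {'module.' + k: v for k, v in state_dict.items()}
    if has and not wants:
        return {k[len('module.'):]: v for k, v in state_dict.items()}
    return state_dict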
Example #10
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--discriminative_finetuning',
                        action='store_true',
                        help='Whether to use discriminative fine-tuning')

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    #############################################################################
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        if args.discriminative_finetuning:
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1', 'layer.2', 'layer.3', 'layer.4', 'layer.5', \
            'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.01, 'lr': args.learning_rate},

                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch

                logits = model(input_ids, segment_ids, input_mask)
                loss_fct = CrossEntropyLoss(ignore_index=-1)
                loss_fct = DataParallelCriterion(loss_fct)
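                # the parallel model returns one logits tensor per GPU;
                # flatten each chunk to (tokens, vocab) so the parallel
                # criterion can scatter the flattened labels across devices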
                logits = [
                    logits[i].view(-1, model.module.config.vocab_size)
                    for i in range(len(logits))
                ]
                loss = loss_fct(logits, lm_label_ids.view(-1))

                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
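
Editor's note: the parameter groups at the top of this example implement ULMFiT-style discriminative fine-tuning: the deepest layer group keeps the base learning rate, and every earlier group is divided by 2.6 once more. A minimal sketch of the same idea, assuming a model that exposes its blocks as an ordered `model.layers` container (a hypothetical layout):

import torch.nn as nn

def layerwise_lr_groups(model: nn.Module, base_lr: float, factor: float = 2.6):
    # deepest group keeps base_lr; each earlier group is divided by
    # `factor` once more, mirroring the learning_rate/2.6**k pattern above
    layers = list(model.layers)  # hypothetical ordered container of blocks
    n = len(layers)
    return [{"params": layer.parameters(),
             "lr": base_lr / factor ** (n - 1 - depth)}
            for depth, layer in enumerate(layers)]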
Example #11
def main(args):
    init(args)
    #Args setup:

    beam = args.beam
    p = args.p
    n_ctx = args.n_ctx
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    data_dir = args.data_dir
    #Text Encoder

    if args.debug_mode:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2-medium')
    text_encoder.add_special_tokens({
        'bos_token':
        '_start_',
        'cls_token':
        '_classify_',
        'eos_token':
        '_end_',
        'additional_special_tokens':
        ['_kw_', '_endkw_', '_t_', '_i_', '_b_', '_c_']
    })

    vocab = len(text_encoder)

    datafile = os.path.join(
        data_dir, "test_encoded.jsonl") if args.testset else os.path.join(
            data_dir, "val_encoded.jsonl")
    print("Loading dataset...")
    val_loader = get_fullstory_loader(datafile,
                                      args.n_batch,
                                      text_encoder,
                                      num_workers=0,
                                      shuffle=False,
                                      gen_len=gen_len,
                                      n_ctx=n_ctx,
                                      include_kw=not args.exclude_kw,
                                      max_size=args.max_ex)
    print(len(val_loader))

    if args.use_model == "plotmachines":
        doc_model = PlotMachinesModel(args,
                                      vocab=vocab,
                                      n_ctx=n_ctx,
                                      gen_len=gen_len,
                                      lastidx=text_encoder.eos_token_id,
                                      includeprev=args.use_neighbor_feat)
    else:
        doc_model = GPT2BaseModel(args,
                                  vocab=vocab,
                                  n_ctx=n_ctx,
                                  gen_len=gen_len,
                                  lastidx=text_encoder.eos_token_id,
                                  includeprev=args.use_neighbor_feat)

    doc_model.to(device)
    if n_gpu > 1:
        doc_model = DataParallelModel(doc_model)

    if args.debug_mode:
        gptclf = GPT2Model.from_pretrained('gpt2')
        gptclf.eval()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        gptclf.to(device)
        #gpttok = gptTokenizer.from_pretrained('openai-gpt')
        gpttok = GPT2Tokenizer.from_pretrained('gpt2')

    else:
        gptclf = GPT2Model.from_pretrained('gpt2-medium')
        gptclf.eval()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        gptclf.to(device)
        #gpttok = gptTokenizer.from_pretrained('openai-gpt')
        gpttok = GPT2Tokenizer.from_pretrained('gpt2-medium')

    prevloss = []
    upd = []
    start_iter, running_loss = 1, 0
    load_dir = args.load_dir
    bestcheck = os.path.join(load_dir, "checkpoint_best.pt")
    checkpoint = torch.load(bestcheck, map_location='cpu')
    state_dict = checkpoint["state_dict"]
    if n_gpu == 1:
        if state_dict.get(
                'module.pos_emb_mask') is None and doc_model.state_dict().get(
                    'pos_emb_mask') is not None:
            state_dict['module.pos_emb_mask'] = doc_model.state_dict().get(
                'pos_emb_mask')
        for k in list(state_dict.keys()):
            state_dict[k[7:]] = state_dict[k]
            del state_dict[k]
    else:
        if state_dict.get(
                'module.pos_emb_mask') is None and doc_model.state_dict().get(
                    'module.pos_emb_mask') is not None:
            state_dict['module.pos_emb_mask'] = doc_model.state_dict().get(
                'module.pos_emb_mask')
    doc_model.load_state_dict(state_dict)

    print("Parallelized")
    tagset = ['_i_'] + args.bodynum * ['_b_'] + ['_c_']
    vort = 'test' if args.testset else 'val'
    generatedocs(doc_model,
                 gptclf,
                 gpttok,
                 val_loader,
                 text_encoder,
                 device,
                 beam,
                 gen_len,
                 k,
                 p,
                 args.decoding_strategy,
                 os.path.join(args.save_dir, vort + '.gens.tsv'),
                 'gen',
                 'tgt',
                 gen_len, [],
                 args,
                 tags=tagset,
                 dim=args.n_embd,
                 save_dir=args.save_dir,
                 localfile=os.path.join('/tmp', vort + '.gens.tsv'))

    print('done decoding....')
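
Editor's note: the key surgery above (copying every key with its first seven characters dropped) is the standard fix for checkpoints saved from a DataParallel-wrapped model, whose state-dict keys all carry a "module." prefix. A generic sketch of the same pattern, assuming the checkpoint holds a plain state dict:

def strip_module_prefix(state_dict):
    # keys saved from nn.DataParallel / DataParallelModel look like
    # "module.encoder.weight"; an unwrapped model expects "encoder.weight"
    return {k[len("module."):] if k.startswith("module.") else k: v
            for k, v in state_dict.items()}

# hypothetical usage:
# model.load_state_dict(strip_module_prefix(checkpoint["state_dict"]))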
Example #12
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = t_total // 100

    # Prepare optimizer and schedule (linear warmup and decay)
    optimizer_grouped_parameters = get_param_groups(args, model)
    optimizer = RAdam(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    args.logging_steps = len(train_dataloader)
    args.save_steps = args.logging_steps
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    for _ in train_iterator:
        args.current_epoch = _
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else None,
            }  # XLM and RoBERTa don't use segment_ids
            #   'labels':         batch[3]}
            outputs = model(**inputs)
            outputs = [outputs[i][0] for i in range(len(outputs))]

            loss_fct = CrossEntropyLoss()
            loss_fct = DataParallelCriterion(loss_fct)

            loss = loss_fct(outputs, batch[3])

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
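
Editor's note: WarmupLinearSchedule above raises the learning rate linearly for args.warmup_steps steps and then decays it linearly to zero at t_total. A sketch of that multiplier (an approximation of the schedule's shape, not the library code):

def warmup_linear_factor(step: int, warmup_steps: int, t_total: int) -> float:
    # multiplier applied to the base LR: 0 -> 1 during warmup, then 1 -> 0
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1, t_total - warmup_steps))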
Example #13
lr_rate = 0.03
milestones = [5, 7, 8, 10, 12, 14, 16, 17, 18]
img_size = 384
gamma = 0.5

#use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#use_cuda = False

segm_model = ResNetLinkModel(input_channels=1, pretrained=True, num_classes=3)

if torch.cuda.device_count() > 1:
    #dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    #segm_model = nn.DataParallel(segm_model)
    #segm_model = encoding.parallel.DataParallelModel(segm_model, device_ids=[0,1,2,3,4,5,6,7])
    segm_model = DataParallelModel(segm_model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")
segm_model.to(device)
'''if use_cuda:
    segm_model.cuda()
seg_model = nn.DataParallel(seg_model)'''

mul_transf = [
    transforms.Resize(size=(img_size, img_size)),
    transforms.ToTensor()
]
#optimizer = optim.SGD(segm_model.parameters(), lr=lr_rate, momentum=momentum)
optimizer = optim.Adam(segm_model.parameters(), lr=0.0001)
#criterion = nn.BCEWithLogitsLoss().cuda() if use_cuda else nn.BCEWithLogitsLoss()
criterion = nn.BCEWithLogitsLoss()
criterion = DataParallelCriterion(criterion)
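
Editor's note: with the pairing above, a training step needs no manual gather: DataParallelModel returns one output chunk per GPU, and DataParallelCriterion scatters the targets, computes the loss on each device, and reduces it. A sketch of one step; `train_loader` is an assumed name, while `segm_model`, `criterion`, `optimizer` and `device` come from the snippet above:

for imgs, masks in train_loader:      # hypothetical segmentation loader
    imgs, masks = imgs.to(device), masks.to(device)
    optimizer.zero_grad()
    outputs = segm_model(imgs)        # list of per-GPU output chunks
    loss = criterion(outputs, masks)  # targets scattered, losses reduced
    loss.backward()
    optimizer.step()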
Example #14
    def __init__(self,
                 model,
                 vocab_size,
                 train_dataloader,
                 test_dataloader=None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10,
                 include_next=False,
                 include_vision=True,
                 total_epochs=1):
        """
        :param model: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        n_gpu = torch.cuda.device_count()
        print("device", device, "n_gpu", n_gpu)

        # Initialize the BERT Language Model, with BERT model
        self.model = model.to(self.device)
        self.bert = self.model.bert
        self.padding_idx = 0
        self.include_next = include_next
        self.include_vision = include_vision

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            #self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
            self.model = DataParallelModel(self.model,
                                           device_ids=range(
                                               torch.cuda.device_count()))

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = optim.Adamax(self.model.parameters(),
                                  lr=lr,
                                  betas=betas,
                                  weight_decay=weight_decay)
        if self.model.__class__.__name__ in [
                'DataParallel', 'DataParallelModel'
        ]:
            self.optim_schedule = ScheduledOptim(
                self.optim,
                self.model.module.bert.transformer_hidden_size,
                n_warmup_steps=warmup_steps)
        else:
            self.optim_schedule = ScheduledOptim(
                self.optim,
                self.model.bert.transformer_hidden_size,
                n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            #self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
            self.criterion = DataParallelCriterion(
                self.criterion, device_ids=range(torch.cuda.device_count()))

        self.log_freq = log_freq
        self.total_iters = total_epochs * len(train_dataloader)

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))
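
Editor's note: ScheduledOptim itself is not shown in this snippet; trainers with this layout commonly apply the inverse-square-root ("Noam") warmup from the Transformer paper, scaled by the hidden size passed in above. A sketch under that assumption:

def noam_lr(step: int, hidden_size: int, n_warmup_steps: int) -> float:
    # grows linearly for n_warmup_steps, then decays as step ** -0.5
    step = max(1, step)
    return hidden_size ** -0.5 * min(step ** -0.5,
                                     step * n_warmup_steps ** -1.5)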
Example #15
class BERTTrainer:
    """
    BERTTrainer make the pretrained BERT model with two LM training method.
        1. Masked Language Model : 3.3.1 Task #1: Masked LM
        2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction
    """
    def __init__(self,
                 model,
                 vocab_size,
                 train_dataloader,
                 test_dataloader=None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10,
                 include_next=False,
                 include_vision=True,
                 total_epochs=1):
        """
        :param model: BERT model which you want to train
        :param vocab_size: total word vocab size
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adam optimizer betas
        :param weight_decay: Adam optimizer weight decay param
        :param with_cuda: training with cuda
        :param log_freq: logging frequency of the batch iteration
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        n_gpu = torch.cuda.device_count()
        print("device", device, "n_gpu", n_gpu)

        # Initialize the BERT Language Model, with BERT model
        self.model = model.to(self.device)
        self.bert = self.model.bert
        self.padding_idx = 0
        self.include_next = include_next
        self.include_vision = include_vision

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            #self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
            self.model = DataParallelModel(self.model,
                                           device_ids=range(
                                               torch.cuda.device_count()))

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = optim.Adamax(self.model.parameters(),
                                  lr=lr,
                                  betas=betas,
                                  weight_decay=weight_decay)
        if self.model.__class__.__name__ in [
                'DataParallel', 'DataParallelModel'
        ]:
            self.optim_schedule = ScheduledOptim(
                self.optim,
                self.model.module.bert.transformer_hidden_size,
                n_warmup_steps=warmup_steps)
        else:
            self.optim_schedule = ScheduledOptim(
                self.optim,
                self.model.bert.transformer_hidden_size,
                n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)
        if with_cuda and torch.cuda.device_count() > 1:
            print("Using %d GPUS for BERT" % torch.cuda.device_count())
            #self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
            self.criterion = DataParallelCriterion(
                self.criterion, device_ids=range(torch.cuda.device_count()))

        self.log_freq = log_freq
        self.total_iters = total_epochs * len(train_dataloader)

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """
        loop over the data_loader for training or testing
        if on train status, backward operation is activated
        and also auto save the model every epoch
        :param epoch: current epoch index
        :param data_loader: torch.utils.data.DataLoader for iteration
        :param train: boolean value of is train or test
        :return: None
        """
        str_code = "train" if train else "test"

        # Setting the tqdm progress bar
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}",
                              disable=True)

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        for i, data in data_iter:
            # 0. prepare the text sequence tensor
            #data = {key: value.to(self.device) for key, value in data.items()}

            seq_tensor = data['masked_text_seq']
            labels = data['masked_text_label']

            # first padding position per row gives the sequence length
            # (torch ops throughout, so the descending sort below works)
            seq_lengths = (seq_tensor == self.padding_idx).int().argmax(dim=1)
            seq_lengths[seq_lengths == 0] = seq_tensor.shape[1]  # Full length

            # Sort sequences by lengths
            seq_lengths, perm_idx = seq_lengths.sort(0, True)
            sorted_tensor = seq_tensor[perm_idx]
            mask = (sorted_tensor == self.padding_idx)[:, :seq_lengths[0]]

            f_t_all = data['feature_all']
            isnext = data["isnext"]

            f_t_all = f_t_all[perm_idx]
            isnext = isnext[perm_idx]
            labels = labels[perm_idx]

            # 1. forward the next_sentence_prediction and masked_lm model

            if self.include_vision:
                #next_sent_output, mask_lm_output = self.model.forward(sorted_tensor.cuda(), mask.cuda(),seq_lengths.cuda(),f_t_all.cuda())
                output = self.model.forward(sorted_tensor.cuda(), mask.cuda(),
                                            seq_lengths.cuda(), f_t_all.cuda())
                length_output = len(output)
                print("You got %d outputs" % (length_output))
                next_sent_output, mask_lm_output = zip(*output)
                print("vision test shape is %d " % (next_sent_output[1].shape))
                print("lm test shape is %d " % (mask_lm_output[1].shape))
            else:
                #next_sent_output, mask_lm_output = self.model.forward(sorted_tensor.cuda(), mask.cuda(),seq_lengths.cuda(),None)
                output = self.model.forward(sorted_tensor.cuda(), mask.cuda(),
                                            seq_lengths.cuda(), None)
                length_output = len(output)
                print("You got %d outputs" % (length_output))
                next_sent_output, mask_lm_output = zip(*output)

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            next_loss = 0
            if self.include_vision and self.include_next:
                next_loss = self.criterion(next_sent_output, isnext.cuda())

            # 2-2. NLLLoss of predicting masked token word
            # (after zip(*output), mask_lm_output is a tuple of per-GPU
            # chunks, so transpose each chunk before handing the list to the
            # parallel criterion)
            mask_loss = self.criterion(
                [out.transpose(1, 2) for out in mask_lm_output],
                labels[:, :seq_lengths[0]].cuda())

            # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
            loss = next_loss + mask_loss

            # 3. backward and optimization only in train
            loss = loss.mean()

            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next vision prediction accuracy
            if self.include_next:
                # next_sent_output is a tuple of per-GPU chunks here, so
                # gather the predictions on CPU before comparing
                preds = torch.cat(
                    [o.argmax(dim=-1).cpu() for o in next_sent_output])
                total_correct += preds.eq(isnext).sum().item()
                total_element += data["isnext"].nelement()
            avg_loss += loss.item()

            if self.include_next:
                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "avg_loss": avg_loss / (i + 1),
                    "avg_acc": total_correct / total_element * 100,
                    "loss": loss.item()
                }
            else:
                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "avg_loss": avg_loss / (i + 1),
                    "loss": loss.item()
                }

            #if i % self.log_freq == 0:
            #    data_iter.write(str(post_fix))

            if i % 100 == 0:
                #print("PROGRESS: {}%".format(round((myidx) * 100 / n_iters, 4)))
                print("\n")
                print("PROGRESS: {}%".format(
                    round((epoch * len(data_loader) + i) * 100 /
                          self.total_iters, 4)))
                print("EVALERR: {}%".format(avg_loss / (i + 1)))

        #print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter))

    def save(self, epoch, file_path="pretrained_models/addbert_trained.model"):
        """
        Saving the current BERT model on file_path
        :param epoch: current epoch number
        :param file_path: model output path; the final file will be file_path + ".ep%d" % epoch
        :return: final_output_path
        """
        output_path = file_path + ".ep%d" % epoch
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
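
Editor's note: the NLLLoss(ignore_index=0) used by this trainer is what lets the masked-LM loss skip padding: any target equal to the padding id contributes neither loss nor gradient. A toy illustration (shapes and values are made up):

import torch
import torch.nn as nn

log_probs = torch.log_softmax(torch.randn(1, 5, 3), dim=1)  # (N, vocab, seq)
labels = torch.tensor([[2, 0, 4]])  # middle position is padding (id 0)
loss = nn.NLLLoss(ignore_index=0)(log_probs, labels)
# the loss averages over the two non-padding positions only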
Example #16
for (key, value) in vars(args).items():
    print("{0:16} | {1}".format(key, value))

# check if processed data file exists or not

data_mean = [0.485, 0.456, 0.406]
data_std = [0.229, 0.224, 0.225]

# load the model
model = BiSalNet()
model.eval()

if args.onGPU and torch.cuda.device_count() > 1:
    # model = torch.nn.DataParallel(model)
    model = DataParallelModel(model)
if args.onGPU:
    model = model.cuda()

# compose the data with transforms
val_transforms = transforms.Compose([
    transforms.Resize((args.inHeight, args.inWidth)),
    transforms.ToTensor(),
    transforms.Normalize(data_mean, data_std)
])

if os.path.isfile(join(args.savedir, "checkpoint.pth")):
    print("=> loading checkpoint '{}'".format(
        join(args.savedir, "checkpoint.pth")))
    checkpoint = torch.load(join(args.savedir, "checkpoint.pth"))["state_dict"]
    if list(checkpoint.keys())[0][:7] == "module." and not isinstance(
class ImageMTTrainer:
    def __init__(self,
                 model,
                 mask_prob: float = 0.3,
                 clip: int = 1,
                 optimizer=None,
                 beam_width: int = 5,
                 max_len_a: float = 1.1,
                 max_len_b: int = 5,
                 len_penalty_ratio: float = 0.8,
                 nll_loss: bool = False,
                 fp16: bool = False,
                 mm_mode="mixed",
                 rank: int = -1):
        self.model = model

        self.clip = clip
        self.optimizer = optimizer

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_gpu = torch.cuda.device_count()

        self.mask_prob = mask_prob
        if nll_loss:
            self.criterion = nn.NLLLoss(
                ignore_index=model.text_processor.pad_token_id())
        else:
            self.criterion = SmoothedNLLLoss(
                ignore_index=model.text_processor.pad_token_id())

        self.fp16 = False
        self.rank = rank
        if rank >= 0:
            self.device = torch.device('cuda', rank)
            torch.cuda.set_device(self.device)

        self.model = self.model.to(self.device)

        if fp16:
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level="O2")
            self.fp16 = True

        self.generator = BeamDecoder(self.model,
                                     beam_width=beam_width,
                                     max_len_a=max_len_a,
                                     max_len_b=max_len_b,
                                     len_penalty_ratio=len_penalty_ratio)
        if rank >= 0:
            self.model = DistributedDataParallel(self.model,
                                                 device_ids=[self.rank],
                                                 output_device=self.rank,
                                                 find_unused_parameters=True)
            self.generator = DistributedDataParallel(
                self.generator,
                device_ids=[self.rank],
                output_device=self.rank,
                find_unused_parameters=True)
        elif self.num_gpu > 1:
            print("Let's use", self.num_gpu, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)
            self.generator = DataParallelModel(self.generator)

        self.reference = None
        self.best_bleu = -1.0
        self.mm_mode = mm_mode
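
    # train_epoch (below) interleaves three batch types -- image-caption
    # batches, MASS monolingual batches, and parallel MT batches -- and, when
    # fine_tune is set, first generates back-translations with self.generator
    # (under no_grad) and then trains on reconstructing the source from them.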

    def train_epoch(self,
                    img_data_iter: List[data_utils.DataLoader],
                    step: int,
                    saving_path: str = None,
                    mass_data_iter: List[data_utils.DataLoader] = None,
                    mt_dev_iter: List[data_utils.DataLoader] = None,
                    mt_train_iter: List[data_utils.DataLoader] = None,
                    max_step: int = 300000,
                    accum=1,
                    beam_width=1,
                    fine_tune: bool = False,
                    lang_directions: dict = False,
                    lex_dict=None,
                    save_opt: bool = False,
                    **kwargs):
        "Standard Training and Logging Function"
        start = time.time()
        total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0
        batch_zip, shortest = self.get_batch_zip(img_data_iter, mass_data_iter,
                                                 mt_train_iter)

        model = (self.model.module
                 if hasattr(self.model, "module") else self.model)
        self.optimizer.zero_grad()
        for i, batches in enumerate(batch_zip):
            for batch in batches:
                is_img_batch = isinstance(batch,
                                          list) and "captions" in batch[0]
                is_mass_batch = not is_img_batch and "dst_texts" not in batch
                is_contrastive = False
                try:
                    if fine_tune and (is_img_batch or is_mass_batch):
                        id2lid = lambda r: model.text_processor.languages[
                            model.text_processor.id2token(lang_directions[int(
                                r)])]
                        if is_mass_batch:
                            src_inputs = batch["src_texts"].squeeze(0)
                            src_pad_mask = src_inputs != model.text_processor.pad_token_id(
                            )
                            pad_indices = batch["pad_idx"].squeeze(0)
                            proposal = batch["proposal"].squeeze(
                                0) if lex_dict is not None else None
                            target_langs = torch.LongTensor([
                                lang_directions[int(l)]
                                for l in src_inputs[:, 0]
                            ])
                            dst_langs = torch.LongTensor(
                                [id2lid(l) for l in src_inputs[:, 0]])
                        else:
                            src_inputs = [b["captions"] for b in batch]
                            src_pad_mask = [b["caption_mask"] for b in batch]
                            pad_indices = [b["pad_idx"] for b in batch]
                            proposal = [
                                b["proposal"] if lex_dict is not None else None
                                for b in batch
                            ]
                            target_langs = [
                                torch.LongTensor([
                                    lang_directions[int(l)] for l in src[:, 0]
                                ]) for src in src_inputs
                            ]
                            dst_langs = [
                                torch.LongTensor(
                                    [id2lid(l) for l in src[:, 0]])
                                for src in src_inputs
                            ]
                        if len(src_inputs) < self.num_gpu:
                            continue

                        if is_mass_batch:
                            langs = batch["langs"].squeeze(0)
                        else:
                            langs = [b["langs"] for b in batch]

                        model.eval()
                        with torch.no_grad():
                            # We do not backpropagate the data generator following the MASS paper.
                            images = None
                            if is_img_batch:
                                images = [b["images"] for b in batch]
                            outputs = self.generator(
                                src_inputs=src_inputs,
                                src_sizes=pad_indices,
                                first_tokens=target_langs,
                                src_langs=langs,
                                tgt_langs=dst_langs,
                                pad_idx=model.text_processor.pad_token_id(),
                                src_mask=src_pad_mask,
                                unpad_output=False,
                                beam_width=beam_width,
                                images=images,
                                proposals=proposal)
                            if self.num_gpu > 1 and self.rank < 0:
                                if is_mass_batch:
                                    new_outputs = []
                                    for output in outputs:
                                        new_outputs += output
                                    outputs = new_outputs

                            if is_mass_batch or self.num_gpu <= 1:
                                translations = pad_sequence(
                                    outputs,
                                    batch_first=True,
                                    padding_value=model.text_processor.
                                    pad_token_id())
                                translation_proposals = None
                                if lex_dict is not None:
                                    translation_proposals = list(
                                        map(
                                            lambda o: dataset.
                                            get_lex_suggestions(
                                                lex_dict, o,
                                                model.text_processor.
                                                pad_token_id()), outputs))
                                    translation_proposals = pad_sequence(
                                        translation_proposals,
                                        batch_first=True,
                                        padding_value=model.text_processor.
                                        pad_token_id())
                                translation_pad_mask = (
                                    translations !=
                                    model.text_processor.pad_token_id())
                            else:
                                translation_proposals = None
                                if lex_dict is not None:
                                    translation_proposals = [
                                        pad_sequence(
                                            list(
                                                map(
                                                    lambda o: dataset.
                                                    get_lex_suggestions(
                                                        lex_dict, o,
                                                        model.text_processor.
                                                        pad_token_id()),
                                                    output)),
                                            batch_first=True,
                                            padding_value=model.text_processor.
                                            pad_token_id())
                                        for output in outputs
                                    ]

                                translations = [
                                    pad_sequence(output,
                                                 batch_first=True,
                                                 padding_value=model.
                                                 text_processor.pad_token_id())
                                    for output in outputs
                                ]
                                translation_pad_mask = [
                                    t != model.text_processor.pad_token_id()
                                    for t in translations
                                ]
                        model.train()

                        if is_mass_batch:
                            langs = batch["langs"].squeeze(0)
                        else:
                            langs = torch.cat([b["langs"] for b in batch])
                        # Now use it for back-translation loss.
                        predictions = model(
                            src_inputs=translations,
                            tgt_inputs=src_inputs,
                            src_pads=translation_pad_mask,
                            pad_idx=model.text_processor.pad_token_id(),
                            src_langs=dst_langs,
                            tgt_langs=langs,
                            proposals=translation_proposals,
                            log_softmax=True)
                        if is_mass_batch:
                            src_targets = src_inputs[:,
                                                     1:].contiguous().view(-1)
                            src_mask_flat = src_pad_mask[:,
                                                         1:].contiguous().view(
                                                             -1)
                        else:
                            src_targets = torch.cat(
                                list(map(lambda s: s[:, 1:], src_inputs)))
                            src_mask_flat = torch.cat(
                                list(map(lambda s: s[:, 1:], src_pad_mask)))
                        targets = src_targets[src_mask_flat]

                        ntokens = targets.size(0)
                    elif is_img_batch:
                        src_inputs = [b["captions"] for b in batch]
                        src_pad_mask = [b["caption_mask"] for b in batch]
                        proposals = [b["proposal"] for b in batch
                                     ] if lex_dict is not None else None
                        langs = [b["langs"] for b in batch]
                        if (self.mm_mode == "mixed" and random.random() <= .5
                            ) or self.mm_mode == "masked":
                            pad_indices = [b["pad_idx"] for b in batch]
                            if len(batch) < self.num_gpu:
                                continue

                            # For image masking, we are allowed to mask more than mask_prob
                            mask_prob = random.uniform(self.mask_prob, 1.0)

                            masked_info = list(
                                map(
                                    lambda pi, si: mass_mask(
                                        mask_prob, pi, si, model.text_processor
                                    ), pad_indices, src_inputs))
                            predictions = self.model(
                                src_inputs=list(
                                    map(lambda m: m["src_text"], masked_info)),
                                tgt_inputs=list(
                                    map(lambda m: m["to_recover"],
                                        masked_info)),
                                tgt_positions=list(
                                    map(lambda m: m["positions"],
                                        masked_info)),
                                src_pads=src_pad_mask,
                                pad_idx=model.text_processor.pad_token_id(),
                                src_langs=langs,
                                batch=batch,
                                proposals=proposals,
                                log_softmax=True)
                            targets = torch.cat(
                                list(map(lambda m: m["targets"], masked_info)))
                            ntokens = targets.size(0)
                        else:
                            neg_samples = [b["neg"] for b in batch]
                            neg_mask = [b["neg_mask"] for b in batch]
                            loss = self.model(
                                src_inputs=src_inputs,
                                src_pads=src_pad_mask,
                                neg_samples=neg_samples,
                                neg_mask=neg_mask,
                                pad_idx=model.text_processor.pad_token_id(),
                                src_langs=langs,
                                batch=batch,
                                proposals=proposals,
                                log_softmax=True)
                            is_contrastive = True

                    elif not is_mass_batch:  # MT data
                        src_inputs = batch["src_texts"].squeeze(0)
                        src_mask = batch["src_pad_mask"].squeeze(0)
                        tgt_inputs = batch["dst_texts"].squeeze(0)
                        tgt_mask = batch["dst_pad_mask"].squeeze(0)
                        src_langs = batch["src_langs"].squeeze(0)
                        dst_langs = batch["dst_langs"].squeeze(0)
                        proposals = batch["proposal"].squeeze(
                            0) if lex_dict is not None else None
                        if src_inputs.size(0) < self.num_gpu:
                            continue
                        predictions = self.model(
                            src_inputs=src_inputs,
                            tgt_inputs=tgt_inputs,
                            src_pads=src_mask,
                            tgt_mask=tgt_mask,
                            src_langs=src_langs,
                            tgt_langs=dst_langs,
                            proposals=proposals,
                            pad_idx=model.text_processor.pad_token_id(),
                            log_softmax=True)
                        targets = tgt_inputs[:, 1:].contiguous().view(-1)
                        tgt_mask_flat = tgt_mask[:, 1:].contiguous().view(-1)
                        targets = targets[tgt_mask_flat]
                        ntokens = targets.size(0)
                    else:  # MASS data
                        src_inputs = batch["src_texts"].squeeze(0)
                        pad_indices = batch["pad_idx"].squeeze(0)
                        proposals = batch["proposal"].squeeze(
                            0) if lex_dict is not None else None
                        if src_inputs.size(0) < self.num_gpu:
                            continue

                        masked_info = mass_mask(self.mask_prob, pad_indices,
                                                src_inputs,
                                                model.text_processor)
                        predictions = self.model(
                            src_inputs=masked_info["src_text"],
                            tgt_inputs=masked_info["to_recover"],
                            tgt_positions=masked_info["positions"],
                            pad_idx=model.text_processor.pad_token_id(),
                            src_langs=batch["langs"].squeeze(0),
                            proposals=proposals,
                            log_softmax=True)
                        targets = masked_info["targets"]
                        ntokens = targets.size(0)

                    if is_contrastive:  # Nothing to predict!
                        backward(loss, self.optimizer, self.fp16)
                        loss = loss.data
                    elif ntokens > 0:
                        if self.num_gpu == 1:
                            targets = targets.to(predictions.device)
                        if self.rank >= 0: targets = targets.to(self.device)

                        loss = self.criterion(predictions, targets).mean()
                        backward(loss, self.optimizer, self.fp16)

                        loss = float(loss.data) * ntokens
                        tokens += ntokens
                        total_tokens += ntokens
                    total_loss += loss
                    cur_loss += loss

                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.clip)
                    step += 1
                    if step % accum == 0:
                        self.optimizer.step()
                        self.optimizer.zero_grad()

                    if is_mass_batch and not fine_tune:
                        mass_unmask(masked_info["src_text"],
                                    masked_info["src_mask"],
                                    masked_info["mask_idx"])
                    if not is_contrastive and is_img_batch and not fine_tune:
                        # map() is lazy in Python 3; iterate explicitly so
                        # the unmasking actually runs
                        for m in masked_info:
                            mass_unmask(m["src_text"], m["src_mask"],
                                        m["mask_idx"])

                    if step % 50 == 0 and tokens > 0:
                        elapsed = time.time() - start
                        print(
                            self.rank, "->", datetime.datetime.now(),
                            "Epoch Step: %d Loss: %f Tokens per Sec: %f " %
                            (step, cur_loss / tokens, tokens / elapsed))

                        if mt_dev_iter is not None and step % 5000 == 0 and self.rank <= 0:
                            bleu = self.eval_bleu(mt_dev_iter, saving_path)
                            print("BLEU:", bleu)

                        if step % 10000 == 0:
                            if self.rank <= 0:
                                if self.rank < 0:
                                    model.cpu().save(saving_path + ".latest")
                                elif self.rank == 0:
                                    model.save(saving_path + ".latest")

                                if save_opt:
                                    with open(
                                            os.path.join(
                                                saving_path + ".latest",
                                                "optim"), "wb") as fp:
                                        pickle.dump(self.optimizer, fp)
                                if self.rank < 0:
                                    model = model.to(self.device)

                        start, tokens, cur_loss = time.time(), 0, 0

                except RuntimeError as err:
                    print(repr(err))
                    print("Error processing", is_img_batch)
                    if (isinstance(model, ImageMassSeq2Seq)) and is_img_batch:
                        for b in batch:
                            print("->", len(b["images"]), b["captions"].size())
                    torch.cuda.empty_cache()

            if i == shortest - 1:
                break
            if step >= max_step:
                break

        try:
            if self.rank <= 0:
                print("Total loss in this epoch: %f" %
                      (total_loss / total_tokens))
                if self.rank < 0:
                    model.cpu().save(saving_path + ".latest")
                    model = model.to(self.device)
                elif self.rank == 0:
                    model.save(saving_path + ".latest")

                if mt_dev_iter is not None:
                    bleu = self.eval_bleu(mt_dev_iter, saving_path)
                    print("BLEU:", bleu)
        except RuntimeError as err:
            print(repr(err))

        return step
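
    # get_batch_zip (below) round-robins over whichever of the three iterator
    # lists are present; `shortest` records the smallest loader length so
    # train_epoch can end the epoch once the shortest source is exhausted.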

    def get_batch_zip(self, img_data_iter, mass_data_iter, mt_train_iter):
        # if img_data_iter is not None and mt_train_iter is not None:
        #     img_data_iter *= 5
        # if mass_data_iter is not None and mt_train_iter is not None:
        #     mass_data_iter *= 5
        iters = list(
            chain(*filter(lambda x: x is not None,
                          [img_data_iter, mass_data_iter, mt_train_iter])))
        shortest = min(len(l) for l in iters)
        return zip(*iters), shortest
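
    # eval_bleu (below) decodes the dev set with the beam generator, scores it
    # with sacrebleu against self.reference, and checkpoints the model whenever
    # the score improves on self.best_bleu.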

    def eval_bleu(self, dev_data_iter, saving_path, save_opt: bool = False):
        mt_output = []
        src_text = []
        model = (self.model.module
                 if hasattr(self.model, "module") else self.model)
        model.eval()

        with torch.no_grad():
            for loader in dev_data_iter:
                for batch in loader:
                    src_inputs = batch["src_texts"].squeeze(0)
                    src_mask = batch["src_pad_mask"].squeeze(0)
                    tgt_inputs = batch["dst_texts"].squeeze(0)
                    src_langs = batch["src_langs"].squeeze(0)
                    dst_langs = batch["dst_langs"].squeeze(0)
                    src_pad_idx = batch["pad_idx"].squeeze(0)
                    proposal = batch["proposal"].squeeze(
                        0) if batch["proposal"] is not None else None

                    src_ids = get_outputs_until_eos(
                        model.text_processor.sep_token_id(),
                        src_inputs,
                        remove_first_token=True)
                    src_text += list(
                        map(
                            lambda src: model.text_processor.tokenizer.decode(
                                src.numpy()), src_ids))

                    outputs = self.generator(
                        src_inputs=src_inputs,
                        src_sizes=src_pad_idx,
                        first_tokens=tgt_inputs[:, 0],
                        src_mask=src_mask,
                        src_langs=src_langs,
                        tgt_langs=dst_langs,
                        pad_idx=model.text_processor.pad_token_id(),
                        proposals=proposal)
                    if self.num_gpu > 1 and self.rank < 0:
                        new_outputs = []
                        for output in outputs:
                            new_outputs += output
                        outputs = new_outputs

                    mt_output += list(
                        map(
                            lambda x: model.text_processor.tokenizer.decode(x[
                                1:].numpy()), outputs))

            model.train()
        bleu = sacrebleu.corpus_bleu(mt_output,
                                     [self.reference[:len(mt_output)]],
                                     lowercase=True,
                                     tokenize="intl")

        with open(os.path.join(saving_path, "bleu.output"), "w") as writer:
            writer.write("\n".join([
                src + "\n" + ref + "\n" + o + "\n\n***************\n"
                for src, ref, o in zip(src_text, mt_output,
                                       self.reference[:len(mt_output)])
            ]))

        if bleu.score > self.best_bleu:
            self.best_bleu = bleu.score
            print("Saving best BLEU", self.best_bleu)
            with open(os.path.join(saving_path, "bleu.best.output"),
                      "w") as writer:
                writer.write("\n".join([
                    src + "\n" + ref + "\n" + o + "\n\n***************\n"
                    for src, ref, o in zip(src_text, mt_output,
                                           self.reference[:len(mt_output)])
                ]))
            if self.rank < 0:
                model.cpu().save(saving_path)
                model = model.to(self.device)
            elif self.rank == 0:
                model.save(saving_path)

            if save_opt:
                with open(os.path.join(saving_path, "optim"), "wb") as fp:
                    pickle.dump(self.optimizer, fp)

        return bleu.score

    @staticmethod
    def train(options):
        lex_dict = None
        if options.dict_path is not None:
            lex_dict = get_lex_dict(options.dict_path)
        if options.local_rank <= 0 and not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        text_processor = TextProcessor(options.tokenizer_path)
        assert text_processor.pad_token_id() == 0
        num_processors = max(torch.cuda.device_count(),
                             1) if options.local_rank < 0 else 1

        if options.pretrained_path is not None:
            mt_model = Seq2Seq.load(ImageMassSeq2Seq,
                                    options.pretrained_path,
                                    tok_dir=options.tokenizer_path)
        else:
            mt_model = ImageMassSeq2Seq(
                use_proposals=lex_dict is not None,
                tie_embed=options.tie_embed,
                text_processor=text_processor,
                resnet_depth=options.resnet_depth,
                lang_dec=options.lang_decoder,
                enc_layer=options.encoder_layer,
                dec_layer=options.decoder_layer,
                embed_dim=options.embed_dim,
                intermediate_dim=options.intermediate_layer_dim)

        if options.lm_path is not None:
            lm = LM(text_processor=text_processor,
                    enc_layer=options.encoder_layer,
                    embed_dim=options.embed_dim,
                    intermediate_dim=options.intermediate_layer_dim)
            mt_model.init_from_lm(lm)

        print("Model initialization done!")

        # We assume that the collator function returns a list with the size of number of gpus (in case of cpus, a list of size one).
        collator = dataset.ImageTextCollator()
        num_batches = max(1, torch.cuda.device_count())

        if options.continue_train:
            with open(os.path.join(options.pretrained_path, "optim"),
                      "rb") as fp:
                optimizer = pickle.load(fp)
        else:
            optimizer = build_optimizer(mt_model,
                                        options.learning_rate,
                                        warump_steps=options.warmup)
        trainer = ImageMTTrainer(model=mt_model,
                                 mask_prob=options.mask_prob,
                                 optimizer=optimizer,
                                 clip=options.clip,
                                 beam_width=options.beam_width,
                                 max_len_a=options.max_len_a,
                                 max_len_b=options.max_len_b,
                                 len_penalty_ratio=options.len_penalty_ratio,
                                 fp16=options.fp16,
                                 mm_mode=options.mm_mode,
                                 rank=options.local_rank)

        pin_memory = torch.cuda.is_available()
        img_train_loader = ImageMTTrainer.get_img_loader(
            collator,
            dataset.ImageCaptionDataset,
            options.train_path,
            mt_model,
            num_batches,
            options,
            pin_memory,
            lex_dict=lex_dict)

        mass_train_data, mass_train_loader, finetune_loader, mt_dev_loader = None, None, None, None
        if options.mass_train_path is not None:
            mass_train_paths = options.mass_train_path.strip().split(",")
            if options.step > 0:
                mass_train_data, mass_train_loader = ImageMTTrainer.get_mass_loader(
                    mass_train_paths,
                    mt_model,
                    num_processors,
                    options,
                    pin_memory,
                    keep_examples=options.finetune_step > 0,
                    lex_dict=lex_dict)

            if options.finetune_step > 0:
                finetune_loader, finetune_data = ImageMTTrainer.get_mass_finetune_data(
                    mass_train_data,
                    mass_train_paths,
                    mt_model,
                    num_processors,
                    options,
                    pin_memory,
                    lex_dict=lex_dict)

        mt_train_loader = None
        if options.mt_train_path is not None:
            mt_train_loader = ImageMTTrainer.get_mt_train_data(
                mt_model,
                num_processors,
                options,
                pin_memory,
                lex_dict=lex_dict)

        mt_dev_loader = None
        if options.mt_dev_path is not None:
            mt_dev_loader = ImageMTTrainer.get_mt_dev_data(mt_model,
                                                           options,
                                                           pin_memory,
                                                           text_processor,
                                                           trainer,
                                                           lex_dict=lex_dict)

        step, train_epoch = 0, 1
        while options.step > 0 and step < options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(img_data_iter=img_train_loader,
                                       mass_data_iter=mass_train_loader,
                                       mt_train_iter=mt_train_loader,
                                       max_step=options.step,
                                       lex_dict=lex_dict,
                                       mt_dev_iter=mt_dev_loader,
                                       saving_path=options.model_path,
                                       step=step,
                                       save_opt=options.save_opt,
                                       accum=options.accum)
            train_epoch += 1

        finetune_epoch = 0
        # Resetting the optimizer for the purpose of finetuning.
        trainer.optimizer.reset()

        lang_directions = ImageMTTrainer.get_lang_dirs(options.bt_langs,
                                                       text_processor)
        print(options.local_rank, "lang dirs", lang_directions)

        print(options.local_rank,
              "Reloading image train data with new batch size...")

        if options.finetune_step > 0 and img_train_loader is not None:
            img_train_loader = ImageMTTrainer.get_img_loader(
                collator,
                dataset.ImageCaptionDataset,
                options.train_path,
                mt_model,
                num_batches,
                options,
                pin_memory,
                denom=2,
                lex_dict=lex_dict)
        if options.ignore_mt_mass:
            mt_train_loader = None
        print(options.local_rank,
              "Reloading image train data with new batch size done!")

        while options.finetune_step > 0 and step <= options.finetune_step + options.step:
            print(options.local_rank, "finetune epoch", finetune_epoch)
            step = trainer.train_epoch(img_data_iter=img_train_loader,
                                       mass_data_iter=finetune_loader,
                                       mt_train_iter=mt_train_loader,
                                       max_step=options.finetune_step +
                                       options.step,
                                       mt_dev_iter=mt_dev_loader,
                                       saving_path=options.model_path,
                                       step=step,
                                       fine_tune=True,
                                       lang_directions=lang_directions,
                                       lex_dict=lex_dict,
                                       save_opt=options.save_opt,
                                       accum=options.accum,
                                       beam_width=options.bt_beam_width)
            finetune_epoch += 1

    @staticmethod
    def get_lang_dirs(bt_langs, text_processor: TextProcessor):
        langs = ["<" + l + ">" for l in bt_langs.strip().split(",")]
        langs = set([text_processor.token_id(l) for l in langs])
        if len(langs) < 2:
            return None
        assert len(langs) <= 2
        lang_directions = {}
        for lang1 in langs:
            for lang2 in langs:
                if lang1 != lang2:
                    # Assuming that we only have two languages!
                    lang_directions[lang1] = lang2
        return lang_directions

    @staticmethod
    def get_mt_dev_data(mt_model,
                        options,
                        pin_memory,
                        text_processor,
                        trainer,
                        lex_dict=None):
        mt_dev_loader = []
        dev_paths = options.mt_dev_path.split(",")
        trainer.reference = []
        for dev_path in dev_paths:
            mt_dev_data = dataset.MTDataset(
                batch_pickle_dir=dev_path,
                max_batch_capacity=options.total_capacity,
                keep_pad_idx=True,
                max_batch=int(options.batch / (options.beam_width * 2)),
                pad_idx=mt_model.text_processor.pad_token_id(),
                lex_dict=lex_dict)
            dl = data_utils.DataLoader(mt_dev_data,
                                       batch_size=1,
                                       shuffle=False,
                                       pin_memory=pin_memory)
            mt_dev_loader.append(dl)

            print(options.local_rank, "creating reference")

            generator = (trainer.generator.module if hasattr(
                trainer.generator, "module") else trainer.generator)

            for batch in dl:
                tgt_inputs = batch["dst_texts"].squeeze()
                refs = get_outputs_until_eos(text_processor.sep_token_id(),
                                             tgt_inputs,
                                             remove_first_token=True)
                ref = [
                    generator.seq2seq_model.text_processor.tokenizer.decode(
                        ref.numpy()) for ref in refs
                ]
                trainer.reference += ref
        return mt_dev_loader

    @staticmethod
    def get_mt_train_data(mt_model,
                          num_processors,
                          options,
                          pin_memory,
                          lex_dict=None):
        mt_train_loader = []
        train_paths = options.mt_train_path.split(",")
        for train_path in train_paths:
            mt_train_data = dataset.MTDataset(
                batch_pickle_dir=train_path,
                max_batch_capacity=int(num_processors *
                                       options.total_capacity / 2),
                max_batch=int(num_processors * options.batch / 2),
                pad_idx=mt_model.text_processor.pad_token_id(),
                lex_dict=lex_dict,
                keep_pad_idx=False)
            mtl = data_utils.DataLoader(
                mt_train_data,
                sampler=None if options.local_rank < 0 else DistributedSampler(
                    mt_train_data, rank=options.local_rank),
                batch_size=1,
                shuffle=(options.local_rank < 0),
                pin_memory=pin_memory)
            mt_train_loader.append(mtl)
        return mt_train_loader

    @staticmethod
    def get_mass_finetune_data(mass_train_data,
                               mass_train_paths,
                               mt_model,
                               num_processors,
                               options,
                               pin_memory,
                               lex_dict=None):
        finetune_data, finetune_loader = [], []
        for i, mass_train_path in enumerate(mass_train_paths):
            fd = dataset.MassDataset(
                batch_pickle_dir=mass_train_path,
                max_batch_capacity=int(num_processors *
                                       options.total_capacity /
                                       max(2, options.bt_beam_width)),
                max_batch=int(num_processors * options.batch /
                              max(2, options.bt_beam_width)),
                pad_idx=mt_model.text_processor.pad_token_id(),
                max_seq_len=options.max_seq_len,
                keep_examples=False,
                example_list=None if mass_train_data is None else
                mass_train_data[i].examples_list,
                lex_dict=lex_dict)
            finetune_data.append(fd)
            fl = data_utils.DataLoader(
                fd,
                sampler=None if options.local_rank < 0 else DistributedSampler(
                    fd, rank=options.local_rank),
                batch_size=1,
                shuffle=(options.local_rank < 0),
                pin_memory=pin_memory)
            finetune_loader.append(fl)
            if mass_train_data is not None:
                mass_train_data[i].examples_list = []
        return finetune_loader, finetune_data

    @staticmethod
    def get_mass_loader(mass_train_paths,
                        mt_model,
                        num_processors,
                        options,
                        pin_memory,
                        keep_examples,
                        lex_dict=None):
        mass_train_data, mass_train_loader = [], []
        for i, mass_train_path in enumerate(mass_train_paths):
            td = dataset.MassDataset(
                batch_pickle_dir=mass_train_path,
                max_batch_capacity=num_processors * options.total_capacity,
                max_batch=num_processors * options.batch,
                pad_idx=mt_model.text_processor.pad_token_id(),
                max_seq_len=options.max_seq_len,
                keep_examples=keep_examples,
                lex_dict=lex_dict)
            mass_train_data.append(td)

            dl = data_utils.DataLoader(
                td,
                sampler=None if options.local_rank < 0 else DistributedSampler(
                    td, rank=options.local_rank),
                batch_size=1,
                shuffle=(options.local_rank < 0),
                pin_memory=pin_memory)
            mass_train_loader.append(dl)
        return mass_train_data, mass_train_loader

    @staticmethod
    def get_img_loader(collator,
                       dataset_class,
                       paths,
                       mt_model,
                       num_batches,
                       options,
                       pin_memory,
                       denom=1,
                       lex_dict=None,
                       shuffle=True):
        if paths is not None:
            img_loader = []
            for pth in paths.strip().split(","):
                data = dataset_class(
                    root_img_dir=options.image_dir,
                    data_bin_file=pth,
                    max_capacity=int(options.img_capacity / denom),
                    text_processor=mt_model.text_processor,
                    max_img_per_batch=options.max_image / denom,
                    lex_dict=lex_dict)
                print(options.local_rank, pth, "Length of training data",
                      len(data))
                tl = data_utils.DataLoader(
                    data,
                    sampler=None if options.local_rank < 0 else
                    DistributedSampler(data, rank=options.local_rank),
                    batch_size=num_batches,
                    shuffle=shuffle,
                    pin_memory=pin_memory,
                    collate_fn=collator)
                img_loader.append(tl)
            return img_loader

        return None
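
A minimal sketch of the gather step used in eval_bleu above: a model wrapped in DataParallelModel returns one list of outputs per GPU, and in the single-process multi-GPU case (rank < 0) those per-GPU lists must be flattened before decoding. The two-GPU shapes below are illustrative assumptions, not the project's real tensors.

import torch

def flatten_parallel_outputs(outputs, num_gpu, rank):
    # DataParallelModel yields [outputs_gpu0, outputs_gpu1, ...]; concatenate
    # them into one flat list, exactly as the eval_bleu loop does.
    if num_gpu > 1 and rank < 0:
        flat = []
        for per_gpu in outputs:
            flat += per_gpu
        return flat
    return outputs

fake = [[torch.tensor([1, 2]), torch.tensor([3])],      # "GPU 0"
        [torch.tensor([4, 5, 6]), torch.tensor([7])]]   # "GPU 1"
print(len(flatten_parallel_outputs(fake, num_gpu=2, rank=-1)))  # -> 4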
Exemple #18
0
class Trainer:
    """
    trainer class
    """
    def __init__(self, cfg: Namespace, data: Dataset):
        """
        Args:
            cfg:  configuration
            data:  train dataset
        """
        self.cfg = cfg
        self.train, self.valid = data.split(0.8)
        RATING_FIELD.build_vocab(self.train)

        self.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')  # pylint: disable=no-member
        self.batch_size = cfg.batch_size
        if torch.cuda.is_available():
            self.batch_size *= torch.cuda.device_count()

        self.trn_itr = BucketIterator(
            self.train,
            device=self.device,
            batch_size=self.batch_size,
            shuffle=True,
            train=True,
            sort_within_batch=True,
            sort_key=lambda exam: -len(exam.comment_text))
        self.vld_itr = BucketIterator(
            self.valid,
            device=self.device,
            batch_size=self.batch_size,
            shuffle=False,
            train=False,
            sort_within_batch=True,
            sort_key=lambda exam: -len(exam.comment_text))
        self.log_step = 1000
        if len(self.vld_itr) < 100:
            self.log_step = 10
        elif len(self.vld_itr) < 1000:
            self.log_step = 100

        bert_path = cfg.bert_path if cfg.bert_path else 'bert-base-cased'
        self.model = BertForSequenceClassification.from_pretrained(
            bert_path, num_labels=2)
        pos_weight = (
            len([exam for exam in self.train.examples if exam.target < 0.5]) /
            len([exam for exam in self.train.examples if exam.target >= 0.5]))
        pos_wgt_tensor = torch.tensor([1.0, pos_weight], device=self.device)  # pylint: disable=not-callable
        self.criterion = nn.CrossEntropyLoss(weight=pos_wgt_tensor)
        if torch.cuda.is_available():
            self.model = DataParallelModel(self.model.cuda())
            self.criterion = DataParallelCriterion(self.criterion)
        self.optimizer = optim.Adam(self.model.parameters(), cfg.learning_rate)

    def run(self):
        """
        do train
        """
        max_f_score = -9e10
        max_epoch = -1
        for epoch in range(self.cfg.epoch):
            train_loss = self._train_epoch(epoch)
            metrics = self._evaluate(epoch)
            max_f_score_str = f' < {max_f_score:.2f}'
            if metrics['f_score'] > max_f_score:
                max_f_score_str = ' is max'
                max_f_score = metrics['f_score']
                max_epoch = epoch
                torch.save(self.model.state_dict(), self.cfg.model_path)
            logging.info('EPOCH[%d]: train loss: %.6f, valid loss: %.6f, acc: %.2f,' \
                         ' F: %.2f%s', epoch, train_loss, metrics['loss'],
                         metrics['accuracy'], metrics['f_score'], max_f_score_str)
            if (epoch - max_epoch) >= self.cfg.patience:
                logging.info('early stopping...')
                break
        logging.info('epoch: %d, f-score: %.2f', max_epoch, max_f_score)

    def _train_epoch(self, epoch: int) -> float:
        """
        train single epoch
        Args:
            epoch:  epoch number
        Returns:
            average loss
        """
        self.model.train()
        progress = tqdm(self.trn_itr,
                        f'EPOCH[{epoch}]',
                        mininterval=1,
                        ncols=100)
        losses = []
        for step, batch in enumerate(progress, start=1):
            outputs = self.model(batch.comment_text)
            # output of model wrapped with DataParallelModel is a list of outputs from each GPU
            # make input of DataParallelCriterion as a list of tuples
            if isinstance(self.model, DataParallelModel):
                loss = self.criterion([(output, ) for output in outputs],
                                      batch.target)
            else:
                loss = self.criterion(outputs, batch.target)
            losses.append(loss.item())
            if step % self.log_step == 0:
                avg_loss = sum(losses) / len(losses)
                progress.set_description(f'EPOCH[{epoch}] ({avg_loss:.6f})')
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
        return sum(losses) / len(losses)

    def _evaluate(self, epoch: int) -> Dict[str, float]:
        """
        evaluate on validation data
        Args:
            epoch:  epoch number
        Returns:
            metrics
        """
        self.model.eval()
        progress = tqdm(self.vld_itr,
                        f' EVAL[{epoch}]',
                        mininterval=1,
                        ncols=100)
        losses = []
        preds = []
        golds = []
        for step, batch in enumerate(progress, start=1):
            with torch.no_grad():
                outputs = self.model(batch.comment_text)
                if isinstance(self.model, DataParallelModel):
                    loss = self.criterion([(output, ) for output in outputs],
                                          batch.target)
                    for output in outputs:
                        preds.extend([(1 if o[0] < o[1] else 0)
                                      for o in output])
                else:
                    loss = self.criterion(outputs, batch.target)
                    preds.extend([(1 if output[0] < output[1] else 0)
                                  for output in outputs])
                losses.append(loss.item())
                golds.extend([gold.item() for gold in batch.target])
                if step % self.log_step == 0:
                    avg_loss = sum(losses) / len(losses)
                    progress.set_description(
                        f' EVAL[{epoch}] ({avg_loss:.6f})')
        metrics = self._get_metrics(preds, golds)
        metrics['loss'] = sum(losses) / len(losses)
        return metrics

    @classmethod
    def _get_metrics(cls, preds: List[float],
                     golds: List[float]) -> Dict[str, float]:
        """
        get metric values
        Args:
            preds:  predictions
            golds:  gold standards
        Returns:
            metric
        """
        assert len(preds) == len(golds)
        true_pos = 0
        false_pos = 0
        false_neg = 0
        true_neg = 0
        for pred, gold in zip(preds, golds):
            if pred >= 0.5:
                if gold >= 0.5:
                    true_pos += 1
                else:
                    false_pos += 1
            else:
                if gold >= 0.5:
                    false_neg += 1
                else:
                    true_neg += 1
        accuracy = (true_pos + true_neg) / (true_pos + false_pos + false_neg +
                                            true_neg)
        precision = 0.0
        if (true_pos + false_pos) > 0:
            precision = true_pos / (true_pos + false_pos)
        recall = 0.0
        if (true_pos + false_neg) > 0:
            recall = true_pos / (true_pos + false_neg)
        f_score = 0.0
        if (precision + recall) > 0.0:
            f_score = 2.0 * precision * recall / (precision + recall)
        return {
            'accuracy': 100.0 * accuracy,
            'precision': 100.0 * precision,
            'recall': 100.0 * recall,
            'f_score': 100.0 * f_score,
        }
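
A hedged sketch of why _train_epoch above wraps each per-GPU output in a one-element tuple: DataParallelCriterion applies the underlying loss per device with positional arguments, so criterion([(o, ) for o in outputs], target) amounts to criterion(*(o,), target_shard) on each GPU. The CPU-only tensors below merely imitate that contract.

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

# Pretend per-GPU outputs of a DataParallelModel: a batch of 4 split 2/2.
outputs = [torch.randn(2, 2), torch.randn(2, 2)]
targets = [torch.tensor([0, 1]), torch.tensor([1, 0])]

# Per device: criterion(*(output,), target_shard); then average the shards.
losses = [criterion(*(out,), tgt) for out, tgt in zip(outputs, targets)]
print(float(torch.stack(losses).mean()))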
Exemple #19
0
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    data_dir = args.data_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    text_encoder.decoder[len(encoder)] = '_start_'
    encoder['_start_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_delimiter_'
    encoder['_delimiter_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_classify_'
    encoder['_classify_'] = len(encoder)

    n_special = 3  # unused for the language-modeling task
    vocab = n_vocab + n_special + n_ctx

    lm_model = LMModel(args,
                       vocab,
                       n_ctx,
                       return_probs=True,
                       doc_embed=args.doc_model)
    load_openai_pretrained_model(lm_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        lm_model.load_state_dict(state_dict)
    lm_model.to(device)
    lm_model = DataParallelModel(lm_model)

    train_bar = get_loader(os.path.join(data_dir, "val_encoded.jsonl"),
                           n_gpu,
                           encoder,
                           num_workers=1,
                           shuffle=True,
                           max_size=args.n_iter)
    srcs, hyps, refs = [], [], []
    with torch.no_grad():
        lm_model.eval()
        for i, (pad_output, mask_output) in enumerate(tqdm(train_bar), 1):
            src_strs, tgt_strs, gen_strs = generate_outputs(
                lm_model, pad_output, mask_output, text_encoder, device,
                args.beam, args.gen_len, args.k, args.decoding_strategy)
            srcs.extend(src_strs)
            hyps.extend(gen_strs)
            refs.extend(tgt_strs)

    for i in range(len(hyps)):
        print("*" * 50)
        print("Source: {}".format(srcs[i]))
        print('Hypothesis: {}'.format(hyps[i]))
        print("Reference: {}".format(refs[i]))
Exemple #20
0
def train(config):
    net = BertForMaskedLM.from_pretrained(config.model)
    lossFunc = KLDivLoss(config)

    if torch.cuda.is_available():
        net = net.cuda()
        lossFunc = lossFunc.cuda()

        if config.dataParallel:
            net = DataParallelModel(net)
            lossFunc = DataParallelCriterion(lossFunc)

    options = optionsLoader(LOG, config.optionFrames, disp=False)
    Tokenizer = BertTokenizer.from_pretrained(config.model)
    prepareFunc = prepare_data

    trainSet = Dataset('train', config.batch_size,
                       lambda x: len(x[0]) + len(x[1]), prepareFunc, Tokenizer,
                       options['dataset'], LOG, 'train')
    validSet = Dataset('valid', config.batch_size,
                       lambda x: len(x[0]) + len(x[1]), prepareFunc, Tokenizer,
                       options['dataset'], LOG, 'valid')

    print(len(trainSet))

    Q = []
    best_vloss = 1e99
    counter = 0
    lRate = config.lRate

    prob_src = config.prob_src
    prob_tgt = config.prob_tgt

    num_train_optimization_steps = len(trainSet) * options['training'][
        'stopConditions']['max_epoch']
    param_optimizer = list(net.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=lRate,
                         e=1e-9,
                         t_total=num_train_optimization_steps,
                         warmup=0.0)

    for epoch_idx in range(options['training']['stopConditions']['max_epoch']):
        total_seen = 0
        total_similar = 0
        total_unseen = 0
        total_source = 0

        trainSet.setConfig(config, prob_src, prob_tgt)
        trainLoader = data.DataLoader(dataset=trainSet,
                                      batch_size=1,
                                      shuffle=True,
                                      num_workers=config.dataLoader_workers,
                                      pin_memory=True)

        validSet.setConfig(config, 0.0, prob_tgt)
        validLoader = data.DataLoader(dataset=validSet,
                                      batch_size=1,
                                      shuffle=False,
                                      num_workers=config.dataLoader_workers,
                                      pin_memory=True)

        for batch_idx, batch_data in enumerate(trainLoader):
            if (batch_idx + 1) % 10000 == 0:
                gc.collect()
            start_time = time.time()

            net.train()

            inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data

            inputs = inputs[0].cuda()
            positions = positions[0].cuda()
            token_types = token_types[0].cuda()
            labels = labels[0].cuda()
            masks = masks[0].cuda()
            total_seen += batch_seen
            total_similar += batch_similar
            total_unseen += batch_unseen
            total_source += batch_source

            n_token = int((labels.data != 0).data.sum())

            predicts = net(inputs, positions, token_types, masks)
            loss = lossFunc(predicts, labels, n_token).sum()

            Q.append(float(loss))
            if len(Q) > 200:
                Q.pop(0)
            loss_avg = sum(Q) / len(Q)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

            LOG.log(
                'Epoch %2d, Batch %6d, Loss %9.6f, Average Loss %9.6f, Time %9.6f'
                % (epoch_idx + 1, batch_idx + 1, loss, loss_avg,
                   time.time() - start_time))

            # Checkpoints
            idx = epoch_idx * len(trainSet) + batch_idx + 1
            if (idx >= options['training']['checkingPoints']['checkMin']) and (
                    idx % options['training']['checkingPoints']['checkFreq']
                    == 0):
                if config.do_eval:
                    vloss = 0
                    total_tokens = 0
                    for bid, batch_data in enumerate(validLoader):
                        inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data

                        inputs = inputs[0].cuda()
                        positions = positions[0].cuda()
                        token_types = token_types[0].cuda()
                        labels = labels[0].cuda()
                        masks = masks[0].cuda()

                        n_token = int((labels.data != config.PAD).data.sum())

                        with torch.no_grad():
                            net.eval()
                            predicts = net(inputs, positions, token_types,
                                           masks)
                            vloss += float(lossFunc(predicts, labels).sum())

                        total_tokens += n_token

                    vloss /= total_tokens
                    is_best = vloss < best_vloss
                    best_vloss = min(vloss, best_vloss)
                    LOG.log(
                        'CheckPoint: Validation Loss %11.8f, Best Loss %11.8f'
                        % (vloss, best_vloss))

                    if is_best:
                        LOG.log('Best Model Updated')
                        save_check_point(
                            {
                                'epoch': epoch_idx + 1,
                                'batch': batch_idx + 1,
                                'options': options,
                                'config': config,
                                'state_dict': net.state_dict(),
                                'best_vloss': best_vloss
                            },
                            is_best,
                            path=config.save_path,
                            fileName='latest.pth.tar')
                        counter = 0
                    else:
                        counter += options['training']['checkingPoints'][
                            'checkFreq']
                        if counter >= options['training']['stopConditions'][
                                'rateReduce_bound']:
                            counter = 0
                            for param_group in optimizer.param_groups:
                                old_lr = param_group['lr']
                                param_group['lr'] *= 0.55
                                new_lr = param_group['lr']
                            LOG.log(
                                'Reduce Learning Rate from %11.8f to %11.8f' %
                                (old_lr, new_lr))
                        LOG.log('Current Counter = %d' % (counter))

                else:
                    save_check_point(
                        {
                            'epoch': epoch_idx + 1,
                            'batch': batch_idx + 1,
                            'options': options,
                            'config': config,
                            'state_dict': net.state_dict(),
                            'best_vloss': 1e99
                        },
                        False,
                        path=config.save_path,
                        fileName='checkpoint_Epoch' + str(epoch_idx + 1) +
                        '_Batch' + str(batch_idx + 1) + '.pth.tar')
                    LOG.log('CheckPoint Saved!')

        if options['training']['checkingPoints']['everyEpoch']:
            save_check_point(
                {
                    'epoch': epoch_idx + 1,
                    'batch': batch_idx + 1,
                    'options': options,
                    'config': config,
                    'state_dict': net.state_dict(),
                    'best_vloss': 1e99
                },
                False,
                path=config.save_path,
                fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '.pth.tar')

        LOG.log('Epoch Finished.')
        LOG.log(
            'Total Seen: %d, Total Unseen: %d, Total Similar: %d, Total Source: %d.'
            % (total_seen, total_unseen, total_similar, total_source))
        gc.collect()
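
The optimizer grouping above (no weight decay for biases and LayerNorm parameters) is a standard BERT training recipe. A self-contained sketch with torch.optim.AdamW standing in for BertAdam; the Tiny module is an assumption that merely reproduces the parameter names the no_decay filter matches:

import torch.nn as nn
from torch.optim import AdamW

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)

model = Tiny()
no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
groups = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(groups, lr=1e-4)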
Exemple #21
0
def test(config):
    Best_Model = torch.load(config.test_model)
    Tokenizer = BertTokenizer.from_pretrained(config.model)

    f_in = open(config.inputFile, 'r')

    net = BertForMaskedLM.from_pretrained(config.model)

    # When loading from a model not trained from DataParallel
    #net.load_state_dict(Best_Model['state_dict'])
    #net.eval()

    if torch.cuda.is_available():
        net = net.cuda(0)
        if config.dataParallel:
            net = DataParallelModel(net)

    # When loading from a model trained from DataParallel
    net.load_state_dict(Best_Model['state_dict'])
    net.eval()

    mySearcher = Searcher(net, config)

    f_top1 = open('summary' + config.suffix + '.txt', 'w', encoding='utf-8')
    f_topK = open('summary' + config.suffix + '.txt.' +
                  str(config.answer_size),
                  'w',
                  encoding='utf-8')

    ed = '\n------------------------\n'

    for idx, line in enumerate(f_in):
        source_ = line.strip().split()
        source = Tokenizer.tokenize(line.strip())
        mapping = mapping_tokenize(source_, source)

        source = Tokenizer.convert_tokens_to_ids(source)

        print(idx)
        print(detokenize(translate(source, Tokenizer), mapping), end=ed)

        l_pred = mySearcher.length_Predict(source)
        Answers = mySearcher.search(source)
        baseline = sum(Answers[0][0])

        if config.reranking_method == 'none':
            Answers = sorted(Answers, key=lambda x: sum(x[0]))
        elif config.reranking_method == 'length_norm':
            Answers = sorted(Answers, key=lambda x: length_norm(x[0]))
        elif config.reranking_method == 'bounded_word_reward':
            Answers = sorted(
                Answers,
                key=lambda x: bounded_word_reward(x[0], config.reward, l_pred))
        elif config.reranking_method == 'bounded_adaptive_reward':
            Answers = sorted(
                Answers,
                key=lambda x: bounded_adaptive_reward(x[0], x[2], l_pred))

        texts = [
            detokenize(translate(Answers[k][1], Tokenizer), mapping)
            for k in range(len(Answers))
        ]

        if baseline != sum(Answers[0][0]):
            print('Reranked!')

        print(texts[0], end=ed)
        print(texts[0], file=f_top1)
        print(len(texts), file=f_topK)
        for i in range(len(texts)):
            print(Answers[i][0], file=f_topK)
            print(texts[i], file=f_topK)

    f_top1.close()
    f_topK.close()
Exemple #22
0
    def forward(self, out, tar):
        # NOTE: the source snippet is truncated; this signature is inferred
        # from the body below (an NLL loss normalized by the number of
        # output positions).
        # target dimension[0] / 2
        # tar = target.contiguous().view(-1)
        # out = output.contiguous().view(target.size(0),-1)

        target = tar.contiguous().view(-1)
        output = out[:tar.size(0)]
        normalize = output.size(0) * output.size(1)
        output = output.contiguous().view(target.size(0), -1)
        loss = self.NLL(output, target) / normalize

        return loss


if not eval_model:
    criterion = NLLLoss(ignore_index=PAD)
    parallel_model = DataParallelModel(model)  # Encapsulate the model
    parallel_loss = DataParallelCriterion(criterion)

# In[5]:

# ---------------------------

# def merge_res(res):
#     ((inds1, log_probs1, enc_out1),(inds2, log_probs2, enc_out2)) = res
#     inds = T.cat([inds1, inds2], dim = 0).cpu()
#     enc_out = T.cat([enc_out1, enc_out2], dim = 0).cpu()
#     if type(log_probs1) != list:
#         log_probs = T.cat([log_probs1, log_probs2], dim = 0)
#         return inds, log_probs, enc_out
#     else:
#         return inds, _, enc_out
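
A runnable sketch of the normalized NLL computed in the forward fragment above: score flattened targets against trimmed outputs, then divide by the number of scored positions. The shapes and the per-token normalization are illustrative assumptions.

import torch
import torch.nn as nn

nll = nn.NLLLoss(reduction='sum')

log_probs = torch.log_softmax(torch.randn(4, 10), dim=-1)  # 4 tokens, vocab 10
targets = torch.tensor([1, 4, 2, 7])

# Sum the negative log-likelihoods, then normalize explicitly.
loss = nll(log_probs, targets) / log_probs.size(0)
print(float(loss))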
Exemple #23
0
    if config.model_type == 'LSTM':
        model = LSTMLM(input_size=len(vocab),
                       embedding_size=config.embedding_size,
                       hidden_size=config.hidden_size,
                       output_size=len(vocab),
                       n_layers=config.n_layers,
                       dropout_p=config.dropout_p)
    elif config.model_type == 'BiLSTM':
        model = BiLSTMLM(input_size=len(vocab),
                         embedding_size=config.embedding_size,
                         hidden_size=config.hidden_size,
                         output_size=len(vocab),
                         n_layers=config.n_layers,
                         dropout_p=config.dropout_p)
        
    loss_fn = nn.NLLLoss(ignore_index=vocab.stoi[vocab.pad_token])
    optimizer = optim.Adam(model.parameters())
    
    if config.cuda:
        if config.multi_gpu:
            from parallel import DataParallelModel, DataParallelCriterion
            model = DataParallelModel(model).cuda()
            loss_fn = DataParallelCriterion(loss_fn).cuda()
        else:
            model = model.cuda()
            loss_fn = loss_fn.cuda()
    print('=========MODEL=========\n',model)

    # Train
    for epoch in range(1, config.epochs+1):
        train()
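
A small demonstration of the ignore_index mechanics the loss above relies on: positions whose target equals the pad id contribute neither loss nor gradient. The pad id of 0 is an assumption for this sketch.

import torch
import torch.nn as nn

PAD = 0  # assumed pad id
loss_fn = nn.NLLLoss(ignore_index=PAD)

log_probs = torch.log_softmax(torch.randn(3, 5), dim=-1)
targets = torch.tensor([2, PAD, 4])  # the middle position is padding

# Only the two non-pad positions are averaged into the loss.
print(float(loss_fn(log_probs, targets)))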
Exemple #24
0
    def parallelize(self):
        self.parallel = True
        self.model = DataParallelModel(self.model)
        self.criterion = DataParallelCriterion(self.criterion)
Exemple #25
0
def main():
    args = setup_parser()
    args.final_eval = False

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels(args.data_dir)
    num_labels = len(label_list)
    args.num_labels = num_labels

    # Load pretrained model and tokenizer
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, config=config)
    model.to(args.device)

    # logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)

        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)
        if args.n_gpu > 1:
            model = DataParallelModel(model)

    # Evaluation
    results = {}
    if args.do_eval:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            if args.n_gpu > 1:
                model = DataParallelModel(model)
            args.final_eval = True
            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    if args.save_embeddings:
        save_embeddings(args, model, tokenizer)

    return results
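
The eval_all_checkpoints loop above enumerates checkpoints by globbing for a weights file; a hedged standalone version (WEIGHTS_NAME and the directory layout are assumptions mirroring the snippet):

import glob
import os

WEIGHTS_NAME = "pytorch_model.bin"  # assumed weights file name

def list_checkpoints(output_dir):
    # Every directory under output_dir containing a weights file counts as
    # one evaluable checkpoint.
    return sorted(
        os.path.dirname(path)
        for path in glob.glob(os.path.join(output_dir, "**", WEIGHTS_NAME),
                              recursive=True))

print(list_checkpoints("./runs"))  # e.g. ['./runs/checkpoint-500', ...]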
Exemple #26
0
    def __init__(self,
                 model,
                 mask_prob: float = 0.3,
                 clip: int = 1,
                 optimizer=None,
                 beam_width: int = 5,
                 max_len_a: float = 1.1,
                 max_len_b: int = 5,
                 len_penalty_ratio: float = 0.8,
                 nll_loss: bool = False,
                 fp16: bool = False,
                 mm_mode="mixed",
                 rank: int = -1):
        self.model = model

        self.clip = clip
        self.optimizer = optimizer

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_gpu = torch.cuda.device_count()

        self.mask_prob = mask_prob
        if nll_loss:
            self.criterion = nn.NLLLoss(
                ignore_index=model.text_processor.pad_token_id())
        else:
            self.criterion = SmoothedNLLLoss(
                ignore_index=model.text_processor.pad_token_id())

        self.fp16 = False
        self.rank = rank
        if rank >= 0:
            self.device = torch.device('cuda', rank)
            torch.cuda.set_device(self.device)

        self.model = self.model.to(self.device)

        if fp16:
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level="O2")
            self.fp16 = True

        self.generator = BeamDecoder(self.model,
                                     beam_width=beam_width,
                                     max_len_a=max_len_a,
                                     max_len_b=max_len_b,
                                     len_penalty_ratio=len_penalty_ratio)
        if rank >= 0:
            self.model = DistributedDataParallel(self.model,
                                                 device_ids=[self.rank],
                                                 output_device=self.rank,
                                                 find_unused_parameters=True)
            self.generator = DistributedDataParallel(
                self.generator,
                device_ids=[self.rank],
                output_device=self.rank,
                find_unused_parameters=True)
        elif self.num_gpu > 1:
            print("Let's use", self.num_gpu, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)
            self.generator = DataParallelModel(self.generator)

        self.reference = None
        self.best_bleu = -1.0
        self.mm_mode = mm_mode
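
The __init__ above picks a parallelism strategy from the rank: one process per GPU (DistributedDataParallel) when rank >= 0, one process over all GPUs (DataParallelModel) otherwise. A compact sketch of that decision, with nn.DataParallel standing in for DataParallelModel; the DDP branch assumes torch.distributed.init_process_group has already run:

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel

def wrap_for_parallel(model, rank, num_gpu):
    if rank >= 0:
        # Distributed case: pin this process to its own GPU.
        device = torch.device('cuda', rank)
        torch.cuda.set_device(device)
        return DistributedDataParallel(model.to(device),
                                       device_ids=[rank],
                                       output_device=rank,
                                       find_unused_parameters=True)
    if num_gpu > 1:
        # Single process driving several GPUs.
        return nn.DataParallel(model)
    return model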
Exemple #27
0
class TransferNetworkImg(Network):
    def __init__(self,
                 model_name='DenseNet',
                 model_type='cv_transfer',
                 lr=0.02,
                 criterion=nn.CrossEntropyLoss(),
                 optimizer_name='Adam',
                 dropout_p=0.45,
                 pretrained=True,
                 device=None,
                 best_accuracy=0.,
                 best_validation_loss=None,
                 best_model_file='best_model.pth',
                 head={
                     'num_outputs': 10,
                     'layers': [],
                     'model_type': 'classifier'
                 },
                 class_names=[],
                 num_classes=None,
                 add_extra=True,
                 set_params=True,
                 set_head=True):

        super().__init__(device=device)

        self.set_transfer_model(model_name,
                                pretrained=pretrained,
                                add_extra=add_extra,
                                dropout_p=dropout_p)

        if set_head:
            self.set_model_head(model_name=model_name,
                                head=head,
                                dropout_p=dropout_p,
                                criterion=criterion,
                                device=device)
        if set_params:
            self.set_model_params(criterion=criterion,
                                  optimizer_name=optimizer_name,
                                  lr=lr,
                                  dropout_p=dropout_p,
                                  model_name=model_name,
                                  model_type=model_type,
                                  best_accuracy=best_accuracy,
                                  best_validation_loss=best_validation_loss,
                                  best_model_file=best_model_file,
                                  class_names=class_names,
                                  num_classes=num_classes)

        self.model = self.model.to(device)

    def set_model_params(self,
                         criterion=nn.CrossEntropyLoss(),
                         optimizer_name='Adam',
                         lr=0.1,
                         dropout_p=0.45,
                         model_name='DenseNet',
                         model_type='cv_transfer',
                         best_accuracy=0.,
                         best_validation_loss=None,
                         best_model_file='best_model_file.pth',
                         class_names=[],
                         num_classes=None):

        print('Transfer Learning: current best accuracy = {:.3f}'.format(
            best_accuracy))

        super(TransferNetworkImg,
              self).set_model_params(criterion=criterion,
                                     optimizer_name=optimizer_name,
                                     lr=lr,
                                     dropout_p=dropout_p,
                                     model_name=model_name,
                                     model_type=model_type,
                                     best_accuracy=best_accuracy,
                                     best_validation_loss=best_validation_loss,
                                     best_model_file=best_model_file)
        self.class_names = class_names
        self.num_classes = num_classes
        if len(class_names) == 0:
            self.class_names = {
                k: str(v)
                for k, v in enumerate(list(range(self.head['num_outputs'])))
            }

    def forward(self, x):
        return self.model(x)

    def freeze(self, train_classifier=True):
        super(TransferNetworkImg, self).freeze()
        if train_classifier:
            for param in self.model.fc.parameters():
                param.requires_grad = True

    def parallelize(self):
        self.parallel = True
        self.model = DataParallelModel(self.model)
        self.criterion = DataParallelCriterion(self.criterion)

    def set_transfer_model(self,
                           mname,
                           pretrained=True,
                           add_extra=True,
                           dropout_p=0.45):
        self.model = None
        models_dict = {
            'densenet': {
                'model': models.densenet121(pretrained=pretrained),
                'conv_channels': 1024
            },
            'resnet34': {
                'model': models.resnet34(pretrained=pretrained),
                'conv_channels': 512
            },
            'resnet50': {
                'model': models.resnet50(pretrained=pretrained),
                'conv_channels': 2048
            }
        }
        try:
            meta = models_dict[mname.lower()]
            model = meta['model']
            for param in model.parameters():
                param.requires_grad = False
            self.model = model
            print(
                'Setting transfer learning model: self.model set to {}'.format(
                    mname))
        except KeyError:
            print(
                'Setting transfer learning model: model name {} not supported'.
                format(mname))

        # creating and adding extra layers to the model
        dream_model = None
        if add_extra:
            channels = meta['conv_channels']
            dream_model = nn.Sequential(
                nn.Conv2d(channels, channels, 3, 1, 1),
                # Printer(),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p),
                nn.Conv2d(channels, channels, 3, 1, 1),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p),
                nn.Conv2d(channels, channels, 3, 1, 1),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p))
        self.dream_model = dream_model

    def set_model_head(
            self,
            model_name='DenseNet',
            head={
                'num_outputs': 10,
                'layers': [],
                'class_names': None,
                'model_type': 'classifier'
            },
            criterion=nn.NLLLoss(),
            adaptive=True,
            dropout_p=0.45,
            device=None):

        models_meta = {
            'resnet34': {
                'conv_channels': 512,
                'head_id': -2,
                'adaptive_head': [DAI_AvgPool],
                'normal_head': [nn.AvgPool2d(7, 1)]
            },
            'resnet50': {
                'conv_channels': 2048,
                'head_id': -2,
                'adaptive_head': [DAI_AvgPool],
                'normal_head': [nn.AvgPool2d(7, 1)]
            },
            'densenet': {
                'conv_channels': 1024,
                'head_id': -1,
                'adaptive_head': [nn.ReLU(inplace=True), DAI_AvgPool],
                'normal_head': [nn.ReLU(inplace=True),
                                nn.AvgPool2d(7, 1)]
            }
        }

        name = model_name.lower()
        meta = models_meta[name]
        modules = list(self.model.children())
        l = modules[:meta['head_id']]
        if self.dream_model:
            l += self.dream_model
        if not isinstance(head, dict):
            model = nn.Sequential(*l)
            for layer in head.children():
                if (type(layer).__name__) == 'StdConv':
                    conv_module = layer
                    break
            conv_layer = conv_module.conv
            temp_args = [
                conv_layer.out_channels, conv_layer.kernel_size,
                conv_layer.stride, conv_layer.padding
            ]
            temp_args.insert(0, meta['conv_channels'])
            conv_layer = nn.Conv2d(*temp_args)
            conv_module.conv = conv_layer
            model.add_module('custom_head', head)
        else:
            head['criterion'] = criterion
            if head['model_type'].lower() == 'classifier':
                head['output_non_linearity'] = None
            self.num_outputs = head['num_outputs']
            fc = modules[-1]
            try:
                in_features = fc.in_features
            except:
                in_features = fc.model.out.in_features
            fc = FC(num_inputs=in_features,
                    num_outputs=head['num_outputs'],
                    layers=head['layers'],
                    model_type=head['model_type'],
                    output_non_linearity=head['output_non_linearity'],
                    dropout_p=dropout_p,
                    criterion=head['criterion'],
                    optimizer_name=None,
                    device=device)
            if adaptive:
                l += meta['adaptive_head']
            else:
                l += meta['normal_head']
            model = nn.Sequential(*l)
            model.add_module('fc', fc)
        self.model = model
        self.head = head

        if isinstance(head, dict):
            print('Model: {}, Setting head: inputs: {} hidden: {} outputs: {}'.
                  format(model_name, in_features, head['layers'],
                         head['num_outputs']))
        else:
            print('Model: {}, Setting head: {}'.format(model_name,
                                                       type(head).__name__))

    def _get_dropout(self):
        return self.dropout_p

    def _set_dropout(self, p=0.45):
        self.dropout_p = p
        # the custom head is registered as 'fc' in set_model_head above
        if getattr(self.model, 'fc', None) is not None:
            print('{}: setting head (FC) dropout prob to {:.3f}'.format(
                self.model_name, p))
            self.model.fc._set_dropout(p=p)

    def get_model_params(self):
        params = super(TransferNetworkImg, self).get_model_params()
        params['class_names'] = self.class_names
        params['num_classes'] = self.num_classes
        params['head'] = self.head
        return params
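
A brief usage sketch for the head-replacement API above (hedged: the constructor is
assumed to mirror the `FoodIngredients` class shown in a later example, and the class
names are placeholders, not values from the original project):

net = TransferNetworkImg(model_name='DenseNet', pretrained=True)
net.set_model_head(model_name='DenseNet',
                   head={'num_outputs': 5,
                         'layers': [512],
                         'class_names': ['burger', 'pizza', 'salad', 'soup', 'sushi'],
                         'model_type': 'classifier'},
                   criterion=nn.NLLLoss(),
                   adaptive=True)
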
Example #28
def createModels(args, userNum, itemNum, adj):
    if args.model == 'NCF':
        model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8]).cuda()
    elif args.model == 'NMF':
        model = NMF(args.model, userNum, itemNum, 3, args.embedSize,
                    args.droprate).cuda()
    elif args.model == 'NGCFMF':
        model = NGCFMF(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers).cuda()
    elif args.model == 'NGCFMLP':
        model = NGCFMLP(userNum,
                        itemNum,
                        adj,
                        embedSize=args.embedSize,
                        layers=args.layers).cuda()
    elif args.model == 'NGCFMFMLP':
        model = NGCFMFMLP(userNum,
                          itemNum,
                          adj,
                          embedSize=args.embedSize,
                          layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MF':
        model = NGCFMF_concat_MF(userNum,
                                 itemNum,
                                 adj,
                                 embedSize=args.embedSize,
                                 layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MLP':
        model = NGCFMF_concat_MLP(userNum,
                                  itemNum,
                                  adj,
                                  embedSize=args.embedSize,
                                  layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MF':
        model = NGCFMLP_concat_MF(userNum,
                                  itemNum,
                                  adj,
                                  embedSize=args.embedSize,
                                  layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MLP':
        model = NGCFMLP_concat_MLP(userNum,
                                   itemNum,
                                   adj,
                                   embedSize=args.embedSize,
                                   layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MF_MLP':
        model = NGCFMF_concat_MF_MLP(userNum,
                                     itemNum,
                                     adj,
                                     embedSize=args.embedSize,
                                     layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MF_MLP':
        model = NGCFMLP_concat_MF_MLP(userNum,
                                      itemNum,
                                      adj,
                                      embedSize=args.embedSize,
                                      layers=args.layers).cuda()
    elif args.model == 'GACFV1':
        model = GACFV1(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV2':
        model = GACFV2(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFMask':
        model = GACFMask(userNum,
                         itemNum,
                         adj,
                         embedSize=args.embedSize,
                         layers=args.layers,
                         droprate=args.droprate).cuda()
    elif args.model == 'SPGA':
        model = SPGACF(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV3':
        model = GACFV3(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV4':
        model = GACFV4(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV5':
        model = GACFV5(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    elif args.model == 'GACFV6':
        model = GACFV6(userNum,
                       itemNum,
                       adj,
                       embedSize=args.embedSize,
                       layers=args.layers,
                       droprate=args.droprate).cuda()
    else:
        raise ValueError('Unknown model: {}'.format(args.model))

    if args.train_mode == 'PairSampling':
        lossfn = BPRLoss()
        if args.parallel:
            model = DataParallelModel(model)
            lossfn = DataParallelCriterion2(lossfn)
    elif args.train_mode == 'NegSampling':
        lossfn = BCEWithLogitsLoss()
        if args.parallel:
            model = DataParallelModel(model)  # parallelize the model
            lossfn = DataParallelCriterion(lossfn)  # parallelize the loss function
    else:
        raise ValueError('Unknown train_mode: {}'.format(args.train_mode))
    optim = Adam(model.parameters(),
                 lr=args.lr,
                 weight_decay=args.weight_decay)
    return model, lossfn, optim
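
A hypothetical invocation sketch (the attribute names follow what `createModels`
reads from `args`; `adj` stands for the precomputed user-item adjacency structure the
graph models expect, and a CUDA device is assumed since every branch calls `.cuda()`):

from argparse import Namespace

args = Namespace(model='NGCFMF', train_mode='NegSampling', parallel=False,
                 embedSize=64, layers=[128, 64, 32], droprate=0.1,
                 lr=1e-3, weight_decay=1e-5)
# adj: precomputed user-item adjacency from the surrounding project
model, lossfn, optim = createModels(args, userNum=1000, itemNum=5000, adj=adj)
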
Example #29
class FoodIngredients(Network):
    def __init__(self,
                 model_name='DenseNet',
                 model_type='food',
                 lr=0.02,
                 optimizer_name='Adam',
                 criterion1=nn.CrossEntropyLoss(),
                 criterion2=nn.BCEWithLogitsLoss(),
                 dropout_p=0.45,
                 pretrained=True,
                 device=None,
                 best_accuracy=0.,
                 best_validation_loss=None,
                 best_model_file='best_model.pth',
                 head1={
                     'num_outputs': 10,
                     'layers': [],
                     'model_type': 'classifier'
                 },
                 head2={
                     'num_outputs': 10,
                     'layers': [],
                     'model_type': 'multi_label_classifier'
                 },
                 class_names=[],
                 num_classes=None,
                 ingredient_names=[],
                 num_ingredients=None,
                 add_extra=True,
                 set_params=True,
                 set_head=True):

        super().__init__(device=device)

        self.set_transfer_model(model_name,
                                pretrained=pretrained,
                                add_extra=add_extra,
                                dropout_p=dropout_p)

        if set_head:
            self.set_model_head(model_name=model_name,
                                head1=head1,
                                head2=head2,
                                dropout_p=dropout_p,
                                criterion1=criterion1,
                                criterion2=criterion2,
                                device=device)
        if set_params:
            self.set_model_params(
                optimizer_name=optimizer_name,
                lr=lr,
                dropout_p=dropout_p,
                model_name=model_name,
                model_type=model_type,
                best_accuracy=best_accuracy,
                best_validation_loss=best_validation_loss,
                best_model_file=best_model_file,
                class_names=class_names,
                num_classes=num_classes,
                ingredient_names=ingredient_names,
                num_ingredients=num_ingredients,
            )

        self.model = self.model.to(device)

    def set_model_params(self,
                         criterion1=nn.CrossEntropyLoss(),
                         criterion2=nn.BCEWithLogitsLoss(),
                         optimizer_name='Adam',
                         lr=0.1,
                         dropout_p=0.45,
                         model_name='DenseNet',
                         model_type='cv_transfer',
                         best_accuracy=0.,
                         best_validation_loss=None,
                         best_model_file='best_model_file.pth',
                         head1={
                             'num_outputs': 10,
                             'layers': [],
                             'model_type': 'classifier'
                         },
                         head2={
                             'num_outputs': 10,
                             'layers': [],
                             'model_type': 'multi_label_classifier'
                         },
                         class_names=[],
                         num_classes=None,
                         ingredient_names=[],
                         num_ingredients=None):

        print(
            'Food Names: current best accuracy = {:.3f}'.format(best_accuracy))
        if best_validation_loss is not None:
            print('Food Ingredients: current best loss = {:.3f}'.format(
                best_validation_loss))

        super(FoodIngredients,
              self).set_model_params(optimizer_name=optimizer_name,
                                     lr=lr,
                                     dropout_p=dropout_p,
                                     model_name=model_name,
                                     model_type=model_type,
                                     best_accuracy=best_accuracy,
                                     best_validation_loss=best_validation_loss,
                                     best_model_file=best_model_file)
        self.class_names = class_names
        self.num_classes = num_classes
        self.ingredient_names = ingredient_names
        self.num_ingredients = num_ingredients
        self.criterion1 = criterion1
        self.criterion2 = criterion2

    def forward(self, x):
        l = list(self.model.children())
        for m in l[:-2]:
            x = m(x)
        food = l[-2](x)
        ingredients = l[-1](x)
        return (food, ingredients)

    def compute_loss(self, outputs, labels, w1=1., w2=1.):
        out1, out2 = outputs
        label1, label2 = labels
        loss1 = self.criterion1(out1, label1)
        loss2 = self.criterion2(out2, label2)
        return [(loss1 * w1) + (loss2 * w2)]

    def freeze(self, train_classifier=True):
        super(FoodIngredients, self).freeze()
        if train_classifier:
            for param in self.model.fc1.parameters():
                param.requires_grad = True
            for param in self.model.fc2.parameters():
                param.requires_grad = True

    def parallelize(self):
        # DataParallelModel scatters each batch across the visible GPUs, and
        # DataParallelCriterion keeps the loss computation distributed as well.
        self.parallel = True
        self.model = DataParallelModel(self.model)
        # this class keeps two criteria (one per head), so both are wrapped
        self.criterion1 = DataParallelCriterion(self.criterion1)
        self.criterion2 = DataParallelCriterion(self.criterion2)

    def set_transfer_model(self,
                           mname,
                           pretrained=True,
                           add_extra=True,
                           dropout_p=0.45):
        self.model = None
        models_dict = {
            'densenet': {
                'model': models.densenet121(pretrained=pretrained),
                'conv_channels': 1024
            },
            'resnet34': {
                'model': models.resnet34(pretrained=pretrained),
                'conv_channels': 512
            },
            'resnet50': {
                'model': models.resnet50(pretrained=pretrained),
                'conv_channels': 2048
            }
        }
        meta = models_dict[mname.lower()]
        try:
            model = meta['model']
            for param in model.parameters():
                param.requires_grad = False
            self.model = model
            print(
                'Setting transfer learning model: self.model set to {}'.format(
                    mname))
        except:
            print(
                'Setting transfer learning model: model name {} not supported'.
                format(mname))

        # creating and adding extra layers to the model
        dream_model = None
        if add_extra:
            channels = meta['conv_channels']
            dream_model = nn.Sequential(
                nn.Conv2d(channels, channels, 3, 1, 1),
                # Printer(),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p),
                nn.Conv2d(channels, channels, 3, 1, 1),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p),
                nn.Conv2d(channels, channels, 3, 1, 1),
                nn.BatchNorm2d(channels),
                nn.ReLU(True),
                nn.Dropout2d(dropout_p))
        self.dream_model = dream_model

    def set_model_head(
            self,
            model_name='DenseNet',
            head1={
                'num_outputs': 10,
                'layers': [],
                'class_names': None,
                'model_type': 'classifier'
            },
            head2={
                'num_outputs': 10,
                'layers': [],
                'class_names': None,
                'model_type': 'multi_label_classifier'
            },
            criterion1=nn.CrossEntropyLoss(),
            criterion2=nn.BCEWithLogitsLoss(),
            adaptive=True,
            dropout_p=0.45,
            device=None):

        models_meta = {
            'resnet34': {
                'conv_channels': 512,
                'head_id': -2,
                'adaptive_head': [DAI_AvgPool],
                'normal_head': [nn.AvgPool2d(7, 1)]
            },
            'resnet50': {
                'conv_channels': 2048,
                'head_id': -2,
                'adaptive_head': [DAI_AvgPool],
                'normal_head': [nn.AvgPool2d(7, 1)]
            },
            'densenet': {
                'conv_channels': 1024,
                'head_id': -1,
                'adaptive_head': [nn.ReLU(inplace=True), DAI_AvgPool],
                'normal_head': [nn.ReLU(inplace=True),
                                nn.AvgPool2d(7, 1)]
            }
        }

        name = model_name.lower()
        meta = models_meta[name]
        modules = list(self.model.children())
        l = modules[:meta['head_id']]
        if self.dream_model:
            l += self.dream_model
        heads = [head1, head2]
        crits = [criterion1, criterion2]
        fcs = []
        for head, criterion in zip(heads, crits):
            head['criterion'] = criterion
            if head['model_type'].lower() == 'classifier':
                head['output_non_linearity'] = None
            # make sure the key exists for the multi-label head as well,
            # since it is read unconditionally when building the FC head
            head.setdefault('output_non_linearity', None)
            fc = modules[-1]
            try:
                in_features = fc.in_features
            except AttributeError:
                in_features = fc.model.out.in_features
            fc = FC(num_inputs=in_features,
                    num_outputs=head['num_outputs'],
                    layers=head['layers'],
                    model_type=head['model_type'],
                    output_non_linearity=head['output_non_linearity'],
                    dropout_p=dropout_p,
                    criterion=head['criterion'],
                    optimizer_name=None,
                    device=device)
            fcs.append(fc)
        if adaptive:
            l += meta['adaptive_head']
        else:
            l += meta['normal_head']
        model = nn.Sequential(*l)
        model.add_module('fc1', fcs[0])
        model.add_module('fc2', fcs[1])
        self.model = model
        self.head1 = head1
        self.head2 = head2

        print('Multi-head set up complete.')

    def train_(self, e, trainloader, optimizer, print_every):

        epoch, epochs = e
        self.train()
        t0 = time.time()
        t1 = time.time()
        batches = 0
        running_loss = 0.
        for data_batch in trainloader:
            inputs, label1, label2 = data_batch[0], data_batch[1], data_batch[
                2]
            batches += 1
            inputs = inputs.to(self.device)
            label1 = label1.to(self.device)
            label2 = label2.to(self.device)
            labels = (label1, label2)
            optimizer.zero_grad()
            outputs = self.forward(inputs)
            loss = self.compute_loss(outputs, labels)[0]
            if self.parallel:
                # reduce the per-GPU losses to a scalar before backprop
                loss = loss.sum()
            loss.backward()
            loss = loss.item()
            optimizer.step()
            running_loss += loss
            if batches % print_every == 0:
                elapsed = time.time() - t1
                if elapsed > 60:
                    elapsed /= 60.
                    measure = 'min'
                else:
                    measure = 'sec'
                batch_time = time.time() - t0
                if batch_time > 60:
                    batch_time /= 60.
                    measure2 = 'min'
                else:
                    measure2 = 'sec'
                print(
                    '+----------------------------------------------------------------------+\n'
                    f"{time.asctime().split()[-2]}\n"
                    f"Time elapsed: {elapsed:.3f} {measure}\n"
                    f"Epoch:{epoch+1}/{epochs}\n"
                    f"Batch: {batches+1}/{len(trainloader)}\n"
                    f"Batch training time: {batch_time:.3f} {measure2}\n"
                    f"Batch training loss: {loss:.3f}\n"
                    f"Average training loss: {running_loss/(batches):.3f}\n"
                    '+----------------------------------------------------------------------+\n'
                )
                t0 = time.time()
        return running_loss / len(trainloader)

    def evaluate(self, dataloader, metric='accuracy'):

        running_loss = 0.
        classifier = None

        if self.model_type == 'classifier':  # or self.num_classes is not None:
            classifier = Classifier(self.class_names)

        y_pred = []
        y_true = []

        self.eval()
        rmse_ = 0.
        with torch.no_grad():
            for data_batch in dataloader:
                inputs, label1, label2 = data_batch[0], data_batch[
                    1], data_batch[2]
                inputs = inputs.to(self.device)
                label1 = label1.to(self.device)
                label2 = label2.to(self.device)
                labels = (label1, label2)
                outputs = self.forward(inputs)
                loss = self.compute_loss(outputs, labels)[0]
                if self.parallel:
                    running_loss += loss.sum().item()
                    outputs = parallel.gather(outputs, self.device)
                else:
                    running_loss += loss.item()
                if classifier is not None and metric == 'accuracy':
                    # accuracy metrics are computed on the food (first) head only
                    classifier.update_accuracies(outputs[0], label1)
                    y_true.extend(list(label1.cpu().numpy()))
                    _, preds = torch.max(torch.exp(outputs[0]), 1)
                    y_pred.extend(list(preds.cpu().numpy()))
                elif metric == 'rmse':
                    rmse_ += rmse(outputs, labels).cpu().numpy()

        self.train()

        ret = {}
        # print('Running_loss: {:.3f}'.format(running_loss))
        if metric == 'rmse':
            print('Total rmse: {:.3f}'.format(rmse_))
            ret['final_rmse'] = rmse_ / len(dataloader)

        ret['final_loss'] = running_loss / len(dataloader)

        if classifier is not None:
            ret['accuracy'], ret[
                'class_accuracies'] = classifier.get_final_accuracies()
            ret['report'] = classification_report(
                y_true, y_pred, target_names=self.class_names)
            ret['confusion_matrix'] = confusion_matrix(y_true, y_pred)
            try:
                ret['roc_auc_score'] = roc_auc_score(y_true, y_pred)
            except ValueError:
                pass
        return ret

    def evaluate_food(self, dataloader, metric='accuracy'):

        running_loss = 0.
        classifier = None

        classifier = Classifier(self.class_names)

        y_pred = []
        y_true = []

        self.eval()
        rmse_ = 0.
        with torch.no_grad():
            for data_batch in dataloader:
                inputs, labels = data_batch[0], data_batch[1]
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                outputs = self.forward(inputs)[0]
                if classifier is not None and metric == 'accuracy':
                    try:
                        classifier.update_accuracies(outputs, labels)
                        y_true.extend(list(labels.squeeze(0).cpu().numpy()))
                        _, preds = torch.max(torch.exp(outputs), 1)
                        y_pred.extend(list(preds.cpu().numpy()))
                    except Exception:
                        pass
                elif metric == 'rmse':
                    rmse_ += rmse(outputs, labels).cpu().numpy()

        self.train()

        ret = {}
        # print('Running_loss: {:.3f}'.format(running_loss))
        if metric == 'rmse':
            print('Total rmse: {:.3f}'.format(rmse_))
            ret['final_rmse'] = rmse_ / len(dataloader)

        ret['final_loss'] = running_loss / len(dataloader)

        if classifier is not None:
            ret['accuracy'], ret[
                'class_accuracies'] = classifier.get_final_accuracies()
            ret['report'] = classification_report(
                y_true, y_pred, target_names=self.class_names)
            ret['confusion_matrix'] = confusion_matrix(y_true, y_pred)
            try:
                ret['roc_auc_score'] = roc_auc_score(y_true, y_pred)
            except ValueError:
                pass
        return ret

    def find_lr(self,
                trn_loader,
                init_value=1e-8,
                final_value=10.,
                beta=0.98,
                plot=False):

        print('\nFinding the ideal learning rate.')

        model_state = copy.deepcopy(self.model.state_dict())
        optim_state = copy.deepcopy(self.optimizer.state_dict())
        optimizer = self.optimizer
        num = len(trn_loader) - 1
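        # Geometric step so that lr sweeps from init_value to final_value in a
        # single pass over the loader (the usual LR range-test schedule).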
        mult = (final_value / init_value)**(1 / num)
        lr = init_value
        optimizer.param_groups[0]['lr'] = lr
        avg_loss = 0.
        best_loss = 0.
        batch_num = 0
        losses = []
        log_lrs = []
        for data_batch in trn_loader:
            batch_num += 1
            inputs, label1, label2 = data_batch[0], data_batch[1], data_batch[
                2]
            inputs = inputs.to(self.device)
            label1 = label1.to(self.device)
            label2 = label2.to(self.device)
            labels = (label1, label2)
            optimizer.zero_grad()
            outputs = self.forward(inputs)
            loss = self.compute_loss(outputs, labels)[0]
            #Compute the smoothed loss
            if self.parallel:
                avg_loss = beta * avg_loss + (1 - beta) * loss.sum().item()
            else:
                avg_loss = beta * avg_loss + (1 - beta) * loss.item()
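            # Bias-correct the running average (the same trick Adam uses), so the
            # first few batches are not under-weighted.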
            smoothed_loss = avg_loss / (1 - beta**batch_num)
            #Stop if the loss is exploding
            if batch_num > 1 and smoothed_loss > 4 * best_loss:
                self.log_lrs, self.find_lr_losses = log_lrs, losses
                self.model.load_state_dict(model_state)
                self.optimizer.load_state_dict(optim_state)
                if plot:
                    self.plot_find_lr()
                temp_lr = self.log_lrs[np.argmin(self.find_lr_losses) -
                                       (len(self.log_lrs) // 8)]
                self.lr = (10**temp_lr)
                print('Found it: {}\n'.format(self.lr))
                return self.lr
            #Record the best loss
            if smoothed_loss < best_loss or batch_num == 1:
                best_loss = smoothed_loss
            #Store the values
            losses.append(smoothed_loss)
            log_lrs.append(math.log10(lr))
            #Do the SGD step
            if self.parallel:
                loss.sum().backward()
            else:
                loss.backward()
            optimizer.step()
            #Update the lr for the next step
            lr *= mult
            optimizer.param_groups[0]['lr'] = lr

        self.log_lrs, self.find_lr_losses = log_lrs, losses
        self.model.load_state_dict(model_state)
        self.optimizer.load_state_dict(optim_state)
        if plot:
            self.plot_find_lr()
        temp_lr = self.log_lrs[np.argmin(self.find_lr_losses) -
                               (len(self.log_lrs) // 10)]
        self.lr = (10**temp_lr)
        print('Found it: {}\n'.format(self.lr))
        return self.lr

    def plot_find_lr(self):
        plt.ylabel("Loss")
        plt.xlabel("Learning Rate (log scale)")
        plt.plot(self.log_lrs, self.find_lr_losses)
        plt.show()

    def classify(self,
                 inputs,
                 thresh=0.4):  #,show = False,mean = None,std = None):
        outputs = self.predict(inputs)
        food, ing = outputs
        try:
            _, preds = torch.max(torch.exp(food), 1)
        except:
            _, preds = torch.max(torch.exp(food.unsqueeze(0)), 1)
        ing_outs = ing.sigmoid()
        ings = (ing_outs >= thresh)
        class_preds = [str(self.class_names[p]) for p in preds]
        ing_preds = [
            self.ingredient_names[p.nonzero().squeeze(1).cpu()] for p in ings
        ]
        return class_preds, ing_preds

    def _get_dropout(self):
        return self.dropout_p

    def get_model_params(self):
        params = super(FoodIngredients, self).get_model_params()
        params['class_names'] = self.class_names
        params['num_classes'] = self.num_classes
        params['ingredient_names'] = self.ingredient_names
        params['num_ingredients'] = self.num_ingredients
        params['head1'] = self.head1
        params['head2'] = self.head2
        return params
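
A sketch of how this two-headed network might be driven end to end (all names, sizes
and tensors here are placeholders, not values from the original project; `ing_names`
should be a numpy array so that `classify` can index it with a tensor):

food_net = FoodIngredients(
    model_name='DenseNet',
    head1={'num_outputs': 101, 'layers': [512], 'model_type': 'classifier'},
    head2={'num_outputs': 353, 'layers': [512],
           'model_type': 'multi_label_classifier'},
    class_names=food_names,          # placeholder list of food labels
    num_classes=101,
    ingredient_names=ing_names,      # placeholder array of ingredient labels
    num_ingredients=353)
food_preds, ing_preds = food_net.classify(image_batch, thresh=0.4)
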
Example #30
class LMTrainer:
    def __init__(self,
                 model,
                 mask_prob: float = 0.15,
                 clip: int = 1,
                 optimizer=None):
        self.model = model
        self.clip = clip
        self.optimizer = optimizer

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self.mask_prob = mask_prob
        self.criterion = nn.NLLLoss(
            ignore_index=model.text_processor.pad_token_id())

        num_gpu = torch.cuda.device_count()
        if num_gpu > 1:
            print("Let's use", num_gpu, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)

        self.best_dev_loss = float("inf")
        self.best_train_loss = float("inf")
        self.last_train_loss = float("inf")

    def train_epoch(self, data_iter: data_utils.DataLoader,
                    dev_data_iter: data_utils.DataLoader, saving_path: str,
                    step: int):
        "Standard Training and Logging Function"
        start = time.time()
        total_tokens, total_loss, tokens, cur_loss = 0, 0, 0, 0
        model = self.model.module if hasattr(self.model,
                                             "module") else self.model

        for i, batch in enumerate(data_iter):
            if self.optimizer is not None:
                self.optimizer.zero_grad()
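            # Masked-LM step: mask_text hides a random mask_prob fraction of the
            # tokens and returns the original tokens at those positions as targets.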
            mask, target, texts = mask_text(self.mask_prob, batch["pad_mask"],
                                            batch["texts"],
                                            model.text_processor)
            try:
                predictions = self.model(mask=mask,
                                         texts=texts,
                                         pads=batch["pad_mask"],
                                         langs=batch["langs"])
                ntokens = target.size(0)

                if ntokens == 0:  # Nothing to predict!
                    continue

                loss = self.criterion(predictions, target).mean()
                loss.backward()

                unmask_text(mask, target, texts)

                if self.optimizer is not None:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.clip)

                    self.optimizer.step()
                    step += 1

                loss = float(loss.data) * ntokens
                total_loss += loss
                cur_loss += loss
                total_tokens += ntokens
                tokens += ntokens

                if step % 50 == 0:
                    elapsed = time.time() - start
                    print(
                        datetime.datetime.now(),
                        "Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                        (step, cur_loss / tokens, tokens / elapsed))

                    if step % 500 == 0:
                        self.validate_and_save(saving_path, dev_data_iter)

                    start, tokens, cur_loss = time.time(), 0, 0
            except RuntimeError as err:
                print("Problem with batch item", texts.size(), err)
                torch.cuda.empty_cache()

        current_loss = total_loss / total_tokens
        print("Total loss in this epoch: %f" % current_loss)
        if current_loss < self.best_train_loss:
            self.best_train_loss = current_loss
            model_to_save = (self.model.module if hasattr(
                self.model, "module") else self.model)
            model_to_save.save(saving_path + ".latest")
            with open(os.path.join(saving_path + ".latest", "optim"),
                      "wb") as fp:
                pickle.dump(self.optimizer, fp)
        self.last_train_loss = current_loss

        self.validate_and_save(saving_path, dev_data_iter)
        return step

    def validate_and_save(self, saving_path, dev_data_iter):
        with torch.no_grad():
            model = self.model.module if hasattr(self.model,
                                                 "module") else self.model
            model.eval()
            total_dev_loss, total_dev_tokens = 0, 0
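            # Token-weighted average: every batch contributes loss * ntokens, and
            # the sum is divided by the total token count at the end.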
            for batch in dev_data_iter:
                mask, target, texts = mask_text(self.mask_prob,
                                                batch["pad_mask"],
                                                batch["texts"].clone(),
                                                model.text_processor)
                predictions = self.model(mask=mask,
                                         texts=texts,
                                         pads=batch["pad_mask"],
                                         langs=batch["langs"])
                ntokens = target.size(0)

                if ntokens == 0:  # Nothing to predict!
                    continue
                loss = self.criterion(predictions,
                                      target).mean().data * ntokens
                total_dev_loss += float(loss)
                total_dev_tokens += ntokens

            dev_loss = total_dev_loss / total_dev_tokens
            print("Current dev loss", dev_loss)
            if self.best_dev_loss > float(dev_loss):
                self.best_dev_loss = float(dev_loss)
                print("saving best dev loss", self.best_dev_loss)
                model_to_save = (self.model.module if hasattr(
                    self.model, "module") else self.model)
                model_to_save.save(saving_path)
                with open(os.path.join(saving_path, "optim"), "wb") as fp:
                    pickle.dump(self.optimizer, fp)
            model.train()

    @staticmethod
    def config_dropout(model, dropout):
        model.encoder.config.hidden_dropout_prob = dropout
        model.encoder.config.attention_probs_dropout_prob = dropout

    @staticmethod
    def train(options):
        if not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        text_processor = TextProcessor(options.tokenizer_path)

        lm_class = ReformerLM if options.reformer else LM
        if options.pretrained_path is None:
            lm = lm_class(text_processor=text_processor,
                          size=options.model_size)
        else:
            lm = lm_class.load(options.pretrained_path)

        if options.reformer:
            lm.config.hidden_dropout_prob = options.dropout
            lm.config.local_attention_probs_dropout_prob = options.dropout
            lm.config.lsh_attention_probs_dropout_prob = options.dropout
        else:
            LMTrainer.config_dropout(lm, options.dropout)

        train_data = dataset.TextDataset(save_cache_dir=options.train_path,
                                         max_cache_size=options.cache_size)
        dev_data = dataset.TextDataset(save_cache_dir=options.dev_path,
                                       max_cache_size=options.cache_size,
                                       load_all=True)

        if options.continue_train:
            with open(os.path.join(options.pretrained_path, "optim"),
                      "rb") as fp:
                optimizer = pickle.load(fp)
        else:
            optimizer = build_optimizer(lm, options.learning_rate,
                                        options.warmup)

        trainer = LMTrainer(model=lm,
                            mask_prob=options.mask_prob,
                            optimizer=optimizer,
                            clip=options.clip)

        collator = dataset.TextCollator(pad_idx=text_processor.pad_token_id())
        train_sampler, dev_sampler = None, None

        pin_memory = torch.cuda.is_available()
        loader = data_utils.DataLoader(train_data,
                                       batch_size=options.batch,
                                       shuffle=False,
                                       pin_memory=pin_memory,
                                       collate_fn=collator,
                                       sampler=train_sampler)
        dev_loader = data_utils.DataLoader(dev_data,
                                           batch_size=options.batch,
                                           shuffle=False,
                                           pin_memory=pin_memory,
                                           collate_fn=collator,
                                           sampler=dev_sampler)

        step, train_epoch = 0, 1
        while step <= options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(data_iter=loader,
                                       dev_data_iter=dev_loader,
                                       saving_path=options.model_path,
                                       step=step)