Exemple #1
0
def createModels(args, userNum, itemNum):
    """Build the model, loss function and optimizer selected by ``args``.

    Args:
        args: namespace providing ``model``, ``train_mode``, ``parallel``,
            ``embedSize``, ``layers``, ``droprate``, ``lr`` and
            ``weight_decay``.
        userNum: passed to the model constructor as its first argument.
        itemNum: passed to the model constructor as its second argument.

    Returns:
        A ``(model, lossfn, optim)`` tuple.  The model is moved to CUDA and,
        when ``args.parallel`` is set, both model and loss are wrapped in
        their data-parallel counterparts.

    Raises:
        ValueError: if ``args.model`` or ``args.train_mode`` is not one of
            the supported names.
    """
    # Fail fast with a clear message: previously an unknown name left
    # `model` / `lossfn` unbound and surfaced later as UnboundLocalError.
    if args.model not in ('SPUIGACF', 'SPUIMultiGACF', 'SPUIGAGPCF'):
        raise ValueError('Unknown model: {}'.format(args.model))
    if args.train_mode not in ('PairSampling', 'NegSampling'):
        raise ValueError('Unknown train_mode: {}'.format(args.train_mode))

    hyper = dict(embedSize=args.embedSize,
                 layers=args.layers,
                 droprate=args.droprate)
    if args.model == 'SPUIGACF':
        model = SPUIGACF(userNum, itemNum, **hyper).cuda()
    elif args.model == 'SPUIMultiGACF':
        model = SPUIMultiGACF(userNum, itemNum, **hyper).cuda()
    else:  # 'SPUIGAGPCF'
        # NOTE(review): `adj` is not a parameter of this function; it is
        # presumably a module-level variable -- confirm it is defined before
        # this branch can be reached.
        model = SPUIGAGPCF(userNum, itemNum, adj, **hyper).cuda()

    pair_sampling = args.train_mode == 'PairSampling'
    lossfn = BPRLoss() if pair_sampling else BCEWithLogitsLoss()

    # `== True` is kept on purpose so that only True/1 enables the wrappers,
    # exactly as before (a non-empty string, for instance, does not).
    if args.parallel == True:  # noqa: E712
        model = DataParallelModel(model)  # parallelise the model
        # parallelise the loss; pair sampling uses its own criterion wrapper
        if pair_sampling:
            lossfn = DataParallelCriterion2(lossfn)
        else:
            lossfn = DataParallelCriterion(lossfn)

    optim = Adam(model.parameters(),
                 lr=args.lr,
                 weight_decay=args.weight_decay)
    return model, lossfn, optim
Exemple #2
0
    def __init__(self,
                 model,
                 mask_prob: float = 0.15,
                 clip: int = 1,
                 optimizer=None):
        """Place the model on the available device and set up the loss.

        Args:
            model: object exposing ``to(device)`` and
                ``text_processor.pad_token_id()``.
            mask_prob: stored as ``self.mask_prob``.
            clip: stored as ``self.clip``.
            optimizer: stored as ``self.optimizer``.
        """
        self.clip = clip
        self.optimizer = optimizer
        self.mask_prob = mask_prob

        cuda_ok = torch.cuda.is_available()
        self.device = torch.device("cuda" if cuda_ok else "cpu")
        self.model = model.to(self.device)

        # Padding positions are excluded from the NLL loss.
        pad_id = model.text_processor.pad_token_id()
        self.criterion = nn.NLLLoss(ignore_index=pad_id)

        gpu_count = torch.cuda.device_count()
        if gpu_count > 1:
            # With several GPUs both the model and the loss get wrapped.
            print("Let's use", gpu_count, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)

        # Loss trackers all start at +inf.
        for tracker in ("best_dev_loss", "best_train_loss", "last_train_loss"):
            setattr(self, tracker, float("inf"))
    def __init__(self,
                 model,
                 caption_model,
                 mask_prob: float = 0.3,
                 clip: int = 1,
                 optimizer=None,
                 beam_width: int = 5,
                 max_len_a: float = 1.1,
                 max_len_b: int = 5,
                 len_penalty_ratio: float = 0.8,
                 nll_loss: bool = False,
                 fp16: bool = False,
                 mm_mode="mixed"):
        """Initialise the parent trainer, then prepare the caption model.

        Every argument except ``caption_model`` is forwarded positionally to
        the parent constructor.

        Args:
            caption_model: switched to eval mode and moved to ``self.device``;
                wrapped by ``amp`` (single GPU with ``fp16``) or by
                ``DataParallelModel`` (more than one GPU).
        """
        parent_args = (model, mask_prob, clip, optimizer, beam_width,
                       max_len_a, max_len_b, len_penalty_ratio, nll_loss,
                       fp16, mm_mode)
        super().__init__(*parent_args)

        # The caption model is only used for inference here.
        caption_model.eval()
        captioner = caption_model.to(self.device)

        # The two wrappers are mutually exclusive: amp on exactly one GPU,
        # data-parallel on several.
        if self.num_gpu > 1:
            print("Let's use", self.num_gpu, "GPUs!")
            captioner = DataParallelModel(captioner)
        elif self.num_gpu == 1 and fp16:
            captioner = amp.initialize(captioner, opt_level="O2")

        self.caption_model = captioner
Exemple #4
0
    def __init__(self, cfg: Namespace, data: Dataset):
        """
        Split the data, then build iterators, model, loss and optimizer.

        Args:
            cfg:  configuration
            data:  train dataset
        """
        self.cfg = cfg
        self.train, self.valid = data.split(0.8)
        RATING_FIELD.build_vocab(self.train)

        has_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda' if has_cuda else 'cpu')  # pylint: disable=no-member

        # The configured batch size is multiplied by the GPU count.
        self.batch_size = cfg.batch_size
        if has_cuda:
            self.batch_size *= torch.cuda.device_count()

        def longest_first(exam):
            # Negated length: longer comments sort first.
            return -len(exam.comment_text)

        shared = dict(device=self.device,
                      batch_size=self.batch_size,
                      sort_within_batch=True,
                      sort_key=longest_first)
        self.trn_itr = BucketIterator(self.train,
                                      shuffle=True,
                                      train=True,
                                      **shared)
        self.vld_itr = BucketIterator(self.valid,
                                      shuffle=False,
                                      train=False,
                                      **shared)

        # Logging interval scales with the number of validation batches.
        n_valid_batches = len(self.vld_itr)
        if n_valid_batches < 100:
            self.log_step = 10
        elif n_valid_batches < 1000:
            self.log_step = 100
        else:
            self.log_step = 1000

        bert_path = cfg.bert_path or 'bert-base-cased'
        self.model = BertForSequenceClassification.from_pretrained(
            bert_path, num_labels=2)

        # Weight for class 1 = (#examples with target < 0.5) /
        # (#examples with target >= 0.5).
        examples = self.train.examples
        n_negative = sum(1 for exam in examples if exam.target < 0.5)
        n_positive = sum(1 for exam in examples if exam.target >= 0.5)
        pos_weight = n_negative / n_positive
        class_weights = torch.tensor([1.0, pos_weight], device=self.device)  # pylint: disable=not-callable
        self.criterion = nn.CrossEntropyLoss(weight=class_weights)

        if has_cuda:
            self.model = DataParallelModel(self.model.cuda())
            self.criterion = DataParallelCriterion(self.criterion)
        self.optimizer = optim.Adam(self.model.parameters(), cfg.learning_rate)
Exemple #5
0
def build_model(options):
    """Load the captioning model and wrap it in a beam-search decoder.

    Args:
        options: namespace providing ``model_path``, ``tokenizer_path``,
            ``obj``, ``beam_width``, ``max_len_a``, ``max_len_b``,
            ``len_penalty_ratio`` and ``fp16``.

    Returns:
        ``(generator, text_processor)`` -- the decoder (wrapped by ``amp``
        when ``options.fp16`` is set and by ``DataParallelModel`` when more
        than one GPU is visible) and the loaded model's text processor.
    """
    captioner = Seq2Seq.load(ImageCaptioning,
                             options.model_path,
                             tok_dir=options.tokenizer_path,
                             use_obj=options.obj)
    target = "cuda" if torch.cuda.is_available() else "cpu"
    captioner = captioner.to(torch.device(target))

    decoder = BeamDecoder(captioner,
                          beam_width=options.beam_width,
                          max_len_a=options.max_len_a,
                          max_len_b=options.max_len_b,
                          len_penalty_ratio=options.len_penalty_ratio)
    if options.fp16:
        decoder = amp.initialize(decoder, opt_level="O2")
    if torch.cuda.device_count() > 1:
        decoder = DataParallelModel(decoder)

    return decoder, captioner.text_processor
def train():
    """Resume the saved ensemble checkpoint and train it for 100 epochs.

    The checkpoint ``ensemble_iso_star_5118.pt`` is loaded, wrapped for
    data-parallel execution, and trained with a class-weighted NLL loss.
    Two Adam optimizers are used, one per parameter group returned by
    ``model.get_parameters()``, each with its own per-step exponential decay.

    Returns:
        Whatever ``train_model`` returns (previously this value was assigned
        to a local and discarded).
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaders = create_datasets(num_workers=32, batch_size=600)

    # The model comes from the checkpoint.  The former `models_ensamble`
    # list instantiated two pretrained ResNets that were never used, so it
    # has been removed.
    model = load_checkpoint("ensemble_iso_star_5118.pt")

    # Grab the two parameter groups before wrapping the model.
    # presumably `ft` = fine-tuned backbone params and `cl` = classifier
    # params -- TODO confirm against get_parameters().
    ft, cl = model.get_parameters()
    model = DataParallelModel(model)
    model = model.to(device)

    # NOTE(review): `weight_train` is not defined in this function; it is
    # presumably a module-level array -- confirm.
    weight = torch.from_numpy(weight_train[0]).to(device)
    criterion = DataParallelCriterion(nn.NLLLoss(weight))

    optimizers = [optim.Adam(ft, lr=5e-4), optim.Adam(cl, lr=5e-3)]

    # Multiply each learning rate by gamma after every scheduler step
    # (step_size=1): 0.995 for the first group, 0.992 for the second.
    exp_lr_schedulers = [
        lr_scheduler.StepLR(optimizers[0], step_size=1, gamma=0.995),
        lr_scheduler.StepLR(optimizers[1], step_size=1, gamma=0.992),
    ]

    return train_model(model,
                       criterion,
                       optimizers,
                       exp_lr_schedulers,
                       device,
                       loaders,
                       num_epochs=100)
Exemple #7
0
def main():
    """Fine-tune and evaluate a BERT sequence classifier/regressor.

    Steps, in order: parse the command line, pick the device (single process
    or one GPU per distributed process), seed the RNGs, validate the
    arguments and output directory, build the task processor, tokenizer,
    model and optimizer, build the train/eval data loaders, run one
    evaluation pass after every training epoch, write per-epoch metrics to
    ``eval_results.txt`` and the last epoch's errors to ``errors.txt``, and
    finally save and reload the model.

    Raises:
        ValueError: for an invalid ``gradient_accumulation_steps``, when
            neither ``do_train`` nor ``do_eval`` is set, when the output
            directory is non-empty while training, or for an unknown task.
        ImportError: when apex is required (distributed or fp16) but missing.
    """
    parser = setup_parser()
    args = parser.parse_args()

    # Task name -> processor class.
    # NOTE(review): 'mnli' has an output mode below but no processor here, so
    # it is rejected by the "Task not found" check further down.
    processors = {
        'stsb': StsbProcessor,
        'mednli': MednliProcessor,
        'medsts': MedstsProcessor
    }

    # Task name -> kind of head/loss used for that task.
    output_modes = {
        'mnli': 'classification',
        'stsb': 'regression',
        'mednli': 'classification',
        'medsts': 'regression'
    }

    # NOTE(review): this mapping is not referenced anywhere else in this
    # function; the model is loaded from args.bert_model instead.
    bert_types = {
        'discharge':
        '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_disch_100000',
        'all':
        '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_all_notes_150000',
        'base_uncased': 'bert-base-uncased',
        'base_cased': 'bert-base-cased'
    }

    ##################################################################################################
    ################################### SETUP DATA, DEVICE, MODEL ####################################
    ##################################################################################################
    # local_rank == -1 means non-distributed: use CUDA if available (and not
    # disabled) and count every visible GPU.  Otherwise each process is
    # pinned to the GPU given by local_rank.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device('cuda' if torch.cuda.is_available()
                              and not args.no_cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        #Initialize the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # From here on train_batch_size is the size of a single forward/backward
    # pass; the optimizer steps once every gradient_accumulation_steps passes.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    # Seed every RNG in use with the same seed.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    # Refuse to train into a non-empty output directory.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: {}".format(task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels(output_mode)
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # Optimizer steps = (batches per epoch / accumulation steps) * epochs,
        # divided across processes when running distributed.
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # The default cache directory is keyed by local_rank.
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    # Distributed runs use apex DDP; multi-GPU single-process runs use
    # DataParallelModel; a single device leaves the model unwrapped.
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    ##################################################################################################
    ########################################### OPTIMIZER ############################################
    ##################################################################################################

    if args.do_train:
        param_optimizer = list(model.named_parameters())
        # Parameters whose name contains one of these substrings get no
        # weight decay.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        if args.discriminative_finetuning:
            # Encoder layers are grouped in pairs by substring match on the
            # parameter name.  'layer.1.' carries a trailing dot, presumably
            # so that it does not also match 'layer.10' / 'layer.11'.
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1.', 'layer.2', 'layer.3', 'layer.4', 'layer.5', \
            'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                # Decayed parameters.  The first entry (names matching none
                # of the layer groups) sets no 'lr' of its own; each later
                # group divides the base lr by a further factor of 2.6, from
                # group6 (base lr) down to group1 (lr / 2.6**5).
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.01, 'lr': args.learning_rate},

                # Same grouping again for the no-decay parameters.
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            # Two groups only: decayed and non-decayed parameters.
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            # loss_scale == 0 selects dynamic loss scaling; any other value
            # is used as a static scale.
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            # Used in the training loop to set the learning rate by hand.
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    ##################################################################################################
    ############################################# TRAIN ##############################################
    ##################################################################################################
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        # Convert the training examples to tensors and build the loader.
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        # Labels are integer class ids for classification, floats for
        # regression.
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

    # Evaluation runs only in non-distributed mode or on rank 0.
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        # These rebind the all_* names used for the training tensors above;
        # from here on they refer to the evaluation set.
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.float)

        # Per-example ids, passed to generate_errors below.
        all_pids = np.array([f.pid for f in eval_features])

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        # drop_last=True: a trailing partial batch is not evaluated, which is
        # why labels and pids are truncated to len(preds) further down.
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     drop_last=True)

        # NOTE(review): everything from here down to errors.to_csv(...) is
        # nested inside the `if args.do_eval ...` block above.  Training
        # therefore only runs when evaluation is requested, and it uses
        # train_dataloader / optimizer, which are only bound when
        # args.do_train is set.
        model.train()
        # Epoch index -> headline metric for that epoch.
        epoch_metric = {}
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                # labels=None: the loss is computed below, not by the model.
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                # NOTE(review): the criterion is wrapped in
                # DataParallelCriterion and `logits` is indexed like a
                # per-device list regardless of n_gpu, while the model is
                # only wrapped in DataParallelModel when n_gpu > 1 -- TODO
                # confirm the single-device path behaves as intended.
                # The loss object is also rebuilt on every step.
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [
                        logits[i].view(-1, num_labels)
                        for i in range(len(logits))
                    ]
                    loss = loss_fct(logits, label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  #average on multi-gpu
                # Scale the loss so that the accumulated gradient matches a
                # single large batch.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Step the optimizer once every gradient_accumulation_steps
                # batches.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        #modify lr with special warm up BERT uses
                        #if args.fp16 is False, BertAdam is used that handles this automatically
                        # NOTE(review): this writes one lr into every param
                        # group, overriding the per-group rates set up for
                        # discriminative fine-tuning.
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Evaluation pass at the end of every epoch.
            # NOTE(review): model.eval() is called here and model.train() is
            # only called once before the epoch loop, so later epochs train
            # with the model left in eval mode -- confirm whether intended.
            with torch.no_grad():
                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                preds = []
                # NOTE(review): this `i` appears to be unused; the `i` in the
                # comprehensions below is local to each comprehension.
                i = 0

                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    # Already inside the outer no_grad block.
                    with torch.no_grad():
                        logits = model(input_ids,
                                       segment_ids,
                                       input_mask,
                                       labels=None)

                    if output_mode == 'classification':
                        # loss_fct = CrossEntropyLoss()
                        # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                        loss_fct = CrossEntropyLoss()
                        loss_fct = DataParallelCriterion(loss_fct)
                        logits = [
                            logits[i].view(-1, num_labels)
                            for i in range(len(logits))
                        ]
                        tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
                    elif output_mode == 'regression':
                        # loss_fct = MSELoss()
                        # tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

                        loss_fct = MSELoss()
                        loss_fct = DataParallelCriterion(loss_fct)
                        logits = [
                            logits[i].view(-1) for i in range(len(logits))
                        ]
                        tmp_eval_loss = loss_fct(logits, label_ids.view(-1))

                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1
                    # Collect the per-device outputs onto cuda:0, then append
                    # them to one growing array held in preds[0].
                    logits = parallel.gather(logits, target_device='cuda:0')
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(preds[0],
                                             logits.detach().cpu().numpy(),
                                             axis=0)
                eval_loss = eval_loss / nb_eval_steps
                preds = preds[0]
                if output_mode == 'classification':
                    preds = np.argmax(preds, axis=1)
                elif output_mode == 'regression':
                    preds = np.squeeze(preds)

                # Align labels and ids with the predictions actually made
                # (the loader drops a trailing partial batch).
                all_label_ids = all_label_ids[:preds.shape[0]]
                all_pids = all_pids[:preds.shape[0]]
                errors = generate_errors(preds, all_label_ids.numpy(),
                                         all_pids)

                result = compute_metrics(task_name, preds,
                                         all_label_ids.numpy())

                # NOTE(review): tr_loss is reset at the start of each epoch
                # while global_step accumulates across epochs, so this is not
                # a per-step average for the epoch; it also divides by zero
                # if no optimizer step has been taken yet.
                loss = tr_loss / global_step if args.do_train else None

                result['eval_loss'] = eval_loss
                result['global_step'] = global_step
                result['loss'] = loss
                logger.info('***** Eval Results *****')
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))

                # `_` is the epoch index of the enclosing trange loop.
                epoch_metric[_] = result[
                    'pearson'] if output_mode == 'regression' else result['acc']

        # One tab-separated row per epoch: epoch, metric, learning rate,
        # per-pass train batch size.
        output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
        with open(output_eval_file, 'w') as writer:
            logger.info('***** Eval Results *****')
            # for key in sorted(result.keys()):
            #     logger.info("  %s = %s", key, str(result[key]))
            #     writer.write("%s = %s\n" % (key, str(result[key])))
            # writer.write("{}     {}\n".format("epoch","pearson"))
            for key in sorted(epoch_metric.keys()):
                writer.write("{}\t{}\t{}\t{}\n".format(key,
                                                       str(epoch_metric[key]),
                                                       args.learning_rate,
                                                       args.train_batch_size))

        # Errors from the last epoch only; written to the current working
        # directory rather than args.output_dir.
        errors.to_csv('errors.txt', sep='\t', index=False)

    ##################################################################################################
    ########################################## SAVE & RELOAD #########################################
    ##################################################################################################
    if args.do_train:
        #Save a trained model, config, and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  #only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        # Reload the model and tokenizer that were just written.
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        # Without training, a fresh copy of the pretrained model is loaded.
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    model.to(device)
Exemple #8
0
def createModels(args, userNum, itemNum, rt):
    """Build the model, loss function and optimizer selected by ``args``.

    Args:
        args: namespace providing ``model``, ``evaluate``, ``parallel``,
            ``embedSize``, ``layers``, ``droprate``, ``lr`` and
            ``weight_decay``.
        userNum: passed to the model constructor as its first argument.
        itemNum: passed to the model constructor as its second argument.
        rt: passed as the third constructor argument to every model except
            ``NCF``.

    Returns:
        A ``(model, lossfn, optim)`` tuple.  The model is moved to CUDA and,
        when ``args.parallel`` is set, both model and loss are wrapped in
        their data-parallel counterparts.

    Raises:
        ValueError: if ``args.model`` or ``args.evaluate`` is not one of the
            supported names.
    """
    # Fail fast with a clear message: previously an unknown name left
    # `model` / `lossfn` unbound and surfaced later as UnboundLocalError,
    # possibly after a model had already been allocated on the GPU.
    known_models = ('NCF', 'GCF', 'GACFV1', 'GACFV2', 'GACFV3', 'GACFV4',
                    'GACFV5', 'GACFV6')
    if args.model not in known_models:
        raise ValueError('Unknown model: {}'.format(args.model))
    if args.evaluate not in ('MSE', 'RANK'):
        raise ValueError('Unknown evaluate mode: {}'.format(args.evaluate))

    if args.model == 'NCF':
        # NCF uses fixed hyper-parameters and does not take `rt`.
        model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8]).cuda()
    elif args.model == 'GCF':
        model = GCF(userNum,
                    itemNum,
                    rt,
                    embedSize=args.embedSize,
                    layers=args.layers).cuda()
    else:
        # Every GACF variant shares one constructor signature.
        if args.model == 'GACFV1':
            model_cls = GACFV1
        elif args.model == 'GACFV2':
            model_cls = GACFV2
        elif args.model == 'GACFV3':
            # NOTE(review): 'GACFV3' has always constructed GACFV2 here,
            # which looks like a copy-paste slip.  Kept as-is because a
            # GACFV3 class cannot be confirmed from this file -- verify and
            # switch if it exists.
            model_cls = GACFV2
        elif args.model == 'GACFV4':
            model_cls = GACFV4
        elif args.model == 'GACFV5':
            model_cls = GACFV5
        else:  # 'GACFV6'
            model_cls = GACFV6
        model = model_cls(userNum,
                          itemNum,
                          rt,
                          embedSize=args.embedSize,
                          layers=args.layers,
                          droprate=args.droprate).cuda()

    lossfn = MSELoss() if args.evaluate == 'MSE' else BCEWithLogitsLoss()

    # `== True` is kept on purpose so that only True/1 enables the wrappers,
    # exactly as before (a non-empty string, for instance, does not).
    if args.parallel == True:  # noqa: E712
        model = DataParallelModel(model)  # parallelise the model
        lossfn = DataParallelCriterion(lossfn)  # parallelise the loss function
    optim = Adam(model.parameters(),
                 lr=args.lr,
                 weight_decay=args.weight_decay)
    return model, lossfn, optim
Exemple #9
0
def main():
    """Train the ``EncoderDecoder`` network on the caption datasets.

    All settings (epoch count, GPU ids, batch size, dataset paths, output
    prefix) are hard-coded below. For every epoch the function runs the
    training loop across the contexts in ``ctx_list``, logs running metrics
    every ``log_interval`` batches, calls ``validate`` and then saves the
    network parameters and the trainer states under ``save_prefix``.

    Returns:
        None.
    """
    epoches = 32
    gpu_id = 7  # only passed to validate() below
    ctx_list = [mx.gpu(x) for x in [7, 8]]  # devices used for training
    log_interval = 100  # number of batches between two metric log lines
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None

    # Both resume paths are None, so the two "resume" branches further down
    # are inactive unless these values are edited.
    resume = None
    from mxnet.gluon.data.vision import transforms
    # Image preprocessing shared by the training and validation datasets.
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    # The validation set is given the training set's word/index mappings
    # instead of building its own.
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    # Incomplete final training batches are discarded.
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            last_batch="discard")
    # NOTE(review): the validation loader is shuffled as well - confirm this
    # is intended.
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True)

    num_words = dataset.words_count

    # set up logger
    # `logger` is the root logger, so the bare `logging.info(...)` calls
    # below go through the same handlers.
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words, test_max_len=val_dataset.max_len)
    if resume is not None:
        net.collect_params().load(resume,
                                  allow_missing=True,
                                  ignore_extra=True)
        logger.info("Resumed form checkpoint {}.".format(resume))
    # Initialise every parameter whose `_data` is still None, choosing the
    # initializer from substrings of the parameter name.
    # NOTE(review): `_data` is a private attribute; presumably it is None
    # only for parameters that were neither loaded nor initialised - confirm.
    params = net.collect_params()
    for key in params.keys():
        if params[key]._data is not None:
            continue
        else:
            if "bias" in key or "mean" in key or "beta" in key:
                params[key].initialize(init=mx.init.Zero())
                logging.info("initialized {} using Zero.".format(key))
            elif "weight" in key:
                params[key].initialize(init=mx.init.Normal())
                logging.info("initialized {} using Normal.".format(key))
            elif "var" in key or "gamma" in key:
                params[key].initialize(init=mx.init.One())
                logging.info("initialized {} using One.".format(key))
            else:
                # Any other parameter name falls back to Normal.
                params[key].initialize(init=mx.init.Normal())
                logging.info("initialized {} using Normal.".format(key))

    # Move the parameters onto the training devices.
    net.collect_params().reset_ctx(ctx=ctx_list)
    trainer = mx.gluon.Trainer(
        net.collect_params(),
        'adam',
        {
            'learning_rate': 4e-4,
            'clip_gradient': 5,
            'multi_precision': True
        },
    )
    if trainer_resume is not None:
        trainer.load_states(trainer_resume)
        logger.info(
            "Loaded trainer states form checkpoint {}.".format(trainer_resume))
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    # Both BLEU metrics decode with the training vocabulary; `batch_bleu` is
    # reset at every log interval, `epoch_bleu` only at the start of an epoch.
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    # Log basic dataset/vocabulary facts once before training starts.
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))
    # net.hybridize(static_alloc=True, static_shape=True)
    net_parallel = DataParallelModel(net, ctx_list=ctx_list, sync=True)
    for nepoch in range(start_epoch, epoches):
        # From epoch 16 onwards (nepoch > 15) the learning rate is 4e-5.
        if nepoch > 15:
            trainer.set_learning_rate(4e-5)
        logger.info("Current lr: {}".format(trainer.learning_rate))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            # Split every field of the batch across the devices, then
            # regroup so that inputs[n] holds all fields for device n.
            batch = [mx.gluon.utils.split_and_load(x, ctx_list) for x in batch]
            inputs = [[x[n] for x in batch] for n, _ in enumerate(ctx_list)]
            losses = []
            with ag.record():
                # `sync` is False for the first two batches (nbatch 0 and 1)
                # and True afterwards.
                # NOTE(review): the reason is not visible here - confirm
                # against DataParallelModel.
                net_parallel.sync = nbatch > 1
                outputs = net_parallel(*inputs)
                for s_batch, s_outputs in zip(inputs, outputs):
                    image, label, label_len = s_batch
                    predictions, alphas = s_outputs
                    ctc_loss = criterion(predictions, label, label_len)
                    # Penalise `alphas` whose sum along axis 1 deviates from 1.
                    loss2 = 1.0 * ((1. - alphas.sum(axis=1))**2).mean()
                    losses.extend([ctc_loss, loss2])
            ag.backward(losses)
            trainer.step(batch_size=batch_size, ignore_stale_grad=True)
            # The names used from here on (label_len, label, predictions,
            # ctc_loss, loss2, image) keep their values from the LAST
            # iteration of the per-device loop above, so the metrics only
            # cover the last device's shard of the batch.
            for n, l in enumerate(label_len):
                l = int(l.asscalar())
                # The label skips position 0 and predictions are cut to the
                # same length (l - 1).
                # presumably position 0 is a start token - TODO confirm
                la = label[n, 1:l]
                pred = predictions[n, :(l - 1)]
                accu_top3_metric.update(la, pred)
                accu_top1_metric.update(la, pred)
                # The BLEU metrics receive the full, untruncated prediction.
                epoch_bleu.update(la, predictions[n, :])
                batch_bleu.update(la, predictions[n, :])
            ctc_loss_metric.update(None,
                                   preds=nd.sum(ctc_loss) / image.shape[0])
            alpha_metric.update(None, preds=loss2)
            if nbatch % log_interval == 0 and nbatch > 0:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get()) for metric in [
                        epoch_bleu, batch_bleu, accu_top1_metric,
                        accu_top3_metric, ctc_loss_metric, alpha_metric
                    ]
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(nepoch, nbatch,
                           log_interval * batch_size / (time.time() - btic),
                           msg))
                # Restart the timer and the per-interval metrics;
                # epoch_bleu keeps accumulating.
                btic = time.time()
                batch_bleu.reset()
                accu_top1_metric.reset()
                accu_top3_metric.reset()
                ctc_loss_metric.reset()
                alpha_metric.reset()

        # End of epoch: validate, then save the weights and trainer states
        # with the validation scores embedded in the file name.
        bleu, acc_top1 = validate(net,
                                  gpu_id=gpu_id,
                                  val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        net.collect_params().save(save_path)
        trainer.save_states(fname=save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))
Exemple #10
0
def createModels(args, userNum, itemNum, adj):
    """Build the recommender model, its loss function and its optimizer.

    Args:
        args: Parsed options. Reads ``model``, ``train_mode``, ``parallel``,
            ``lr`` and ``weight_decay``, plus ``embedSize``, ``layers`` and
            ``droprate`` for the models that take them.
        userNum: Number of users, forwarded to the model constructor.
        itemNum: Number of items, forwarded to the model constructor.
        adj: Forwarded as the third positional argument to every model
            except ``'NCF'`` and ``'NMF'``, which do not receive it.

    Returns:
        A ``(model, lossfn, optim)`` tuple. The model has been moved to the
        GPU with ``.cuda()``. When ``args.parallel == True`` the model is
        wrapped in ``DataParallelModel`` and the loss in the matching
        criterion wrapper. ``optim`` is an ``Adam`` optimizer over the
        model's parameters.

    Raises:
        ValueError: If ``args.model`` or ``args.train_mode`` is not one of
            the supported names. Both are checked before anything is built,
            so a typo fails fast with the offending value in the message
            instead of surfacing later as an unbound-variable error.
    """
    train_mode = args.train_mode
    if train_mode not in ('PairSampling', 'NegSampling'):
        raise ValueError(
            "Unsupported train_mode {!r}; expected 'PairSampling' or "
            "'NegSampling'.".format(train_mode))

    # A single if/elif chain: only the selected class name is resolved, so
    # the set of names that must be importable is the same as before.
    name = args.model
    if name == 'NCF':
        model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8])
    elif name == 'NMF':
        model = NMF(name, userNum, itemNum, 3, args.embedSize,
                    args.droprate)
    # Graph models built without a dropout rate.
    elif name == 'NGCFMF':
        model = NGCFMF(userNum, itemNum, adj,
                       embedSize=args.embedSize, layers=args.layers)
    elif name == 'NGCFMLP':
        model = NGCFMLP(userNum, itemNum, adj,
                        embedSize=args.embedSize, layers=args.layers)
    elif name == 'NGCFMFMLP':
        model = NGCFMFMLP(userNum, itemNum, adj,
                          embedSize=args.embedSize, layers=args.layers)
    elif name == 'NGCFMF_concat_MF':
        model = NGCFMF_concat_MF(userNum, itemNum, adj,
                                 embedSize=args.embedSize,
                                 layers=args.layers)
    elif name == 'NGCFMF_concat_MLP':
        model = NGCFMF_concat_MLP(userNum, itemNum, adj,
                                  embedSize=args.embedSize,
                                  layers=args.layers)
    elif name == 'NGCFMLP_concat_MF':
        model = NGCFMLP_concat_MF(userNum, itemNum, adj,
                                  embedSize=args.embedSize,
                                  layers=args.layers)
    elif name == 'NGCFMLP_concat_MLP':
        model = NGCFMLP_concat_MLP(userNum, itemNum, adj,
                                   embedSize=args.embedSize,
                                   layers=args.layers)
    elif name == 'NGCFMF_concat_MF_MLP':
        model = NGCFMF_concat_MF_MLP(userNum, itemNum, adj,
                                     embedSize=args.embedSize,
                                     layers=args.layers)
    elif name == 'NGCFMLP_concat_MF_MLP':
        model = NGCFMLP_concat_MF_MLP(userNum, itemNum, adj,
                                      embedSize=args.embedSize,
                                      layers=args.layers)
    # Graph models that additionally take a dropout rate.
    elif name == 'GACFV1':
        model = GACFV1(userNum, itemNum, adj,
                       embedSize=args.embedSize, layers=args.layers,
                       droprate=args.droprate)
    elif name == 'GACFV2':
        model = GACFV2(userNum, itemNum, adj,
                       embedSize=args.embedSize, layers=args.layers,
                       droprate=args.droprate)
    elif name == 'GACFMask':
        model = GACFMask(userNum, itemNum, adj,
                         embedSize=args.embedSize, layers=args.layers,
                         droprate=args.droprate)
    elif name == 'SPGA':
        # The option is spelled 'SPGA' but the class is SPGACF.
        model = SPGACF(userNum, itemNum, adj,
                       embedSize=args.embedSize, layers=args.layers,
                       droprate=args.droprate)
    elif name == 'GACFV3':
        model = GACFV3(userNum, itemNum, adj,
                       embedSize=args.embedSize, layers=args.layers,
                       droprate=args.droprate)
    elif name == 'GACFV4':
        model = GACFV4(userNum, itemNum, adj,
                       embedSize=args.embedSize, layers=args.layers,
                       droprate=args.droprate)
    elif name == 'GACFV5':
        model = GACFV5(userNum, itemNum, adj,
                       embedSize=args.embedSize, layers=args.layers,
                       droprate=args.droprate)
    elif name == 'GACFV6':
        model = GACFV6(userNum, itemNum, adj,
                       embedSize=args.embedSize, layers=args.layers,
                       droprate=args.droprate)
    else:
        raise ValueError('Unsupported model {!r}.'.format(name))
    model = model.cuda()

    # Keep the original equality test so non-bool values of args.parallel
    # are treated exactly as before.
    parallel = args.parallel == True  # noqa: E712
    if train_mode == 'PairSampling':
        lossfn = BPRLoss()
        if parallel:
            model = DataParallelModel(model)
            lossfn = DataParallelCriterion2(lossfn)
    else:  # 'NegSampling' (validated above)
        lossfn = BCEWithLogitsLoss()
        if parallel:
            model = DataParallelModel(model)  # parallelise the model
            lossfn = DataParallelCriterion(lossfn)  # parallelise the loss

    optim = Adam(model.parameters(),
                 lr=args.lr,
                 weight_decay=args.weight_decay)
    return model, lossfn, optim
Exemple #11
0
def main():
    """Fine-tune and evaluate a ``BertForMultiTask`` model on the MEDSTS / MEDNLI tasks.

    Parses the command line, seeds the RNGs, builds the per-task processors,
    data loaders, model and ``BERTAdam`` optimizer, then (when
    ``args.do_train`` is set) trains for ``args.num_train_epochs`` epochs.
    After every epoch each task is evaluated with ``do_eval``, and the
    weights, config and vocabulary are saved whenever the target task's
    score improves. Per-epoch scores are written to ``eval_results.txt`` in
    ``args.output_dir``.

    Raises:
        ValueError: If neither ``do_train`` nor ``do_eval`` is set, if the
            output directory exists and is not empty, or if the first task
            name has no registered processor.

    NOTE(review): several names are bound only on some paths but used
    unconditionally (``total_tr``, ``probs``, ``eval_loaders``,
    ``task_names``); see the inline notes.
    """
    parser = setup_parser()
    args = parser.parse_args()
    logger.info('@@@@@ START @@@@@')
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    n_gpu = torch.cuda.device_count()
    logger.info('device %s n_gpu %d', device, n_gpu)
    # Seed every RNG in use.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    # From here on train_batch_size is the size of one forward/backward
    # pass; gradient_accumulation_steps such passes form one optimizer step.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    # Refuse to write into an existing, non-empty output directory.
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # NOTE(review): any other value of args.tasks leaves task_names and
    # data_dirs unbound.
    if args.tasks == 'all':
        task_names = ['medsts', 'mednli']
        data_dirs = ['MEDSTS', 'MEDNLI']
    elif args.tasks == 'single':
        # Keep only the target task, so both lists end up with one element.
        task_names = ['medsts', 'mednli']
        data_dirs = ['MEDSTS', 'MEDNLI']
        task_names = [task_names[int(args.target_task_id)]]
        data_dirs = [data_dirs[int(args.target_task_id)]]
    if args.k_fold:
        # Redirect the target task to its k-fold sub-directory.
        # NOTE(review): target_task_id is used as an index without int()
        # here, unlike above. In 'single' mode data_dirs has one element,
        # so an id of 1 would be out of range - confirm.
        target_data_dir = data_dirs[args.target_task_id]
        k_fold_data_dir = target_data_dir + '/k_fold_{}'.format(args.k)
        data_dirs[args.target_task_id] = k_fold_data_dir
    # if args.add_medsts_c:
    #     assert args.k_fold==True
    #     task_names.append('medsts_c')
    #     data_dirs.append('MEDSTS_c')
    #     k_fold_data_dir = data_dirs[-1] + '/k_fold_{}'.format(args.k)
    #     data_dirs[-1] = k_fold_data_dir

    # Only the first task name is checked against the processor registry.
    if task_names[0] not in processors:
        raise ValueError('Task not found: {}'.format(task_names[0]))

    processor_list = [processors[task_name]() for task_name in task_names]
    label_list = [processor.get_labels() for processor in processor_list]

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_tasks = len(task_names)

    if args.do_train:
        train_examples = [
            processor.get_train_examples(args.data_dir + data_dir)
            for processor, data_dir in zip(processor_list, data_dirs)
        ]
        # The step count is derived from the first task's examples only.
        num_train_steps = int(
            len(train_examples[0]) / args.train_batch_size *
            args.num_train_epochs)
        if args.tasks == 'all':
            total_tr = args.tr_factor * num_tasks * int(args.num_train_epochs)
        else:
            total_tr = int(0.5 * num_train_steps)

    if args.tasks == 'all':
        steps_per_epoch = args.gradient_accumulation_steps * args.tr_factor * num_tasks
    else:
        # NOTE(review): num_train_steps is still None when do_train is off,
        # so this branch would fail for an eval-only run.
        steps_per_epoch = int(num_train_steps / (2. * args.num_train_epochs))
    bert_config.num_tasks = num_tasks
    bert_config.hidden_size_aug = int(args.h_aug)

    # One output size per task, taken from the length of its label list.
    model = BertForMultiTask(bert_config,
                             [len(labels) for labels in label_list])

    if args.init_checkpoint is not None:
        if args.multi:
            load_checkpoint_mult(args.init_checkpoint, model, args.same,
                                 args.tasks)
        else:
            model.bert.load_state_dict(
                torch.load(args.init_checkpoint, map_location='cpu'))

    if args.freeze:
        # Freeze every encoder parameter except those whose name contains
        # one of the listed substrings.
        for n, p in model.bert.named_parameters():
            if 'aug' in n or 'classifier' in n or 'mult' in n or 'gamma' in n or 'beta' in n:
                continue
            p.requires_grad = False

    model.to(device)
    if n_gpu > 1:
        model = DataParallelModel(model)

    group_size = 2

    optimizer_parameters = get_param_groups(model, args, group_size)
    # NOTE(review): total_tr is assigned only inside `if args.do_train`
    # above, so an eval-only run reaches this call with it unbound.
    optimizer = BERTAdam(optimizer_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=total_tr)

    if args.do_eval:
        # Build one sequential dev loader per task, plus the raw ids and
        # texts handed to do_eval as `error_analysis_dicts`.
        eval_loaders = []
        error_analysis_dicts = []
        for i, task in enumerate(task_names):
            eval_examples = processor_list[i].get_dev_examples(args.data_dir +
                                                               data_dirs[i])
            eval_features = convert_examples_to_features(
                eval_examples, label_list[i], args.max_seq_length, tokenizer,
                output_modes[task])
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_pids = [int(f.pid) for f in eval_examples]
            all_text_a = [f.text_a for f in eval_examples]
            all_text_b = [f.text_b for f in eval_examples]
            error_data = {
                'pids': all_pids,
                'text_a': all_text_a,
                'text_b': all_text_b
            }

            # Classification labels are integer ids; otherwise labels are
            # stored as float32.
            if output_modes[task] == 'classification':
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.float32)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            eval_sampler = SequentialSampler(eval_data)
            # NOTE(review): drop_last=True skips the final partial dev
            # batch, while error_data still lists every example.
            eval_loaders.append(
                DataLoader(eval_data,
                           sampler=eval_sampler,
                           batch_size=args.eval_batch_size,
                           drop_last=True))
            error_analysis_dicts.append(error_data)

    global_step = 0
    if args.do_train:
        # One randomly sampled training loader per task.
        loaders = []
        logger.info(' Num tasks = {}'.format(len(train_examples)))
        for i, task in enumerate(task_names):
            train_features = convert_examples_to_features(
                train_examples[i], label_list[i], args.max_seq_length,
                tokenizer, output_modes[task])
            logger.info('********* Training data for {}'.format(task))
            logger.info('   Data size = {}'.format(len(train_features)))

            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            if output_modes[task] == 'classification':
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.long)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float32)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids)
            train_sampler = RandomSampler(train_data)
            loaders.append(
                iter(
                    DataLoader(train_data,
                               sampler=train_sampler,
                               batch_size=args.train_batch_size,
                               drop_last=True)))
        total_params = sum(p.numel() for p in model.parameters())
        logger.info(' Num param = {}'.format(total_params))
        # Make every loader endless so next() never raises StopIteration.
        # NOTE(review): if `cycle` is itertools.cycle, later passes replay
        # the first pass's batches instead of reshuffling - confirm.
        loaders = [cycle(it) for it in loaders]

        model.train()
        best_target_score = 0.
        task_id = 0
        all_ev_acc = []

        for epoch in trange(int(args.num_train_epochs), desc='Epoch'):
            # NOTE(review): probs is assigned only when
            # args.sample == 'anneal', but is read unconditionally in the
            # step loop below.
            if args.sample == 'anneal':
                probs = [len(dataset) for dataset in train_examples]
                probs = anneal(probs,
                               epoch,
                               args.num_train_epochs,
                               anneal_factor=0.8,
                               target_task_id=0,
                               weight=5)

            tr_loss = [0. for i in range(num_tasks)]
            nb_tr_examples, nb_tr_steps = 0, 0

            ## DEBUG
            # steps_per_epoch = 5

            for step in trange(steps_per_epoch, desc='Steps'):
                # A new task is drawn at the start of each accumulation
                # window, so all passes of one optimizer step use one task.
                if step % args.gradient_accumulation_steps == 0:
                    task_id = np.random.choice(len(probs), p=probs)
                    output_mode = output_modes[task_names[task_id]]
                batch = next(loaders[task_id])
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids, segment_ids, input_mask, task_id,
                               output_mode)

                # NOTE(review): the criterion is wrapped in
                # DataParallelCriterion and logits is indexed as a sequence
                # even when n_gpu <= 1, where the model is not wrapped in
                # DataParallelModel - confirm the single-device path.
                if output_mode == 'classification':
                    loss_fct = CrossEntropyLoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [
                        logits[i].view(-1, logits[0].size(-1))
                        for i in range(len(logits))
                    ]
                    loss = loss_fct(logits, label_ids.view(-1))
                else:
                    loss_fct = MSELoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()
                # Scale the loss by the number of accumulated passes.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss[task_id] += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # The optimizer steps once per accumulation window.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    model.zero_grad()
                    global_step += 1

                #this is where you'd calculate training acc

            # Evaluate every task at the end of the epoch.
            # NOTE(review): eval_loaders and error_analysis_dicts exist only
            # when args.do_eval is set. Whether do_eval() puts the model
            # back into training mode is not visible here.
            ev_acc = []
            for i, task in enumerate(task_names):
                acc = do_eval(model, logger, args.output_dir, device,
                              tr_loss[i], nb_tr_steps, global_step,
                              processor_list[i], label_list[i], tokenizer,
                              eval_loaders[i], error_analysis_dicts[i],
                              output_modes[task], i, task)
                ev_acc.append(acc)
            all_ev_acc.append(ev_acc)
            # logger.info('Average acc: {}'.format(np.mean(ev_acc)))
            # Save only when the target task's score improves.
            if ev_acc[args.target_task_id] > best_target_score:
                best_target_score = ev_acc[args.target_task_id]
                # Save the wrapped module when the model has a `.module`.
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                torch.save(model_to_save.state_dict(), output_model_file)
                bert_config.to_json_file(output_config_file)
                tokenizer.save_vocabulary(args.output_dir)

                ##TODO: this is where you should add error analysis to get best version

            logger.info('Best target acc: {}'.format(best_target_score))

        # Write one line of per-task scores for every epoch.
        output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
        with open(output_eval_file, 'w') as writer:
            logger.info('******** Eval Results ********')
            for n, acc in enumerate(all_ev_acc):
                logger.info('   {} = {}\n'.format(n, acc))
                writer.write('{} \t {}\n'.format(n, acc))
# Module-level setup for the segmentation run: hyper-parameters, model,
# device placement, optimizer and loss.

# Hyper-parameters. lr_rate, milestones and gamma are not referenced in the
# lines visible here; the optimizer below hard-codes its own learning rate.
lr_rate = 0.03
milestones = [5, 7, 8, 10, 12, 14, 16, 17, 18]
img_size = 384  # side length of the square that input images are resized to
gamma = 0.5

#use_cuda = torch.cuda.is_available()
# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#use_cuda = False

segm_model = ResNetLinkModel(input_channels=1, pretrained=True, num_classes=3)

# The model is wrapped for multi-GPU execution only when more than one CUDA
# device is visible.
if torch.cuda.device_count() > 1:
    #dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    #segm_model = nn.DataParallel(segm_model)
    #segm_model = encoding.parallel.DataParallelModel(segm_model, device_ids=[0,1,2,3,4,5,6,7])
    segm_model = DataParallelModel(segm_model)
# This print is outside the `if`, so it runs whatever the GPU count is.
print("Let's use", torch.cuda.device_count(), "GPUs!")
segm_model.to(device)
# The triple-quoted string below is an unused expression statement holding
# disabled code; it has no runtime effect.
'''if use_cuda:
    segm_model.cuda()
seg_model = nn.DataParallel(seg_model)'''

# List of image transforms: resize to img_size x img_size, then to tensor.
mul_transf = [
    transforms.Resize(size=(img_size, img_size)),
    transforms.ToTensor()
]
#optimizer = optim.SGD(segm_model.parameters(), lr=lr_rate, momentum=momentum)
optimizer = optim.Adam(segm_model.parameters(), lr=0.0001)
#criterion = nn.BCEWithLogitsLoss().cuda() if use_cuda else nn.BCEWithLogitsLoss()
criterion = nn.BCEWithLogitsLoss()
# NOTE(review): the criterion is wrapped unconditionally, while the model is
# wrapped only when more than one GPU is present - confirm the single-device
# path works with DataParallelCriterion.
criterion = DataParallelCriterion(criterion)
Exemple #13
0
def main_tr(args, crossVal):
    dataLoad = ld.LoadData(args.data_dir, args.classes)
    data = dataLoad.processData(crossVal, args.data_name)

    # load the model
    model = net.MiniSeg(args.classes, aux=True)
    if not osp.isdir(osp.join(args.savedir + '_mod' + str(args.max_epochs))):
        os.mkdir(args.savedir + '_mod' + str(args.max_epochs))
    if not osp.isdir(
            osp.join(args.savedir + '_mod' + str(args.max_epochs),
                     args.data_name)):
        os.mkdir(
            osp.join(args.savedir + '_mod' + str(args.max_epochs),
                     args.data_name))
    saveDir = args.savedir + '_mod' + str(
        args.max_epochs) + '/' + args.data_name + '/' + args.model_name
    # create the directory if not exist
    if not osp.exists(saveDir):
        os.mkdir(saveDir)

    if args.gpu and torch.cuda.device_count() > 1:
        #model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)
    if args.gpu:
        model = model.cuda()

    total_paramters = sum([np.prod(p.size()) for p in model.parameters()])
    print('Total network parameters: ' + str(total_paramters))

    # define optimization criteria
    weight = torch.from_numpy(
        data['classWeights'])  # convert the numpy array to torch
    if args.gpu:
        weight = weight.cuda()

    criteria = CrossEntropyLoss2d(weight, args.ignore_label)  #weight
    if args.gpu and torch.cuda.device_count() > 1:
        criteria = DataParallelCriterion(criteria)
    if args.gpu:
        criteria = criteria.cuda()

    # compose the data with transforms
    trainDataset_main = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(args.width, args.height),
        myTransforms.RandomCropResize(int(32. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])
    trainDataset_scale1 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 1.5), int(args.height * 1.5)),
        myTransforms.RandomCropResize(int(100. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])

    trainDataset_scale2 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 1.25), int(args.height * 1.25)),
        myTransforms.RandomCropResize(int(100. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])
    trainDataset_scale3 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 0.75), int(args.height * 0.75)),
        myTransforms.RandomCropResize(int(32. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])

    valDataset = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(args.width, args.height),
        myTransforms.ToTensor()
    ])

    # since we training from scratch, we create data loaders at different scales
    # so that we can generate more augmented data and prevent the network from overfitting
    trainLoader = torch.utils.data.DataLoader(myDataLoader.Dataset(
        data['trainIm'], data['trainAnnot'], transform=trainDataset_main),
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers,
                                              pin_memory=True,
                                              drop_last=True)

    trainLoader_scale1 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'],
                             data['trainAnnot'],
                             transform=trainDataset_scale1),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True)

    trainLoader_scale2 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'],
                             data['trainAnnot'],
                             transform=trainDataset_scale2),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True)
    trainLoader_scale3 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'],
                             data['trainAnnot'],
                             transform=trainDataset_scale3),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True)

    valLoader = torch.utils.data.DataLoader(myDataLoader.Dataset(
        data['valIm'], data['valAnnot'], transform=valDataset),
                                            batch_size=args.batch_size,
                                            shuffle=False,
                                            num_workers=args.num_workers,
                                            pin_memory=True)
    max_batches = len(trainLoader) + len(trainLoader_scale1) + len(
        trainLoader_scale2) + len(trainLoader_scale3)

    if args.gpu:
        cudnn.benchmark = True

    start_epoch = 0

    if args.pretrained is not None:
        state_dict = torch.load(args.pretrained)
        new_keys = []
        new_values = []
        for idx, key in enumerate(state_dict.keys()):
            if 'pred' not in key:
                new_keys.append(key)
                new_values.append(list(state_dict.values())[idx])
        new_dict = OrderedDict(list(zip(new_keys, new_values)))
        model.load_state_dict(new_dict, strict=False)
        print('pretrained model loaded')

    if args.resume is not None:
        if osp.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            args.lr = checkpoint['lr']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    log_file = osp.join(saveDir, 'trainValLog_' + args.model_name + '.txt')
    if osp.isfile(log_file):
        logger = open(log_file, 'a')
    else:
        logger = open(log_file, 'w')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write("\n%s\t%s\t\t%s\t%s\t%s\t%s\tlr" %
                     ('CrossVal', 'Epoch', 'Loss(Tr)', 'Loss(val)',
                      'mIOU (tr)', 'mIOU (val)'))
    logger.flush()

    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr, (0.9, 0.999),
                                 eps=1e-08,
                                 weight_decay=1e-4)
    maxmIOU = 0
    maxEpoch = 0
    print(args.model_name + '-CrossVal: ' + str(crossVal + 1))
    for epoch in range(start_epoch, args.max_epochs):
        # train for one epoch
        cur_iter = 0

        train(args, trainLoader_scale1, model, criteria, optimizer, epoch,
              max_batches, cur_iter)
        cur_iter += len(trainLoader_scale1)
        train(args, trainLoader_scale2, model, criteria, optimizer, epoch,
              max_batches, cur_iter)
        cur_iter += len(trainLoader_scale2)
        train(args, trainLoader_scale3, model, criteria, optimizer, epoch,
              max_batches, cur_iter)
        cur_iter += len(trainLoader_scale3)
        lossTr, overall_acc_tr, per_class_acc_tr, per_class_iu_tr, mIOU_tr, lr = \
                train(args, trainLoader, model, criteria, optimizer, epoch, max_batches, cur_iter)

        # evaluate on validation set
        lossVal, overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = \
                val(args, valLoader, model, criteria)

        torch.save(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lossTr': lossTr,
                'lossVal': lossVal,
                'iouTr': mIOU_tr,
                'iouVal': mIOU_val,
                'lr': lr
            },
            osp.join(
                saveDir, 'checkpoint_' + args.model_name + '_crossVal' +
                str(crossVal + 1) + '.pth.tar'))

        # save the model also
        model_file_name = osp.join(
            saveDir, 'model_' + args.model_name + '_crossVal' +
            str(crossVal + 1) + '_' + str(epoch + 1) + '.pth')
        torch.save(model.state_dict(), model_file_name)

        logger.write(
            "\n%d\t\t%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.7f" %
            (crossVal + 1, epoch + 1, lossTr, lossVal, mIOU_tr, mIOU_val, lr))
        logger.flush()
        print("\nEpoch No. %d:\tTrain Loss = %.4f\tVal Loss = %.4f\t mIOU(tr) = %.4f\t mIOU(val) = %.4f\n" \
                % (epoch + 1, lossTr, lossVal, mIOU_tr, mIOU_val))

        if mIOU_val >= maxmIOU:
            maxmIOU = mIOU_val
            maxEpoch = epoch + 1
        torch.cuda.empty_cache()
    logger.flush()
    logger.close()
    return maxEpoch, maxmIOU
Exemple #14
0
def train(args, train_dataset, model, tokenizer):
    """Train ``model`` on ``train_dataset`` and return the step count and mean loss.

    The function builds a randomly-sampled ``DataLoader``, an ``RAdam``
    optimizer with a ``WarmupLinearSchedule``, optionally enables apex fp16,
    wraps the model in ``DataParallelModel`` when more than one GPU is
    configured, and then runs the epoch/batch loop with gradient accumulation,
    gradient clipping, TensorBoard logging and periodic checkpointing.

    Several attributes of ``args`` are overwritten as a side effect:
    ``train_batch_size``, ``warmup_steps``, ``logging_steps``, ``save_steps``,
    ``current_epoch`` and, when ``max_steps > 0``, ``num_train_epochs``.

    Args:
        args: Run configuration namespace (batch sizes, learning rate, fp16
            flags, output directory, ``local_rank``, ``n_gpu``, ...).
        train_dataset: Dataset whose batches unpack to
            ``(input_ids, attention_mask, token_type_ids, labels)``.
        model: Model to train; called with keyword inputs only (no labels).
        tokenizer: Forwarded to ``evaluate`` when evaluating during training.

    Returns:
        Tuple ``(global_step, mean_loss)`` where ``mean_loss`` is the summed
        per-batch loss divided by the number of optimizer steps, or ``0.0``
        when no optimizer step was performed.

    Raises:
        ImportError: If ``args.fp16`` is set and apex is not installed.
    """
    # Only the main process owns a TensorBoard writer; every later use is
    # guarded so that other ranks do not hit an undefined name.
    tb_writer = SummaryWriter() if args.local_rank in [-1, 0] else None

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    # Warm up over the first 1% of the optimization steps.
    args.warmup_steps = t_total // 100

    # Prepare optimizer and schedule (linear warmup and decay)
    optimizer_grouped_parameters = get_param_groups(args, model)
    optimizer = RAdam(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = DataParallelModel(model)

    # The criterion does not change between batches, so build it once instead
    # of re-creating (and re-wrapping) it on every step.
    loss_fct = DataParallelCriterion(CrossEntropyLoss())

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    # Log and checkpoint once every len(train_dataloader) optimizer steps.
    args.logging_steps = len(train_dataloader)
    args.save_steps = args.logging_steps
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    for epoch in train_iterator:
        args.current_epoch = epoch
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            # Labels (batch[3]) are deliberately not passed to the model; the
            # loss is computed below by the parallel criterion.
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                # XLM and RoBERTa don't use segment_ids
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else None,
            }
            outputs = model(**inputs)
            # Keep only the first element of each per-replica output.
            outputs = [replica_out[0] for replica_out in outputs]

            loss = loss_fct(outputs, batch[3])

            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics (main process only: other ranks have no writer)
                    if tb_writer is not None:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        if args.local_rank == -1 and args.evaluate_during_training:
                            results = evaluate(args, model, tokenizer)
                            for key, value in results.items():
                                tb_writer.add_scalar('eval_{}'.format(key),
                                                     value, global_step)
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss',
                                             (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training: save the
                    # wrapped module when the model exposes one.
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if tb_writer is not None:
        tb_writer.close()

    # No optimizer step may have happened (e.g. an empty loader); avoid a
    # ZeroDivisionError and report a zero mean loss in that case.
    mean_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, mean_loss
        # target dimension[0] / 2
        # tar = target.contiguous().view(-1)
        # out = output.contiguous().view(target.size(0),-1)

        target = tar.contiguous().view(-1)
        output = out[:tar.size(0)]
        normalize = output.size(0) * output.size(1)
        output = output.contiguous().view(target.size(0), -1)
        loss = self.NLL(output, target) / normalize

        return loss


# Training-only setup: nothing below is created when `eval_model` is truthy.
if not eval_model:
    # PAD is handed to the loss as `ignore_index`.
    criterion = NLLLoss(ignore_index=PAD)
    parallel_model = DataParallelModel(model)  # Encapsulate the model
    # NOTE(review): presumably the criterion wrapper pairs with
    # DataParallelModel so the loss is evaluated per replica -- confirm
    # against the parallel helper module.
    parallel_loss = DataParallelCriterion(criterion)

# In[5]:

# ---------------------------

# def merge_res(res):
#     ((inds1, log_probs1, enc_out1),(inds2, log_probs2, enc_out2)) = res
#     inds = T.cat([inds1, inds2], dim = 0).cpu()
#     enc_out = T.cat([enc_out1, enc_out2], dim = 0).cpu()
#     if type(log_probs1) != list:
#         log_probs = T.cat([log_probs1, log_probs2], dim = 0)
#         return inds, log_probs, enc_out
#     else:
#         return inds, _, enc_out
Exemple #16
0
def main(args):
    """Load the best checkpoint and decode full stories for the val/test split.

    Builds the GPT-2 tokenizer with the project's special tokens, loads the
    encoded validation (or test, when ``args.testset`` is set) data, creates
    the document model selected by ``args.use_model``, restores its weights
    from ``<args.load_dir>/checkpoint_best.pt`` and hands everything to
    ``generatedocs``, which writes ``<val|test>.gens.tsv`` under
    ``args.save_dir``.

    Args:
        args: Parsed command-line namespace (decoding options, data/load/save
            directories, model selection flags).
    """
    init(args)

    # Args setup
    beam = args.beam
    p = args.p
    n_ctx = args.n_ctx
    gen_len = args.gen_len
    k = args.k
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    data_dir = args.data_dir

    # Debug mode uses the small GPT-2 everywhere; otherwise GPT-2 medium.
    gpt2_name = 'gpt2' if args.debug_mode else 'gpt2-medium'

    # Text Encoder
    text_encoder = GPT2Tokenizer.from_pretrained(gpt2_name)
    text_encoder.add_special_tokens({
        'bos_token':
        '_start_',
        'cls_token':
        '_classify_',
        'eos_token':
        '_end_',
        'additional_special_tokens':
        ['_kw_', '_endkw_', '_t_', '_i_', '_b_', '_c_']
    })

    vocab = len(text_encoder)

    split_file = "test_encoded.jsonl" if args.testset else "val_encoded.jsonl"
    datafile = os.path.join(data_dir, split_file)
    print("Loading dataset...")
    val_loader = get_fullstory_loader(datafile,
                                      args.n_batch,
                                      text_encoder,
                                      num_workers=0,
                                      shuffle=False,
                                      gen_len=gen_len,
                                      n_ctx=n_ctx,
                                      include_kw=not args.exclude_kw,
                                      max_size=args.max_ex)
    print(len(val_loader))

    # Both model classes take the same constructor arguments.
    model_cls = (PlotMachinesModel
                 if args.use_model == "plotmachines" else GPT2BaseModel)
    doc_model = model_cls(args,
                          vocab=vocab,
                          n_ctx=n_ctx,
                          gen_len=gen_len,
                          lastidx=text_encoder.eos_token_id,
                          includeprev=args.use_neighbor_feat)

    doc_model.to(device)
    is_wrapped = n_gpu > 1
    if is_wrapped:
        doc_model = DataParallelModel(doc_model)

    # Auxiliary GPT-2 model and tokenizer passed on to generatedocs.
    gptclf = GPT2Model.from_pretrained(gpt2_name)
    gptclf.eval()
    # From here on the device is passed around as a plain string.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    gptclf.to(device)
    gpttok = GPT2Tokenizer.from_pretrained(gpt2_name)

    load_dir = args.load_dir
    bestcheck = os.path.join(load_dir, "checkpoint_best.pt")
    checkpoint = torch.load(bestcheck, map_location='cpu')
    state_dict = checkpoint["state_dict"]

    # The checkpoint is expected to carry 'module.'-prefixed keys. If it has
    # no position-embedding mask, borrow the freshly built model's own one.
    own_state = doc_model.state_dict()
    own_mask_key = 'module.pos_emb_mask' if is_wrapped else 'pos_emb_mask'
    if (state_dict.get('module.pos_emb_mask') is None
            and own_state.get(own_mask_key) is not None):
        state_dict['module.pos_emb_mask'] = own_state.get(own_mask_key)

    if not is_wrapped:
        # The model is not wrapped (single GPU or CPU), so drop the
        # 'module.' prefix. The loop variable must not be called `k`: that
        # name holds the top-k decoding value passed to generatedocs below.
        prefix = 'module.'
        for key in list(state_dict.keys()):
            if key.startswith(prefix):
                state_dict[key[len(prefix):]] = state_dict.pop(key)
    doc_model.load_state_dict(state_dict)

    print("Parallelized")
    tagset = ['_i_'] + args.bodynum * ['_b_'] + ['_c_']
    vort = 'test' if args.testset else 'val'
    generatedocs(doc_model,
                 gptclf,
                 gpttok,
                 val_loader,
                 text_encoder,
                 device,
                 beam,
                 gen_len,
                 k,
                 p,
                 args.decoding_strategy,
                 os.path.join(args.save_dir, vort + '.gens.tsv'),
                 'gen',
                 'tgt',
                 gen_len, [],
                 args,
                 tags=tagset,
                 dim=args.n_embd,
                 save_dir=args.save_dir,
                 localfile=os.path.join('/tmp', vort + '.gens.tsv'))

    print('done decoding....')
def main(args):
    """Generate text with the language model and print it next to the references.

    Builds the BPE text encoder, extends its vocabulary with the three special
    tokens, constructs an ``LMModel`` initialised from the OpenAI pretrained
    weights (optionally overridden by ``args.checkpoint``), and decodes the
    encoded validation set, printing source / hypothesis / reference triples.

    Args:
        args: Parsed command-line namespace (encoder paths, data directory,
            checkpoint path or ``"none"``, decoding options).
    """
    init(args)

    # Constants
    n_ctx = args.n_ctx
    data_dir = args.data_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    # Append each special token at the next free id, keeping the encoder and
    # decoder maps in sync.
    special_tokens = ('_start_', '_delimiter_', '_classify_')
    for token in special_tokens:
        text_encoder.decoder[len(encoder)] = token
        encoder[token] = len(encoder)

    n_special = len(special_tokens)  # XD: useless for language modeling task
    vocab = n_vocab + n_special + n_ctx

    lm_model = LMModel(args,
                       vocab,
                       n_ctx,
                       return_probs=True,
                       doc_embed=args.doc_model)
    load_openai_pretrained_model(lm_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        # Drop the 'module.' prefix, leaving un-prefixed keys untouched
        # rather than chopping seven characters off every key.
        prefix = 'module.'
        for key in list(state_dict.keys()):
            if key.startswith(prefix):
                state_dict[key[len(prefix):]] = state_dict.pop(key)
        # Mask the trailing n_ctx vocabulary slots, which `vocab` reserves
        # for positions, so they cannot be generated. The slice `-n_ctx:` is
        # required: a bare `-n_ctx` index would mask only one slot.
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        lm_model.load_state_dict(state_dict)
    lm_model.to(device)
    lm_model = DataParallelModel(lm_model)

    # The batch size argument is the number of visible GPUs.
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"),
                            n_gpu,
                            encoder,
                            num_workers=1,
                            shuffle=True,
                            max_size=args.n_iter)
    srcs, hyps, refs = [], [], []
    with torch.no_grad():
        lm_model.eval()
        for pad_output, mask_output in tqdm(val_loader):
            src_strs, tgt_strs, gen_strs = generate_outputs(
                lm_model, pad_output, mask_output, text_encoder, device,
                args.beam, args.gen_len, args.k, args.decoding_strategy)
            srcs.extend(src_strs)
            hyps.extend(gen_strs)
            refs.extend(tgt_strs)

    for src, hyp, ref in zip(srcs, hyps, refs):
        print("*" * 50)
        print("Source: {}".format(src))
        print('Hypothesis: {}'.format(hyp))
        print("Reference: {}".format(ref))
    def __init__(self,
                 model,
                 mask_prob: float = 0.3,
                 clip: int = 1,
                 optimizer=None,
                 beam_width: int = 5,
                 max_len_a: float = 1.1,
                 max_len_b: int = 5,
                 len_penalty_ratio: float = 0.8,
                 nll_loss: bool = False,
                 fp16: bool = False,
                 mm_mode="mixed",
                 rank: int = -1):
        """Set up the model, loss, beam decoder and their parallel wrappers.

        Args:
            model: Model to train; must expose ``text_processor.pad_token_id()``.
            mask_prob: Stored as ``self.mask_prob``.
            clip: Stored as ``self.clip``.
            optimizer: Optimizer to use; replaced by the apex-wrapped optimizer
                when ``fp16`` is set.
            beam_width: Forwarded to ``BeamDecoder``.
            max_len_a: Forwarded to ``BeamDecoder``.
            max_len_b: Forwarded to ``BeamDecoder``.
            len_penalty_ratio: Forwarded to ``BeamDecoder``.
            nll_loss: Use ``nn.NLLLoss`` when true, ``SmoothedNLLLoss`` otherwise.
            fp16: Initialise model and optimizer through apex ``amp`` at
                opt level ``O2``.
            mm_mode: Stored as ``self.mm_mode``.
            rank: When ``>= 0``, this CUDA device is used and the model and
                generator are wrapped in ``DistributedDataParallel``; otherwise
                they are wrapped for data parallelism if several GPUs exist.
        """
        distributed = rank >= 0

        # Plain attribute bookkeeping.
        self.model = model
        self.optimizer = optimizer
        self.clip = clip
        self.mask_prob = mask_prob
        self.rank = rank
        self.fp16 = False

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.num_gpu = torch.cuda.device_count()

        # Padding positions are excluded from the loss in either variant.
        pad_id = model.text_processor.pad_token_id()
        if nll_loss:
            self.criterion = nn.NLLLoss(ignore_index=pad_id)
        else:
            self.criterion = SmoothedNLLLoss(ignore_index=pad_id)

        if distributed:
            # Pin this process to the GPU that matches its rank.
            self.device = torch.device('cuda', rank)
            torch.cuda.set_device(self.device)

        self.model = self.model.to(self.device)

        if fp16:
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level="O2")
            self.fp16 = True

        # The decoder is built around the (possibly amp-initialised) model
        # before any parallel wrapper is applied to it.
        self.generator = BeamDecoder(self.model,
                                     beam_width=beam_width,
                                     max_len_a=max_len_a,
                                     max_len_b=max_len_b,
                                     len_penalty_ratio=len_penalty_ratio)

        def _wrap_distributed(module):
            # Same DDP settings for the model and the generator.
            return DistributedDataParallel(module,
                                           device_ids=[self.rank],
                                           output_device=self.rank,
                                           find_unused_parameters=True)

        if distributed:
            self.model = _wrap_distributed(self.model)
            self.generator = _wrap_distributed(self.generator)
        elif self.num_gpu > 1:
            print("Let's use", self.num_gpu, "GPUs!")
            self.model = DataParallelModel(self.model)
            self.criterion = DataParallelCriterion(self.criterion)
            self.generator = DataParallelModel(self.generator)

        self.reference = None
        self.best_bleu = -1.0
        self.mm_mode = mm_mode
Exemple #19
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--discriminative_finetuning',
                        action='store_true',
                        help='Whether to use discriminative fine-tuning')

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    #############################################################################
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        if args.discriminative_finetuning:
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1', 'layer.2', 'layer.3', 'layer.4', 'layer.5', \
            'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.01, 'lr': args.learning_rate},

                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch

                logits = model(input_ids, segment_ids, input_mask)
                loss_fct = CrossEntropyLoss(ignore_index=-1)
                loss_fct = DataParallelCriterion(loss_fct)
                logits = [
                    logits[i].view(-1, model.module.config.vocab_size)
                    for i in range(len(logits))
                ]
                loss = loss_fct(logits, lm_label_ids.view(-1))

                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)
Exemple #20
0
def main(args):
    """Fine-tune a paragraph generation model and evaluate its best checkpoint.

    Steps, in order:

    1. Create the checkpoint / log directories under
       ``<args.output_dir>/<args.experiment_name>/``.
    2. Load a GPT-2 tokenizer and register the special tokens used by the
       data format.
    3. Build the train / validation loaders and the model selected by
       ``args.use_model`` (``"base"`` or ``"plotmachines"``).
    4. Call ``run_epoch`` once per epoch, stopping early as soon as the
       validation loss rises above the previous epoch's value or becomes NaN.
    5. Reload ``checkpoint_best.pt`` and run ``evaluate_doc_model`` on the
       validation loader.

    Args:
        args: Parsed command-line namespace; every hyper-parameter and path
            used below is read from it.

    Raises:
        ValueError: If ``args.use_model`` is not a supported model name.
    """
    init(args)

    # Output locations. ``save_dir_local`` is a relative directory that is
    # only forwarded to ``run_epoch`` (as ``my_local_dir``).
    save_dir = os.path.join(args.output_dir, args.experiment_name,
                            "checkpoints")
    save_dir_local = "checkpoints_local"
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    for directory in (log_dir, save_dir, save_dir_local):
        os.makedirs(directory, exist_ok=True)

    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    p = args.p
    n_ctx = args.n_ctx
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    # Text encoder: local copy, small model (debug) or gpt2-medium.
    if args.use_offline_gpt2:
        tokenizer_source = './gpt2model'
    elif args.debug_mode:
        tokenizer_source = 'gpt2'
    else:
        tokenizer_source = 'gpt2-medium'
    text_encoder = GPT2Tokenizer.from_pretrained(tokenizer_source)

    text_encoder.add_special_tokens({
        'bos_token': '_start_',
        'cls_token': '_classify_',
        'eos_token': '_end_',
        'additional_special_tokens':
        ['_kw_', '_endkw_', '_t_', '_i_', '_b_', '_c_']
    })

    # Vocabulary size *after* the special tokens were added.
    vocab = len(text_encoder)

    print("Loading dataset...")
    # Both model variants share the loader / model call shape; only the
    # factory functions and two memory-related loader options differ.
    if args.use_model == "base":
        make_loader = get_paragraph_input_loader
        model_cls = GPT2BaseModel
        extra_loader_kwargs = {}
    elif args.use_model == "plotmachines":
        make_loader = get_paragraph_memory_input_loader
        model_cls = PlotMachinesModel
        extra_loader_kwargs = {
            "memsize": args.memstatesize,
            "use_kwmem": True,
        }
    else:
        # Previously an unknown name fell through both branches and surfaced
        # much later as a NameError on ``train_loader`` / ``doc_model``.
        raise ValueError(
            "Unsupported use_model {!r}; expected 'base' or 'plotmachines'".
            format(args.use_model))

    shared_loader_kwargs = dict(gen_len=gen_len,
                                n_ctx=n_ctx,
                                include_discourse_type=args.use_discourse,
                                include_neigh=args.use_neighbor_feat,
                                include_kw=not args.exclude_kw,
                                dim=args.n_embd,
                                debug_mode=args.debug_mode,
                                **extra_loader_kwargs)

    train_loader = make_loader(os.path.join(data_dir, "train_encoded.jsonl"),
                               args.n_batch,
                               text_encoder,
                               num_workers=3,
                               shuffle=True,
                               max_size=args.max_ex,
                               **shared_loader_kwargs)

    # The validation batch size follows the GPU count. On a CPU-only host
    # ``device_count()`` is 0, which is not a usable batch size, so fall back
    # to 1 there (no change whenever at least one GPU is present).
    val_batch_size = max(1, n_gpu)
    val_loader = make_loader(os.path.join(data_dir, "val_encoded.jsonl"),
                             val_batch_size,
                             text_encoder,
                             num_workers=0,
                             shuffle=False,
                             max_size=args.num_val_examples,
                             **shared_loader_kwargs)

    print("Train length: {}, Validation length: {}".format(
        len(train_loader), len(val_loader)))
    doc_model = model_cls(args,
                          vocab=vocab,
                          n_ctx=n_ctx,
                          gen_len=gen_len,
                          lastidx=text_encoder.eos_token_id,
                          includeprev=args.use_neighbor_feat,
                          use_offline_gpt2=args.use_offline_gpt2)

    if args.debug_mode:
        print_model_params(log_dir, doc_model)

    # Per-element losses (no reduction); ParagraphLoss receives the criterion
    # and is what the training loop actually calls.
    criterion = nn.CrossEntropyLoss(reduction="none")

    # Only parameters that still require gradients are optimised.
    model_opt = AdamW(filter(lambda param: param.requires_grad,
                             doc_model.parameters()),
                      lr=args.lr,
                      betas=(args.b1, args.b2),
                      eps=args.e)

    lm_loss = ParagraphLoss(criterion, n_ctx=n_ctx, gen_len=gen_len)

    print("Loading Model")
    doc_model.to(device)
    if n_gpu > 1:
        doc_model = DataParallelModel(doc_model)
        lm_loss = DataParallelCriterion(lm_loss)
    print("Parallelized")

    bestloss = -1
    # Early-stopping reference for the first epoch.
    prevloss = 1000

    # Resume state comes from the checkpoint loader.
    start_iter, running_loss = load_checkpoint(args.checkpoint, doc_model,
                                               model_opt)
    for i in range(args.num_epochs):
        start_iter, running_loss, bestloss, _, val_loss1 = run_epoch(
            bestloss,
            start_iter,
            running_loss,
            doc_model,
            lm_loss,
            model_opt,
            train_loader,
            val_loader,
            train_log_interval,
            val_log_interval,
            device,
            beam,
            gen_len,
            k,
            p,
            decoding_strategy,
            accum_iter,
            "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs),
            save_dir,
            logger,
            text_encoder,
            show_progress=args.show_progress,
            my_local_dir=save_dir_local)
        print("VAL LOSS: ", str(val_loss1))
        # Stop as soon as validation loss gets worse or diverges.
        if val_loss1 > prevloss or math.isnan(val_loss1):
            break
        prevloss = val_loss1

    print('Done training...')
    print('Evaluating on validation with best checkpoint...')

    bestcheck = os.path.join(save_dir, "checkpoint_best.pt")
    checkpoint = torch.load(bestcheck, map_location='cpu')
    state_dict = checkpoint["state_dict"]
    # If the checkpoint lacks the 'module.pos_emb_mask' entry but the live
    # model has one, reuse the live model's tensor so loading does not fail
    # on a missing key.
    live_pos_emb_mask = doc_model.state_dict().get('module.pos_emb_mask')
    if (state_dict.get('module.pos_emb_mask') is None
            and live_pos_emb_mask is not None):
        state_dict['module.pos_emb_mask'] = live_pos_emb_mask
    doc_model.load_state_dict(state_dict)
    evaluate_doc_model(doc_model, val_loader, text_encoder, device, beam,
                       gen_len, k, p, args.decoding_strategy,
                       os.path.join(save_dir, 'valeval.log'), 'gen', 'tgt',
                       gen_len, [], args)
Exemple #21
0
def _unpack_batch(batch_data, device):
    """Split one loader batch into device tensors and bookkeeping counters.

    Every tensor field is indexed with ``[0]`` before being moved to
    *device*: the loaders in ``train`` use ``batch_size=1``, so this strips
    the leading dimension added by the ``DataLoader`` (presumably the dataset
    already yields whole batches -- verify against ``Dataset``).

    Returns:
        ``((inputs, positions, token_types, labels, masks),
        (batch_seen, batch_similar, batch_unseen, batch_source))``.
    """
    (inputs, positions, token_types, labels, masks, batch_seen,
     batch_similar, batch_unseen, batch_source) = batch_data
    tensors = tuple(
        field[0].to(device)
        for field in (inputs, positions, token_types, labels, masks))
    return tensors, (batch_seen, batch_similar, batch_unseen, batch_source)


def train(config):
    """Train a ``BertForMaskedLM`` network with periodic validation/checkpoints.

    The network and loss are moved to the GPU when one is available (and
    optionally wrapped for data parallelism). Every
    ``options['training']['checkingPoints']['checkFreq']`` global steps, once
    ``checkMin`` has been reached, the model is either validated
    (``config.do_eval``) -- saving it when the validation loss improves and
    decaying the learning rate by 0.55 after ``rateReduce_bound`` steps
    without improvement -- or simply checkpointed. An extra checkpoint is
    written after each epoch when ``everyEpoch`` is set.

    Args:
        config: Run configuration; model name, learning rate, data-loader
            workers, save path and the masking probabilities are read from it.
    """
    from collections import deque  # only needed for the running-loss window

    net = BertForMaskedLM.from_pretrained(config.model)
    lossFunc = KLDivLoss(config)

    use_cuda = torch.cuda.is_available()
    # Batches follow the model: the original moved the model to the GPU only
    # when available but called ``.cuda()`` on every batch unconditionally,
    # which fails on CPU-only hosts.
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        net = net.cuda()
        lossFunc = lossFunc.cuda()

        if config.dataParallel:
            net = DataParallelModel(net)
            lossFunc = DataParallelCriterion(lossFunc)

    options = optionsLoader(LOG, config.optionFrames, disp=False)
    Tokenizer = BertTokenizer.from_pretrained(config.model)
    prepareFunc = prepare_data

    trainSet = Dataset('train', config.batch_size,
                       lambda x: len(x[0]) + len(x[1]), prepareFunc, Tokenizer,
                       options['dataset'], LOG, 'train')
    validSet = Dataset('valid', config.batch_size,
                       lambda x: len(x[0]) + len(x[1]), prepareFunc, Tokenizer,
                       options['dataset'], LOG, 'valid')

    print(len(trainSet))

    # Sliding window over the last 200 training losses; ``maxlen`` drops the
    # oldest entry in O(1) instead of ``list.pop(0)``.
    recent_losses = deque(maxlen=200)
    best_vloss = 1e99
    counter = 0  # steps since the last validation improvement / LR decay

    prob_src = config.prob_src
    prob_tgt = config.prob_tgt

    stop_conditions = options['training']['stopConditions']
    checking_points = options['training']['checkingPoints']
    max_epoch = stop_conditions['max_epoch']

    num_train_optimization_steps = len(trainSet) * max_epoch
    param_optimizer = list(net.named_parameters())
    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.lRate,
                         e=1e-9,
                         t_total=num_train_optimization_steps,
                         warmup=0.0)

    def checkpoint_state(best_value):
        """Snapshot of the current training position and weights."""
        return {
            'epoch': epoch_idx + 1,
            'batch': batch_idx + 1,
            'options': options,
            'config': config,
            'state_dict': net.state_dict(),
            'best_vloss': best_value
        }

    for epoch_idx in range(max_epoch):
        total_seen = 0
        total_similar = 0
        total_unseen = 0
        total_source = 0

        # The datasets are reconfigured, and the loaders rebuilt, every epoch.
        trainSet.setConfig(config, prob_src, prob_tgt)
        trainLoader = data.DataLoader(dataset=trainSet,
                                      batch_size=1,
                                      shuffle=True,
                                      num_workers=config.dataLoader_workers,
                                      pin_memory=True)

        validSet.setConfig(config, 0.0, prob_tgt)
        validLoader = data.DataLoader(dataset=validSet,
                                      batch_size=1,
                                      shuffle=False,
                                      num_workers=config.dataLoader_workers,
                                      pin_memory=True)

        for batch_idx, batch_data in enumerate(trainLoader):
            if (batch_idx + 1) % 10000 == 0:
                gc.collect()
            start_time = time.time()

            # Validation below switches to eval mode, so re-enable training
            # mode at the start of every step.
            net.train()

            (inputs, positions, token_types, labels,
             masks), counts = _unpack_batch(batch_data, device)
            batch_seen, batch_similar, batch_unseen, batch_source = counts
            total_seen += batch_seen
            total_similar += batch_similar
            total_unseen += batch_unseen
            total_source += batch_source

            # NOTE(review): training counts labels != 0 while validation
            # counts labels != config.PAD -- confirm config.PAD == 0.
            n_token = int((labels.data != 0).data.sum())

            predicts = net(inputs, positions, token_types, masks)
            loss = lossFunc(predicts, labels, n_token).sum()

            loss_value = float(loss)
            recent_losses.append(loss_value)
            loss_avg = sum(recent_losses) / len(recent_losses)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

            LOG.log(
                'Epoch %2d, Batch %6d, Loss %9.6f, Average Loss %9.6f, Time %9.6f'
                % (epoch_idx + 1, batch_idx + 1, loss_value, loss_avg,
                   time.time() - start_time))

            # Checkpoints: ``idx`` is the 1-based global step.
            idx = epoch_idx * len(trainSet) + batch_idx + 1
            if idx < checking_points['checkMin']:
                continue
            if idx % checking_points['checkFreq'] != 0:
                continue

            if not config.do_eval:
                save_check_point(
                    checkpoint_state(1e99),
                    False,
                    path=config.save_path,
                    fileName='checkpoint_Epoch' + str(epoch_idx + 1) +
                    '_Batch' + str(batch_idx + 1) + '.pth.tar')
                LOG.log('CheckPoint Saved!')
                continue

            vloss = 0
            total_tokens = 0
            # One eval-mode / no-grad region for the whole validation pass.
            net.eval()
            with torch.no_grad():
                for valid_batch in validLoader:
                    (inputs, positions, token_types, labels,
                     masks), _ = _unpack_batch(valid_batch, device)

                    predicts = net(inputs, positions, token_types, masks)
                    # NOTE(review): called without n_token here, unlike the
                    # training call above -- confirm KLDivLoss's signature.
                    vloss += float(lossFunc(predicts, labels).sum())
                    total_tokens += int(
                        (labels.data != config.PAD).data.sum())

            vloss /= total_tokens
            is_best = vloss < best_vloss
            best_vloss = min(vloss, best_vloss)
            LOG.log('CheckPoint: Validation Loss %11.8f, Best Loss %11.8f' %
                    (vloss, best_vloss))

            if is_best:
                LOG.log('Best Model Updated')
                save_check_point(checkpoint_state(best_vloss),
                                 is_best,
                                 path=config.save_path,
                                 fileName='latest.pth.tar')
                counter = 0
            else:
                counter += checking_points['checkFreq']
                if counter >= stop_conditions['rateReduce_bound']:
                    counter = 0
                    # Decay every group; the log line reports the values of
                    # the last parameter group only.
                    for param_group in optimizer.param_groups:
                        old_lr = param_group['lr']
                        param_group['lr'] *= 0.55
                        new_lr = param_group['lr']
                    LOG.log('Reduce Learning Rate from %11.8f to %11.8f' %
                            (old_lr, new_lr))
                LOG.log('Current Counter = %d' % (counter))

        if checking_points['everyEpoch']:
            save_check_point(
                checkpoint_state(1e99),
                False,
                path=config.save_path,
                fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '.pth.tar')

        LOG.log('Epoch Finished.')
        LOG.log(
            'Total Seen: %d, Total Unseen: %d, Total Similar: %d, Total Source: %d.'
            % (total_seen, total_unseen, total_similar, total_source))
        gc.collect()
Exemple #22
0
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    # Load pretrained weights into the transformer sub-module of dh_model.
    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)

    dh_model.to(device)

    # The model and both criteria are wrapped for data parallelism
    # unconditionally (no check on the number of GPUs here).
    dh_model = DataParallelModel(dh_model)
    criterion_lm = DataParallelCriterion(criterion_lm)
    criterion_clf = DataParallelCriterion(criterion_clf)

    n_updates = 0
    n_epochs = 0
    if submit:
        # Save the parameters once before any epoch has run. The state dict
        # is taken from the wrapped model (keys presumably carry a "module."
        # prefix -- verify against the loader of this file).
        path = os.path.join(save_dir, desc, 'best_params_para_selector')
        torch.save(dh_model.state_dict(), make_path(path))
    best_score = 0
    for i in range(args.n_iter):
        if i == 0:
            # Logged only once, before the first epoch.
            log_msmarco()
        print("running epoch", i)
        run_epoch()
        n_epochs += 1
Exemple #23
0
def test(config):
    """Decode every line of ``config.inputFile`` and write the results.

    Loads the checkpoint at ``config.test_model`` into a ``BertForMaskedLM``
    network, runs ``Searcher.search`` on each input line, optionally re-ranks
    the candidates according to ``config.reranking_method`` and writes:

    * ``summary<suffix>.txt`` -- the top candidate, one per input line;
    * ``summary<suffix>.txt.<answer_size>`` -- for each input line the number
      of candidates followed by each candidate's score entry and text.

    Args:
        config: Run configuration (model name, checkpoint path, input file,
            output suffix, re-ranking settings).
    """
    Best_Model = torch.load(config.test_model)
    Tokenizer = BertTokenizer.from_pretrained(config.model)

    net = BertForMaskedLM.from_pretrained(config.model)

    if torch.cuda.is_available():
        net = net.cuda(0)
        if config.dataParallel:
            net = DataParallelModel(net)

    # The weights are loaded *after* the optional DataParallelModel wrap, which
    # matches checkpoints saved from a wrapped model. For a checkpoint saved
    # from an unwrapped model, load the state dict before wrapping instead.
    net.load_state_dict(Best_Model['state_dict'])
    net.eval()

    mySearcher = Searcher(net, config)

    top1_path = 'summary' + config.suffix + '.txt'
    topk_path = top1_path + '.' + str(config.answer_size)

    ed = '\n------------------------\n'

    # Context managers close all three files even if decoding raises; the
    # input is read as UTF-8 to match the explicitly UTF-8 outputs.
    with open(config.inputFile, 'r', encoding='utf-8') as f_in, \
            open(top1_path, 'w', encoding='utf-8') as f_top1, \
            open(topk_path, 'w', encoding='utf-8') as f_topK:
        for idx, line in enumerate(f_in):
            stripped = line.strip()
            source_ = stripped.split()
            source = Tokenizer.tokenize(stripped)
            # Alignment between whitespace tokens and tokenizer pieces, used
            # by detokenize() for both the echo and the candidates.
            mapping = mapping_tokenize(source_, source)

            source = Tokenizer.convert_tokens_to_ids(source)

            print(idx)
            print(detokenize(translate(source, Tokenizer), mapping), end=ed)

            l_pred = mySearcher.length_Predict(source)
            Answers = mySearcher.search(source)
            # Score of the searcher's own first candidate, kept to detect
            # whether re-ranking changed the winner.
            baseline = sum(Answers[0][0])

            if config.reranking_method == 'none':
                Answers = sorted(Answers, key=lambda x: sum(x[0]))
            elif config.reranking_method == 'length_norm':
                Answers = sorted(Answers, key=lambda x: length_norm(x[0]))
            elif config.reranking_method == 'bounded_word_reward':
                Answers = sorted(
                    Answers,
                    key=lambda x: bounded_word_reward(x[0], config.reward,
                                                      l_pred))
            elif config.reranking_method == 'bounded_adaptive_reward':
                Answers = sorted(
                    Answers,
                    key=lambda x: bounded_adaptive_reward(x[0], x[2], l_pred))
            # Any other value leaves the searcher's order untouched.

            texts = [
                detokenize(translate(answer[1], Tokenizer), mapping)
                for answer in Answers
            ]

            if baseline != sum(Answers[0][0]):
                print('Reranked!')

            print(texts[0], end=ed)
            print(texts[0], file=f_top1)
            print(len(texts), file=f_topK)
            for answer, text in zip(Answers, texts):
                print(answer[0], file=f_topK)
                print(text, file=f_topK)
def main(args):
    """Load a trained ``LMModel`` and run ``evaluate_model`` on a data file.

    The vocabulary is laid out as
    ``[BPE tokens | 3 special tokens | n_ctx position slots]``; the model is
    initialised from the pretrained OpenAI weights and, unless
    ``args.checkpoint`` is ``"none"``, overwritten with the given checkpoint.

    Args:
        args: Parsed command-line namespace (encoder paths, data file,
            checkpoint, decoding options, output locations).
    """
    # Constants
    n_ctx = args.n_ctx

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    # Size of the plain BPE vocabulary, measured before the special tokens
    # are appended below.
    n_vocab = len(text_encoder.encoder)

    special_tokens = ('_start_', '_delimiter_', '_classify_')
    for token in special_tokens:
        encoder[token] = len(encoder)
    n_special = len(special_tokens)

    print("Loading dataset...")
    test_loader = get_loader(args.data_file,
                             args.n_batch,
                             encoder,
                             num_workers=1,
                             shuffle=False,
                             subset=args.subset)

    vocab = n_vocab + n_special + n_ctx
    dh_model = LMModel(args,
                       vocab=vocab,
                       n_ctx=n_ctx,
                       doc_embed=args.doc_model)

    print("Loading model...")
    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special,
                                 path="./model/",
                                 path_names="./")
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        # Drop the "module." prefix left by a data-parallel wrapper. Only keys
        # that really carry the prefix are renamed; the previous blanket
        # ``key[7:]`` mangled any key saved without it.
        prefix = "module."
        for key in list(state_dict.keys()):
            if key.startswith(prefix):
                state_dict[key[len(prefix):]] = state_dict.pop(key)
        # Mask for the position slots, i.e. the last ``n_ctx`` vocabulary
        # entries. The original indexed the single element ``-n_ctx`` (missing
        # slice colon), leaving all other position slots unmasked; the
        # explicit start index also stays correct when ``n_ctx`` is 0.
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, vocab - n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        dh_model.load_state_dict(state_dict)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)

    stop_words = []
    if args.stop_words is not None:
        with open(args.stop_words) as f:
            # NOTE(review): lines are kept verbatim, including the trailing
            # newline -- confirm evaluate_model strips them before matching.
            stop_words.extend(f)
    evaluate_model(dh_model, test_loader, text_encoder, device, args.beam,
                   args.gen_len, args.k, args.decoding_strategy,
                   args.save_file, args.gen_dir, args.tgt_dir, args.max_len,
                   stop_words, args)
Exemple #25
0
    # Build the language model selected by config.model_type. Both variants
    # receive the same constructor arguments; input and output sizes are both
    # the vocabulary size.
    # NOTE(review): any other model_type leaves `model` unassigned, so the
    # first use of `model` below fails.
    if config.model_type=='LSTM':
        model = LSTMLM(input_size=len(vocab),
                       embedding_size=config.embedding_size,
                       hidden_size=config.hidden_size,
                       output_size=len(vocab),
                       n_layers=config.n_layers,
                       dropout_p=config.dropout_p)
    elif config.model_type=='BiLSTM':
        model = BiLSTMLM(input_size=len(vocab),
                         embedding_size=config.embedding_size,
                         hidden_size=config.hidden_size,
                         output_size=len(vocab),
                         n_layers=config.n_layers,
                         dropout_p=config.dropout_p)
        
    # Targets equal to the padding index are ignored by the loss.
    loss_fn = nn.NLLLoss(ignore_index=vocab.stoi[vocab.pad_token])
    # Adam with its default hyper-parameters.
    optimizer = optim.Adam(model.parameters())
    
    if config.cuda:
        if config.multi_gpu:
            # Imported lazily so the `parallel` module is only required for
            # multi-GPU runs.
            from parallel import DataParallelModel, DataParallelCriterion
            model = DataParallelModel(model).cuda()
            loss_fn = DataParallelCriterion(loss_fn).cuda()
        else:
            model = model.cuda()
            loss_fn = loss_fn.cuda()
    print('=========MODEL=========\n',model)

    # Train
    # Epochs are numbered from 1; train() takes no arguments here, so it
    # presumably reads model/optimizer/epoch from the enclosing scope --
    # verify against its definition.
    for epoch in range(1, config.epochs+1):
        train()
Exemple #26
0
def train(task_ids, model):
    """Train ``model`` on the tasks selected by ``task_ids`` and return it.

    Args:
        task_ids: Indices into ``args.tasks``. ``task_ids[0]`` drives the
            per-task setup (generation token, extra data, GEM and
            regularizer bookkeeping).
        model: Model carried over from an earlier call, or a falsy value to
            load ``args.model_name`` from pretrained weights.

    Returns:
        The trained model (wrapped in ``FP16_Module`` unless ``args.fp32``).
        When a single task is given and it is listed in ``args.skip_tasks``
        the function returns early without running the training loop.

    Side effects:
        Writes the tokenizer, model config, one checkpoint per epoch and a
        final checkpoint into the task's model directory, and updates the
        module-level ``TOKENIZER``, ``SPECIAL_TOKENS``, ``SPECIAL_TOKEN_IDS``,
        ``MODEL_CONFIG`` and ``TOKENS_WEIGHT`` as well as
        ``args.memory_data`` / ``args.skip_tasks``.
    """
    # TOKENS_WEIGHT is rebound below whenever the vocabulary grows.
    global TOKENS_WEIGHT

    tasks = [args.tasks[task_id] for task_id in task_ids]

    logger.info("start to train { task: %s, seq train type: %s }" %
                (tasks, args.seq_train_type))
    model_dir = get_model_dir(tasks)
    make_dir(model_dir)

    train_dataset = [
        swap_name(TASK_DICT[t]["train"], args.seq_distil, args.ref1)
        for t in tasks
    ]

    # Extra data: generated from the previous task ("lll") or real data that
    # is stashed in args.memory_data ("gem"). In the "gem" case the local list
    # is reset, so nothing extra is mixed into this task's dataset.
    train_extra_data = []
    if "lll" in args.seq_train_type and task_ids[0] > 0 and not args.skip_tasks:
        prev_task = args.tasks[task_ids[0] - 1]
        with torch.no_grad():
            create_extra_data(tasks[0], prev_task, model, train_extra_data)
    elif "gem" in args.seq_train_type and task_ids[0] > 0:
        get_real_data(tasks[0], train_extra_data, accum=False, encode=True)
        args.memory_data.append(train_extra_data)
        train_extra_data = []
    logger.info('extra training data size: {}'.format(len(train_extra_data)))

    if not model:
        model = MODEL_CLASS.from_pretrained(args.model_name).cuda()
        model.resize_token_embeddings(len(TOKENIZER))
        if not args.fp32:
            model = FP16_Module(model)

    # Register this task's generation token and persist tokenizer + config.
    gen_token = get_gen_token(tasks[0])
    TOKENIZER.add_tokens([gen_token])
    TOKENIZER.save_pretrained(model_dir)
    SPECIAL_TOKENS[tasks[0]] = gen_token
    SPECIAL_TOKEN_IDS[tasks[0]] = TOKENIZER.convert_tokens_to_ids(gen_token)
    logger.info('gen token = {} , gen token id = {}'.format(
        gen_token, SPECIAL_TOKEN_IDS[tasks[0]]))
    MODEL_CONFIG.vocab_size = len(TOKENIZER)
    MODEL_CONFIG.to_json_file(os.path.join(model_dir, CONFIG_NAME))
    # Keep the per-token loss weights the same length as the vocabulary.
    if len(TOKENIZER) != TOKENS_WEIGHT.shape[0]:
        TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda()))

    if args.skip_tasks and len(tasks) == 1:
        logger.info("*********** skip task: {} ***********".format(tasks[0]))
        if tasks[0] in args.skip_tasks:
            # Only the last remaining skipped task reloads its saved weights
            # (and, for regularization methods, recomputes reg params).
            if len(args.skip_tasks) == 1:
                model_dir = get_model_dir(tasks)
                model_path = os.path.join(model_dir, FINAL_SAVE_NAME)
                config_path = os.path.join(model_dir, CONFIG_NAME)
                model_config = CONFIG_CLASS.from_json_file(config_path)
                model = MODEL_CLASS(model_config).cuda()
                state_dict = torch.load(model_path)
                model.load_state_dict(state_dict)
                if not args.fp32:
                    model = FP16_Module(model)
                if args.seq_train_type in REG_TYPE_KEYS:
                    logger.info("calulating reg_params ...")
                    train_qadata = QADataset(train_dataset, "train",
                                             SPECIAL_TOKEN_IDS[tasks[0]],
                                             train_extra_data)
                    max_train_batch_size = max(
                        len(train_qadata) // args.min_n_steps,
                        args.min_batch_size)
                    train_dataloader = create_dataloader(
                        train_qadata, "train", max_train_batch_size)
                    parallel_model = DataParallelModel(WrapModel(model),
                                                       args.device_ids)
                    regularizer = REG_TYPES[args.seq_train_type](
                        model, parallel_model, [train_dataloader], tasks[0])
                    regularizer.task_start_do()
                    regularizer.task_end_do()
                    torch.save(model.state_dict(),
                               os.path.join(model_dir, FINAL_SAVE_NAME))
                    logger.info("done reg_params!")
            args.skip_tasks.remove(tasks[0])
            return model

    # multitask_specific reserves four extra embedding rows, each with a
    # matching unit loss weight.
    model.resize_token_embeddings(
        len(TOKENIZER) if not args.multitask_specific else len(TOKENIZER) + 4)
    if args.multitask_specific:
        for _ in range(4):
            TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda()))

    if args.distil:
        # The teacher checkpoint location is hard-coded per task.
        teacher_dir = "models/gpt2/lll/{task}_0.2/{task}".format(
            task=tasks[0])
        teacher_weights_path = teacher_dir + "/model-finish"

        teacher_model = MODEL_CLASS.from_pretrained(args.model_name).cuda()
        # Read the teacher's vocab size through a context manager so the
        # config file handle is closed deterministically.
        with open(teacher_dir + "/config.json") as teacher_config_file:
            teacher_vocab_size = json.load(teacher_config_file)['vocab_size']
        teacher_model.resize_token_embeddings(teacher_vocab_size)
        print("load teacher model from {}".format(teacher_weights_path))
        teacher_model.load_state_dict(torch.load(teacher_weights_path))
        if not args.fp32:
            teacher_model = FP16_Module(teacher_model)
        teacher_model.eval()
        teacher_model = DataParallelModel(WrapModel(teacher_model),
                                          args.device_ids)

    if not args.fp32:  # again because resize_token_embeddings makes embedding layer fp32
        model = FP16_Module(model)

    parallel_model = DataParallelModel(WrapModel(model), args.device_ids)

    train_qadata = QADataset(train_dataset, "train",
                             SPECIAL_TOKEN_IDS[tasks[0]], train_extra_data)
    max_train_batch_size = max(
        len(train_qadata) // args.min_n_steps, args.min_batch_size)
    train_dataloader = create_dataloader(train_qadata, "train",
                                         max_train_batch_size)

    # Epoch counts are keyed by task name, or by the joined task names for
    # unbound / multitask / multilm runs.
    if not args.unbound and args.seq_train_type not in [
            "multitask", "multilm"
    ]:
        n_train_epochs = args.n_train_epochs[tasks[0]]
    else:
        n_train_epochs = args.n_train_epochs['_'.join(tasks)]
    # NOTE(review): this is examples * epochs rather than batches * epochs;
    # presumably the scheduler advances by n_inputs per step -- confirm
    # against TrainStep / AnnealingLR.
    n_train_optimization_steps = len(train_qadata) * n_train_epochs
    logger.info(
        'len of train dataset: {} , max train batch size {} , num of opt steps: {}'
        .format(len(train_qadata), max_train_batch_size,
                n_train_optimization_steps))

    # Parameters whose name matches a no_decay entry get weight_decay 0.0.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if "gem" in args.seq_train_type:
        # GEM bookkeeping lives on the model: one gradient column per task.
        model.task_id = task_ids[0]
        if not hasattr(model, "grad_dims"):
            model.grad_dims = []
            for param in model.parameters():
                model.grad_dims.append(param.data.numel())
        if not hasattr(model, "grads"):
            model.grads = torch.zeros(sum(model.grad_dims), len(args.tasks))
            model.grads = model.grads.cuda()

    if args.seq_train_type in REG_TYPE_KEYS:
        optimizer = Weight_Regularized_AdamW(optimizer_grouped_parameters,
                                             lr=args.learning_rate,
                                             eps=args.adam_epsilon)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    if not args.fp32:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=None,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={
                                       'scale_window': 100,
                                       'min_scale': 1,
                                       'delayed_shift': 2
                                   })

    scheduler = AnnealingLR(optimizer,
                            start_lr=args.learning_rate,
                            warmup_iter=int(args.n_warmup_ratio *
                                            len(train_qadata)),
                            num_iters=int(n_train_optimization_steps),
                            decay_style=args.decay_style)
    train_loss_fct = DataParallelCriterion(
        CrossEntropyLoss(ignore_index=FILL_VAL, weight=TOKENS_WEIGHT),
        args.device_ids)
    if args.distil:
        kd_loss_fct = DataParallelCriterion(
            nn.KLDivLoss(reduction="batchmean"), args.device_ids)

    if args.seq_train_type in REG_TYPE_KEYS:
        copy_train_dataloader = create_dataloader(train_qadata, "train",
                                                  max_train_batch_size)
        # NOTE(review): for task_ids[0] == 0 this index is -1 and selects the
        # last task in args.tasks -- confirm the regularizer ignores it then.
        prev_task = args.tasks[task_ids[0] - 1]
        regularizer = REG_TYPES[args.seq_train_type](model, parallel_model,
                                                     [copy_train_dataloader],
                                                     tasks[0], prev_task)
        regularizer.task_start_do()

    tot_n_steps = 0
    train_once = TrainStep(model, optimizer, scheduler)
    if "gem" in args.seq_train_type and task_ids[0] != 0:
        gem_step = GEMStep(model, parallel_model, train_loss_fct, optimizer)
    model.train()
    for ep in range(n_train_epochs):
        cum_loss, cum_qa_loss, cum_lm_loss, cur_n_inputs = 0, 0, 0, 0
        for n_steps, (_, _, cqa, _, Y, gen_X, gen_Y,
                      is_extra) in enumerate(train_dataloader):

            # Each batch field is a list with one entry per device.
            n_inputs = sum(_cqa.shape[0] for _cqa in cqa)
            if args.multitask_specific:
                # Shift the first generation token by is_extra, then zero
                # is_extra in place.
                for i in range(len(is_extra)):
                    gen_X[i][:, 0] += is_extra[i]
                    is_extra[i] = is_extra[i] * 0

            # Move every shard to its device; model inputs are wrapped in
            # 1-tuples.
            for i in range(len(cqa)):
                cqa[i] = (cqa[i].to(args.device_ids[i]), )
                Y[i] = Y[i].to(args.device_ids[i])
                gen_X[i] = (gen_X[i].to(args.device_ids[i]), )
                gen_Y[i] = gen_Y[i].to(args.device_ids[i])
                is_extra[i] = is_extra[i].to(args.device_ids[i])

            if args.distil:
                losses = get_distil_losses(teacher_model,
                                           parallel_model,
                                           cqa,
                                           Y,
                                           gen_X,
                                           gen_Y,
                                           is_extra,
                                           kd_loss_fct,
                                           train_loss_fct,
                                           args.temperature_kd,
                                           pad_idx=FILL_VAL)
            else:
                losses = get_losses(parallel_model, cqa, Y, gen_X, gen_Y,
                                    train_loss_fct)
            loss = sum(losses)
            if "gem" in args.seq_train_type and task_ids[0] != 0:
                gem_step(task_ids[0])
            train_once(loss, n_inputs)

            # losses[0] is logged as the QA loss and losses[1] as the LM
            # loss; both are weighted by the number of inputs in the batch.
            qa_loss = losses[0].item() * n_inputs
            lm_loss = losses[1].item() * n_inputs
            cum_loss += (qa_loss + lm_loss)
            cum_qa_loss += qa_loss
            cum_lm_loss += lm_loss
            cur_n_inputs += n_inputs

            if (n_steps + 1) % args.logging_steps == 0:
                logger.info(
                    'progress {:.3f} , lr {:.1E} , loss {:.3f} , qa loss {:.3f} , lm loss {:.3f} , avg batch size {:.1f}'
                    .format(ep + cur_n_inputs / len(train_qadata),
                            scheduler.get_lr(), cum_loss / cur_n_inputs,
                            cum_qa_loss / cur_n_inputs,
                            cum_lm_loss / cur_n_inputs,
                            cur_n_inputs / (n_steps + 1)))

        # Checkpoint after every epoch (suffix is the 1-based epoch number).
        torch.save(model.state_dict(),
                   os.path.join(model_dir, SAVE_NAME + str(ep + 1)))
        tot_n_steps += (n_steps + 1)
        logger.info(
            'epoch {}/{} done , tot steps {} , lr {:.1E} , loss {:.2f} , qa loss {:.2f} , lm loss {:.2f} , avg batch size {:.1f}'
            .format(ep + 1, n_train_epochs, tot_n_steps, scheduler.get_lr(),
                    cum_loss / cur_n_inputs, cum_qa_loss / cur_n_inputs,
                    cum_lm_loss / cur_n_inputs, cur_n_inputs / (n_steps + 1)))

    # task end do for reg
    if args.seq_train_type in REG_TYPE_KEYS:
        regularizer.task_end_do()
    torch.save(model.state_dict(), os.path.join(model_dir, FINAL_SAVE_NAME))

    return model
Exemple #27
0
def main():
    """Fine-tune and/or evaluate a classifier for one task, driven by CLI args.

    Returns:
        dict: evaluation metrics keyed as ``"<metric>_<checkpoint suffix>"``;
        empty when ``args.do_eval`` is not set.

    Raises:
        ValueError: if the output directory is non-empty during training
            without ``--overwrite_output_dir``, or if the task is unknown.
    """
    args = setup_parser()
    args.final_eval = False

    # Refuse to clobber a populated output directory unless asked to.
    output_dir_in_use = (os.path.exists(args.output_dir)
                         and os.listdir(args.output_dir))
    if output_dir_in_use and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Device selection.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Logging configuration.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    set_seed(args)

    # Resolve the task processor and its label set.
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    labels = processor.get_labels(args.data_dir)
    n_labels = len(labels)
    args.num_labels = n_labels

    # Config, tokenizer and model; explicit names fall back to the model path.
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config_source = args.config_name or args.model_name_or_path
    config = config_class.from_pretrained(config_source,
                                          num_labels=n_labels,
                                          finetuning_task=args.task_name)
    tokenizer_source = args.tokenizer_name or args.model_name_or_path
    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_source, do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, config=config)
    model.to(args.device)

    # Training.
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save with the default file names so from_pretrained() can reload it.
    if args.do_train:
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)

        # Unwrap a parallel wrapper before saving, if there is one.
        if hasattr(model, 'module'):
            unwrapped_model = model.module
        else:
            unwrapped_model = model
        unwrapped_model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Reload the freshly saved model and vocabulary.
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)
        if args.n_gpu > 1:
            model = DataParallelModel(model)

    # Evaluation.
    results = {}
    if args.do_eval:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        if args.eval_all_checkpoints:
            weight_files = sorted(
                glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                          recursive=True))
            checkpoints = [os.path.dirname(path) for path in weight_files]
            # Reduce logging noise while loading many checkpoints.
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)
        else:
            checkpoints = [args.output_dir]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        several_checkpoints = len(checkpoints) > 1
        for checkpoint in checkpoints:
            # The suffix after the last '-' tags the metrics of each
            # checkpoint; a single checkpoint gets an empty tag.
            if several_checkpoints:
                step_tag = checkpoint.split('-')[-1]
            else:
                step_tag = ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            if args.n_gpu > 1:
                model = DataParallelModel(model)
            args.final_eval = True
            metrics = evaluate(args, model, tokenizer, prefix=step_tag)
            results.update({
                name + '_{}'.format(step_tag): score
                for name, score in metrics.items()
            })

    if args.save_embeddings:
        save_embeddings(args, model, tokenizer)

    return results
Exemple #28
0
# Echo every parsed option as an aligned "name | value" row.
for key, value in vars(args).items():
    print("{0:16} | {1}".format(key, value))

# Mean / std handed to transforms.Normalize below.
data_mean = [0.485, 0.456, 0.406]
data_std = [0.229, 0.224, 0.225]

# Validation preprocessing: resize to the requested input size, convert to a
# tensor, then normalize.
val_transforms = transforms.Compose([
    transforms.Resize((args.inHeight, args.inWidth)),
    transforms.ToTensor(),
    transforms.Normalize(data_mean, data_std),
])

# Build the network in inference mode.
model = BiSalNet()
model.eval()

# On GPU, wrap for multi-device execution when more than one device is
# visible, then move the (possibly wrapped) model to CUDA.
if args.onGPU:
    if torch.cuda.device_count() > 1:
        model = DataParallelModel(model)
    model = model.cuda()

if os.path.isfile(join(args.savedir, "checkpoint.pth")):
    print("=> loading checkpoint '{}'".format(
        join(args.savedir, "checkpoint.pth")))
    checkpoint = torch.load(join(args.savedir, "checkpoint.pth"))["state_dict"]
    if list(checkpoint.keys())[0][:7] == "module." and not isinstance(
Exemple #29
0
 def parallelize(self):
     """Flag this object as parallel and wrap its model and criterion."""
     self.parallel = True

     # Wrap the model first, then the loss, each in its parallel wrapper.
     parallel_model = DataParallelModel(self.model)
     self.model = parallel_model

     parallel_criterion = DataParallelCriterion(self.criterion)
     self.criterion = parallel_criterion
Exemple #30
0
    def __init__(self,
                 model,
                 vocab_size,
                 train_dataloader,
                 test_dataloader=None,
                 lr: float = 1e-4,
                 betas=(0.9, 0.999),
                 weight_decay: float = 0.01,
                 warmup_steps=10000,
                 with_cuda: bool = True,
                 cuda_devices=None,
                 log_freq: int = 10,
                 include_next=False,
                 include_vision=True,
                 total_epochs=1):
        """
        Set up device placement, optimizer, LR schedule and loss for training.

        :param model: model to train; must expose a ``bert`` attribute that
            has ``transformer_hidden_size``
        :param vocab_size: total word vocab size (not read by this constructor)
        :param train_dataloader: train dataset data loader
        :param test_dataloader: test dataset data loader [can be None]
        :param lr: learning rate of optimizer
        :param betas: Adamax optimizer betas
        :param weight_decay: Adamax optimizer weight decay param
        :param warmup_steps: warmup step count passed to ScheduledOptim
        :param with_cuda: training with cuda
        :param cuda_devices: accepted for interface compatibility; not read
            by this constructor (all visible GPUs are used)
        :param log_freq: logging frequency of the batch iteration
        :param include_next: stored as ``self.include_next``
        :param include_vision: stored as ``self.include_vision``
        :param total_epochs: used to compute ``self.total_iters``
        """

        # Setup cuda device for BERT training, argument -c, --cuda should be true
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda:0" if cuda_condition else "cpu")

        n_gpu = torch.cuda.device_count()
        # Report the device that was actually selected; the bare name
        # `device` is not defined in this scope.
        print("device", self.device, "n_gpu", n_gpu)

        # Multi-GPU wrapping applies to both the model and the criterion.
        multi_gpu = with_cuda and n_gpu > 1

        # Initialize the BERT Language Model, with BERT model
        self.model = model.to(self.device)
        self.bert = self.model.bert
        self.padding_idx = 0
        self.include_next = include_next
        self.include_vision = include_vision

        # Distributed GPU training if CUDA can detect more than 1 GPU
        if multi_gpu:
            print("Using %d GPUS for BERT" % n_gpu)
            self.model = DataParallelModel(self.model,
                                           device_ids=range(n_gpu))

        # Setting the train and test data loader
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adamax optimizer with hyper-param
        self.optim = optim.Adamax(self.model.parameters(),
                                  lr=lr,
                                  betas=betas,
                                  weight_decay=weight_decay)

        # A parallel wrapper exposes the underlying network as `.module`.
        is_wrapped = self.model.__class__.__name__ in (
            'DataParallel', 'DataParallelModel')
        bert = self.model.module.bert if is_wrapped else self.model.bert
        self.optim_schedule = ScheduledOptim(self.optim,
                                             bert.transformer_hidden_size,
                                             n_warmup_steps=warmup_steps)

        # Using Negative Log Likelihood Loss function for predicting the masked_token
        self.criterion = nn.NLLLoss(ignore_index=0)
        if multi_gpu:
            print("Using %d GPUS for BERT" % n_gpu)
            self.criterion = DataParallelCriterion(self.criterion,
                                                   device_ids=range(n_gpu))

        self.log_freq = log_freq
        self.total_iters = total_epochs * len(train_dataloader)

        print("Total Parameters:",
              sum([p.nelement() for p in self.model.parameters()]))