Example #1
0
    def train(self):
        """Fine-tune the multi-task BERT model and periodically evaluate it.

        The method:

        * creates ``self.output_dir`` if needed and obtains the train/eval
          dataloaders and examples from ``self.create_dataloader()``;
        * loads ``BertConfig`` / ``BertForTokenClassification`` from
          ``self.model_name_or_path`` and moves the model to ``self.device``;
        * builds an ``AdamW`` optimizer (no weight decay for bias and
          LayerNorm parameters) with a ``WarmupLinearSchedule``;
        * runs ``self.train_steps`` iterations over the endlessly cycled
          training dataloader, optimising the sum of the domain-slot, domain
          and dependency losses, with one optimizer update every
          ``self.gradient_accumulation_steps`` iterations;
        * every ``self.eval_steps * self.gradient_accumulation_steps``
          iterations logs the running training loss and, when
          ``self.do_eval`` is set, evaluates on the eval dataloader, appends
          the metrics to ``eval_results.txt`` in the output directory and
          saves the model's ``state_dict`` to ``pytorch_model.bin`` whenever
          the domain score returned by ``accuracyF1`` improves.

        Returns:
            None. Results are written to the log, stdout and the output
            directory.
        """
        # exist_ok avoids the race between a separate exists() check and the
        # directory creation.
        os.makedirs(self.output_dir, exist_ok=True)

        train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()

        num_train_optimization_steps = self.train_steps

        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path)
        model = BertForTokenClassification.from_pretrained(self.model_name_or_path, self.args, config=config)
        model.to(self.device)
        model.train()

        # Prepare optimizer: parameters whose name contains one of the
        # `no_decay` markers are excluded from weight decay.
        named_params = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
             'weight_decay': self.weight_decay},
            {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
        # The scheduler is stepped once per optimizer update, so its horizon
        # is the number of updates, not the number of micro-batches.
        scheduler = WarmupLinearSchedule(
            optimizer,
            warmup_steps=self.warmup_steps,
            t_total=self.train_steps // self.gradient_accumulation_steps,
        )

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        tr_loss = 0
        nb_tr_steps = 0  # iterations since the last report; used for the running mean loss
        train_dataloader = cycle(train_dataloader)

        # Reporting and evaluation happen on optimizer-update boundaries.
        eval_interval = self.eval_steps * self.gradient_accumulation_steps

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_domainslot, label_domain, label_dependcy = batch

            # Called with labels, the model's outputs are unpacked as three losses.
            loss_domainslot, loss_domain, loss_dependcy = model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
                label_domainslot=label_domainslot,
                label_domain=label_domain,
                label_dependcy=label_dependcy
            )
            loss = loss_domainslot + loss_domain + loss_dependcy
            tr_loss += loss.item()
            nb_tr_steps += 1
            train_loss = round(tr_loss / nb_tr_steps, 4)

            loss.backward()
            # Keyed on the loop index rather than on `nb_tr_steps`: the latter
            # is reset at every report and was previously tested after being
            # incremented, which triggered updates one micro-batch too early.
            if (step + 1) % self.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

            if (step + 1) % eval_interval != 0:
                continue

            # Restart the running loss; `train_loss` keeps the value from
            # before the reset for the report below.
            tr_loss = 0
            nb_tr_steps = 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

            if not self.do_eval:
                continue

            gold_labels_domainslot = []
            gold_labels_domain = []
            gold_labels_dependcy = []
            scores_domainslot = []
            scores_domain = []
            scores_dependcy = []
            dialogueID = [x.guid for x in eval_examples]
            utterance_text = [x.text_eachturn for x in eval_examples]
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", self.eval_batch_size)

            model.eval()
            eval_loss_domainslot, eval_loss_domain, eval_loss_dependcy = 0, 0, 0
            nb_eval_steps = 0
            for input_ids, input_mask, segment_ids, label_domainslot, label_domain, label_dependcy in eval_dataloader:
                input_ids = input_ids.to(self.device)
                input_mask = input_mask.to(self.device)
                segment_ids = segment_ids.to(self.device)
                label_domainslot = label_domainslot.to(self.device)
                label_domain = label_domain.to(self.device)
                label_dependcy = label_dependcy.to(self.device)

                with torch.no_grad():
                    # Two forward passes: with labels the outputs are unpacked
                    # as losses, without labels as logits.
                    batch_eval_loss_domainslot, batch_eval_loss_domain, batch_eval_loss_dependcy = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask,
                        label_domainslot=label_domainslot,
                        label_domain=label_domain,
                        label_dependcy=label_dependcy
                    )
                    logits_domainslot, logits_domain, logits_dependcy = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask
                    )

                # Domain-slot outputs are binarised: sigmoid, then a 0.4 threshold.
                logits_domainslot = torch.sigmoid(logits_domainslot)
                logits_domainslot = (logits_domainslot > 0.4).float()
                logits_domainslot = logits_domainslot.cpu().long().numpy()

                logits_domain = logits_domain.view(-1, self.num_labels_domain).detach().cpu().numpy()
                logits_dependcy = logits_dependcy.view(-1, self.num_labels_dependcy).detach().cpu().numpy()

                label_domainslot = label_domainslot.to('cpu').numpy()
                label_domain = label_domain.view(-1).to('cpu').numpy()
                label_dependcy = label_dependcy.view(-1).to('cpu').numpy()

                scores_domainslot.append(logits_domainslot)
                scores_domain.append(logits_domain)
                scores_dependcy.append(logits_dependcy)

                gold_labels_domainslot.append(label_domainslot)
                gold_labels_domain.append(label_domain)
                gold_labels_dependcy.append(label_dependcy)

                eval_loss_domainslot += batch_eval_loss_domainslot.mean().item()
                eval_loss_domain += batch_eval_loss_domain.mean().item()
                eval_loss_dependcy += batch_eval_loss_dependcy.mean().item()
                nb_eval_steps += 1

            # Stitch the per-batch arrays together along the first axis.
            gold_labels_domainslot = np.concatenate(gold_labels_domainslot, 0)
            gold_labels_domain = np.concatenate(gold_labels_domain, 0)
            gold_labels_dependcy = np.concatenate(gold_labels_dependcy, 0)
            scores_domainslot = np.concatenate(scores_domainslot, 0)
            scores_domain = np.concatenate(scores_domain, 0)
            scores_dependcy = np.concatenate(scores_dependcy, 0)
            model.train()

            # Mean loss per evaluation batch.
            eval_loss_domainslot = eval_loss_domainslot / nb_eval_steps
            eval_loss_domain = eval_loss_domain / nb_eval_steps
            eval_loss_dependcy = eval_loss_dependcy / nb_eval_steps

            eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain, mode='domain')
            eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy, mode='dependcy')
            eval_jointGoal_domainslot = compute_jointGoal_domainslot(
                dialogueID,
                utterance_text,
                scores_domainslot,
                gold_labels_domainslot,
                scores_domain,
                gold_labels_domain,
                scores_dependcy,
                gold_labels_dependcy
            )

            print(
                'eval_jointGoal_domainslot', eval_jointGoal_domainslot,
                'eval_F1_domain', eval_accuracy_domain,
                'eval_F1_dependcy', eval_accuracy_dependcy,
                'global_step', global_step,
                'loss', train_loss
            )
            result = {
                'eval_jointGoal_domainslot': eval_jointGoal_domainslot,
                'eval_loss_domainslot': eval_loss_domainslot,
                'eval_loss_domain': eval_loss_domain,
                'eval_loss_dependcy': eval_loss_dependcy,
                'eval_F1_domain': eval_accuracy_domain,
                'eval_F1_dependcy': eval_accuracy_dependcy,
                'global_step': global_step,
                'loss': train_loss,
            }

            # Append so that every evaluation round stays in the file.
            output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('*' * 80)
                writer.write('\n')

            # Checkpoint selection is driven by the domain score only.
            if eval_accuracy_domain > best_acc:
                print("=" * 80)
                print("Best F1", eval_accuracy_domain)
                print("Saving Model......")
                best_acc = eval_accuracy_domain
                # Unwrap a parallel wrapper (if any) so only the model itself is saved.
                model_to_save = model.module if hasattr(model, 'module') else model
                output_model_file = os.path.join(self.output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                print("=" * 80)
            else:
                print("=" * 80)
Example #2
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--predict_eval",
                        action='store_true',
                        help="Whether to predict eval set.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")

    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument("--freeze",
                        default=0,
                        type=int,
                        required=False,
                        help="freeze bert.")
    parser.add_argument("--not_do_eval_steps",
                        default=0.35,
                        type=float,
                        help="not_do_eval_steps.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    try:
        os.makedirs(args.output_dir)
    except:
        pass

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)

    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path, args, config=config)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    if args.do_train:

        # Prepare data loader

        train_examples = read_examples(os.path.join(args.data_dir,
                                                    'train.csv'),
                                       is_training=True)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it produce None grad that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps //
                                         args.gradient_accumulation_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)

        # 先做一个eval
        for file in ['dev.csv']:
            inference_labels = []
            gold_labels = []
            inference_logits = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=True)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)

            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_eval_loss, logits = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                    # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(np.argmax(logits, axis=1))
                gold_labels.append(label_ids)
                inference_logits.append(logits)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            gold_labels = np.concatenate(gold_labels, 0)
            inference_logits = np.concatenate(inference_logits, 0)
            model.train()
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = accuracy(inference_logits, gold_labels)

            result = {
                'eval_loss': eval_loss,
                'eval_F1': eval_accuracy,
                'global_step': global_step
            }

            output_eval_file = os.path.join(args.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('*' * 80)
                writer.write('\n')
            if eval_accuracy > best_acc and 'dev' in file:
                print("=" * 80)
                print("Best F1", eval_accuracy)
                print("Saving Model......")
                best_acc = eval_accuracy
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                print("=" * 80)
            else:
                print("=" * 80)

        model.train()

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss, _ = model(input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)
            nb_tr_examples += input_ids.size(0)
            del input_ids, input_mask, segment_ids, label_ids
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16 and args.loss_scale != 1.0:
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))

            nb_tr_steps += 1

            if args.fp16:
                optimizer.backward(loss)
            else:

                loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and (
                    step + 1) % (args.eval_steps *
                                 args.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(
                        args.data_dir, file),
                                                  is_training=True)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length,
                        args.split_num, False)
                    all_input_ids = torch.tensor(select_field(
                        eval_features, 'input_ids'),
                                                 dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(
                        eval_features, 'input_mask'),
                                                  dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(
                        eval_features, 'segment_ids'),
                                                   dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features],
                                             dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                            # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)

                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best F1", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
    if args.do_test:
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            args.output_dir, "pytorch_model.bin"),
                                                              args,
                                                              config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            if flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub.csv"),
                                       index=False)
            if flag == 'dev':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub_dev.csv"),
                                       index=False)

    if args.predict_eval:
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            args.output_dir, "pytorch_model.bin"),
                                                              args,
                                                              config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            if flag == 'dev':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub_dev.csv"),
                                       index=False)
Example #3
0
def train(args, train_dataset, val_dataset, model, tokenizer):
    """Train the adapter model on top of a pretrained model kept in eval mode.

    Only the adapter's parameters are handed to the optimizer; the pretrained
    model is run in ``eval()`` mode on every step and its outputs are fed to
    the adapter.

    Args:
        args: Run configuration namespace. ``train_batch_size`` and (when
            ``max_steps > 0``) ``num_train_epochs`` are overwritten on it.
        train_dataset: Dataset wrapped in the training ``DataLoader``.
        val_dataset: Dataset forwarded to ``evaluate`` when
            ``args.evaluate_during_training`` is set.
        model: Indexable pair ``(pretrained_model, adapter_model)``.
        tokenizer: Forwarded unchanged to ``evaluate``.

    Returns:
        Tuple ``(global_step, mean_loss)`` where ``mean_loss`` is the summed
        per-batch loss divided by ``global_step`` (``0.0`` when no optimizer
        step was taken).
    """
    pretrained_model = model[0]
    adapter_model = model[1]
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in adapter_model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in adapter_model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        adapter_model, optimizer = amp.initialize(
            adapter_model, optimizer, opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        pretrained_model = torch.nn.DataParallel(pretrained_model)
        adapter_model = torch.nn.DataParallel(adapter_model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        pretrained_model = torch.nn.parallel.DistributedDataParallel(
            pretrained_model,
            device_ids=[args.local_rank],
            output_device=args.local_rank)
        adapter_model = torch.nn.parallel.DistributedDataParallel(
            adapter_model,
            device_ids=[args.local_rank],
            output_device=args.local_rank)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num train examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    logger.info("Try resume from checkpoint")

    # Defaults for a fresh run; overwritten below when a checkpoint is found.
    global_step = 0
    start_epoch = 0
    start_step = 0
    step_marker_path = os.path.join(args.output_dir, 'global_step.bin')
    if args.restore and os.path.exists(step_marker_path):
        logger.info("Load last checkpoint data")
        global_step = torch.load(step_marker_path)
        output_dir = os.path.join(args.output_dir,
                                  'checkpoint-{}'.format(global_step))
        logger.info("Load from output_dir {}".format(output_dir))

        optimizer.load_state_dict(
            torch.load(os.path.join(output_dir, 'optimizer.bin')))
        scheduler.load_state_dict(
            torch.load(os.path.join(output_dir, 'scheduler.bin')))
        adapter_state = torch.load(
            os.path.join(output_dir, 'pytorch_model.bin'))
        # Take care of distributed/parallel training: load into the wrapped
        # module when the adapter has been wrapped.
        if hasattr(adapter_model, 'module'):
            adapter_model.module.load_state_dict(adapter_state)
        else:
            adapter_model.load_state_dict(adapter_state)

        # NOTE(review): the epoch/step arithmetic below counts optimizer
        # steps as if they were batches, so it looks off when
        # gradient_accumulation_steps > 1 -- confirm before relying on
        # exact resume positions.
        global_step += 1
        start_epoch = int(global_step / len(train_dataloader))
        start_step = global_step - start_epoch * len(train_dataloader) - 1
        logger.info("Start from global_step={} epoch={} step={}".format(
            global_step, start_epoch, start_step))
    else:
        logger.info("Start from scratch")

    # Only the main process (or a non-distributed run) writes TensorBoard logs.
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir="runs/" + args.my_model_name,
                                  purge_step=global_step)

    tr_loss, logging_loss = 0.0, 0.0
    pretrained_model.zero_grad()
    adapter_model.zero_grad()

    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)

    for epoch in range(start_epoch, int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):
            start = time.time()
            # Skip already-processed batches, but only in the epoch we are
            # resuming into; later epochs must see every batch.
            if args.restore and epoch == start_epoch and step < start_step:
                continue
            pretrained_model.eval()
            adapter_model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                # XLM and RoBERTa don't use segment_ids
                'labels':
                batch[3],
                'subj_special_start_id':
                batch[4],
                'obj_special_start_id':
                batch[5]
            }
            pretrained_model_outputs = pretrained_model(**inputs)
            outputs = adapter_model(pretrained_model_outputs, **inputs)

            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            logger.info(
                "Epoch {}/{} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s"
                .format(epoch, int(args.num_train_epochs), step,
                        len(train_dataloader), loss.item(),
                        time.time() - start))
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(adapter_model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Apply the update first, then advance the schedule:
                # PyTorch >= 1.1 expects optimizer.step() before
                # scheduler.step(), otherwise the first LR value is skipped.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                pretrained_model.zero_grad()
                adapter_model.zero_grad()
                global_step += 1
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = adapter_model.module if hasattr(
                        adapter_model, 'module'
                    ) else adapter_model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, 'optimizer.bin'))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, 'scheduler.bin'))
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    # The step marker lives in the top-level output dir so a
                    # later run can find the most recent checkpoint.
                    torch.save(
                        global_step,
                        os.path.join(args.output_dir, 'global_step.bin'))

                    logger.info(
                        "Saving model checkpoint, optimizer, global_step to %s",
                        output_dir)
                    # Keep at most max_save_checkpoints checkpoints: drop the
                    # one that just fell out of the window.
                    if (global_step /
                            args.save_steps) > args.max_save_checkpoints:
                        try:
                            shutil.rmtree(
                                os.path.join(
                                    args.output_dir, 'checkpoint-{}'.format(
                                        global_step -
                                        args.max_save_checkpoints *
                                        args.save_steps)))
                        except OSError as e:
                            print(e)
                if args.local_rank == -1 and args.evaluate_during_training and global_step % args.eval_steps == 0:  # Only evaluate when single GPU otherwise metrics may not average well
                    model = (pretrained_model, adapter_model)
                    results = evaluate(args, val_dataset, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value,
                                             global_step)
            if args.max_steps > 0 and global_step > args.max_steps:
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against a run that never reached an optimizer step.
    mean_loss = tr_loss / global_step if global_step else 0.0
    return global_step, mean_loss
Example #4
0
def train(args, train_dataset, model, tokenizer):
    """Train ``model`` on ``train_dataset`` with AdamW and linear warmup/decay.

    Args:
        args: Run configuration namespace. ``train_batch_size`` and (when
            ``max_steps > 0``) ``num_train_epochs`` are overwritten on it.
        train_dataset: Dataset wrapped in the training ``DataLoader``.
        model: Model to optimize; called as ``model(**inputs)`` and expected
            to return the loss as the first element of its output.
        tokenizer: Forwarded unchanged to ``evaluate``.

    Returns:
        Tuple ``(global_step, mean_loss)`` where ``mean_loss`` is the summed
        per-batch loss divided by ``global_step`` (``0.0`` when no optimizer
        step was taken).
    """
    # Only the main process (or a non-distributed run) writes TensorBoard logs.
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else
                None,  # XLM and RoBERTa don't use segment_ids
                'labels':
                batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Apply the update first, then advance the schedule:
                # PyTorch >= 1.1 expects optimizer.step() before
                # scheduler.step(), otherwise the first LR value is skipped.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against a run that never reached an optimizer step.
    mean_loss = tr_loss / global_step if global_step else 0.0
    return global_step, mean_loss
Example #5
0
# Number of batches processed so far across all epochs; drives the
# gradient-accumulation boundary below.
update_count = 0
# Timestamp of the previous optimizer update, used for the speed readout.
start = time.time()

for ep in range(5):
    # Training
    pb = tqdm.tqdm(train_dataloader)
    encoder.train()
    decoder.train()

    for batch in pb:
        record_loss, perplexity = train_one_iter(batch, fp16=True)
        update_count += 1

        # update_count has already been incremented, so a multiple of the
        # window size means exactly num_gradients_accumulation batches have
        # been accumulated (the first window included).
        if update_count % num_gradients_accumulation == 0:
            # optimizer.step() must precede scheduler.step(); the reverse
            # order skips the first value of the learning-rate schedule.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # speed measure: samples per second over the last window
            end = time.time()
            speed = batch_size * num_gradients_accumulation / (end - start)
            start = end

            pb.set_postfix(loss=record_loss,
                           perplexity=perplexity,
                           speed=speed)

    # Evaluation
    encoder.eval()
    decoder.eval()
    ppl = validate(val_dataloader)
Example #6
0
def main():
    """Command-line entry point: fine-tune and/or run a 3-label BERT classifier.

    Steps performed, depending on the flags passed on the command line:

    * always: parse arguments, pick the device, configure logging, seed the
      RNGs via ``set_seed``, create ``--output_dir`` and load the tokenizer,
      config and model from ``--model_name_or_path``;
    * ``--do_train``: train for ``--train_steps`` micro-batches on
      ``train.csv``, evaluating on ``dev.csv`` once before training and then
      periodically, keeping the best checkpoint in ``pytorch_model.bin``;
    * ``--do_test``: reload the best checkpoint and write logits for
      ``dev.csv`` / ``test.csv`` to ``sub_dev.csv`` / ``sub.csv``;
    * ``--predict_eval``: reload the best checkpoint and write logits for
      ``dev.csv`` to ``sub_dev.csv``.

    With ``--fp16`` the model is cast with ``.half()`` and trained with the
    regular ``loss.backward()``; the learning rate is always driven by the
    ``WarmupLinearSchedule`` scheduler.
    """
    args = _build_arg_parser().parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        # Single-process mode: use CUDA when it is available and not disabled;
        # n_gpu counts every visible GPU.
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Distributed mode: local_rank is the index of the GPU used by this
        # process; the distributed backend synchronizes nodes/GPUs.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging (only rank -1/0 logs at INFO level)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Create the output directory; an already existing directory is fine.
    os.makedirs(args.output_dir, exist_ok=True)

    # Load the pre-trained BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)

    # Load the BERT configuration (3 output labels)
    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)

    # Prepare model: BERT-based sequence classifier
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path, args, config=config)
    if args.fp16:
        model.half()
    model.to(device)
    model = _wrap_for_parallel(model, args)

    # Total batch size = number of GPUs * per-GPU batch size
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train:
        _run_training(args, model, tokenizer, device)

    # Predict on the dev and test sets with the best saved checkpoint
    if args.do_test:
        del model
        gc.collect()  # release the training model before reloading
        args.do_train = False
        model = _load_best_model(args, config, device)
        for file, flag, out_name in [('dev.csv', 'dev', 'sub_dev.csv'),
                                     ('test.csv', 'test', 'sub.csv')]:
            _predict_and_export(args, model, tokenizer, device, file, flag,
                                out_name)

    # Predict on the dev set only
    if args.predict_eval:
        del model
        gc.collect()
        args.do_train = False
        model = _load_best_model(args, config, device)
        for file, flag, out_name in [('dev.csv', 'dev', 'sub_dev.csv')]:
            _predict_and_export(args, model, tokenizer, device, file, flag,
                                out_name)


def _build_arg_parser():
    """Return the argparse parser holding every command-line option of the script."""
    parser = argparse.ArgumentParser()

    ## Required parameters (options with required=True must be given on the command line)
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "数据集路径. The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="模型类型(这里为bert). Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help=
        "下载好的预训练模型. Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "模型预测和断点文件的存放路径. The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "预训练的配置名字或路径. Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "预训练分词器名字或路径. Pretrained tokenizer name or path if not the same as model_name"
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "从亚马逊s3下载的预训练模型存放路径. Where do you want to store the pre-trained models downloaded from s3"
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "最长序列长度. The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="是否训练. Whether to run training.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="是否测试. Whether to run testing.")
    parser.add_argument("--predict_eval",
                        action='store_true',
                        help="是否预测验证集. Whether to predict eval set.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="是否验证. Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="是否训练中跑验证. Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="是否用小写模型. Set this flag if you are using an uncased model.")

    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="训练时每个GPU/CPU上的batch size. Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="验证时每个GPU/CPU上的batch size. Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "反向传播前梯度累计的次数. Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="Adam的初始学习率. The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="权重衰减系数. Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Adam的Epsilon系数. Epsilon for Adam optimizer.")
    parser.add_argument(
        "--max_grad_norm",
        default=1.0,
        type=float,
        help=
        " 如果所有参数的gradient组成的向量的L2 norm大于max norm,那么需要根据L2 norm/max_norm进行缩放。从而使得L2 norm小于预设的clip_norm. Max gradient norm."
    )
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="训练epoch数. Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")

    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument(
        "--warmup_steps",
        default=0,
        type=int,
        help="线性warmup的steps. Linear warmup over warmup_steps.")
    parser.add_argument("--split_num",
                        default=3,
                        type=int,
                        help="测试集划分. text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="日志更新steps. Log every X updates steps.")
    parser.add_argument(
        '--save_steps',
        type=int,
        default=50,
        help="断点文件保存steps. Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "评估所有的断点. Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="不用cuda. Avoid using CUDA when available")
    parser.add_argument(
        '--overwrite_output_dir',
        action='store_true',
        help="重写输出路径. Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="重写训练和评估的缓存. Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="初始化用的随机种子. random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "是否用16位混合精度. Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "fp16的优化level. For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="为了分布式训练. For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="远程debug用的ip. For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="远程debug用的端口. For distant debugging.")
    parser.add_argument("--freeze",
                        default=0,
                        type=int,
                        required=False,
                        help="冻结BERT. freeze bert.")
    parser.add_argument("--not_do_eval_steps",
                        default=0.35,
                        type=float,
                        help="not_do_eval_steps.")
    return parser


def _wrap_for_parallel(model, args):
    """Wrap `model` in apex DDP (distributed) or DataParallel (multi-GPU) when needed."""
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        return DDP(model)
    if args.n_gpu > 1:
        return torch.nn.DataParallel(model)
    return model


def _load_best_model(args, config, device):
    """Reload the best checkpoint saved in `args.output_dir` and prepare it for inference."""
    model = BertForSequenceClassification.from_pretrained(
        os.path.join(args.output_dir, "pytorch_model.bin"),
        args,
        config=config)
    if args.fp16:
        model.half()
    model.to(device)
    return _wrap_for_parallel(model, args)


def _features_to_dataset(features):
    """Pack input ids, input mask, segment ids and labels of `features` into a TensorDataset."""
    all_input_ids = torch.tensor(select_field(features, 'input_ids'),
                                 dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, 'input_mask'),
                                  dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'),
                                   dtype=torch.long)
    all_label = torch.tensor([f.label for f in features], dtype=torch.long)
    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label)


def _make_eval_dataloader(args, tokenizer, file, is_training):
    """Read `file` from `args.data_dir` and return (examples, sequential DataLoader)."""
    eval_examples = read_examples(os.path.join(args.data_dir, file),
                                  is_training=is_training)
    eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                 args.max_seq_length,
                                                 args.split_num, False)
    eval_data = _features_to_dataset(eval_features)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=SequentialSampler(eval_data),
                                 batch_size=args.eval_batch_size)
    return eval_examples, eval_dataloader


def _evaluate_and_save_best(args,
                            model,
                            tokenizer,
                            device,
                            file,
                            global_step,
                            best_acc,
                            train_loss=None):
    """Evaluate `model` on `file`, log the metrics and keep the best checkpoint.

    The metrics are appended to ``eval_results.txt`` in `args.output_dir`.
    When the score returned by ``accuracy`` beats `best_acc` (and `file` is a
    dev file) the model weights are written to ``pytorch_model.bin``.
    `train_loss`, when given, is reported next to the evaluation metrics.
    The model is switched back to training mode before returning.

    Returns the (possibly updated) best score.
    """
    eval_examples, eval_dataloader = _make_eval_dataloader(args,
                                                           tokenizer,
                                                           file,
                                                           is_training=True)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    gold_labels = []
    inference_logits = []
    eval_loss = 0
    nb_eval_steps = 0

    # Inference mode (dropout disabled)
    model.eval()
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        # No gradients are needed for evaluation
        with torch.no_grad():
            tmp_eval_loss, logits = model(input_ids=input_ids,
                                          token_type_ids=segment_ids,
                                          attention_mask=input_mask,
                                          labels=label_ids)

        inference_logits.append(logits.detach().cpu().numpy())
        gold_labels.append(label_ids.to('cpu').numpy())
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    gold_labels = np.concatenate(gold_labels, 0)
    inference_logits = np.concatenate(inference_logits, 0)
    model.train()
    eval_loss = eval_loss / nb_eval_steps  # mean loss over evaluation batches
    # The score is reported under the 'eval_F1' key; it is whatever the
    # `accuracy` helper computes from logits and gold labels.
    eval_accuracy = accuracy(inference_logits, gold_labels)

    result = {
        'eval_loss': eval_loss,
        'eval_F1': eval_accuracy,
        'global_step': global_step
    }
    if train_loss is not None:
        result['loss'] = train_loss

    # Append the evaluation metrics to eval_results.txt
    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
        writer.write('*' * 80)
        writer.write('\n')

    # Keep the checkpoint when the current model is the best one so far
    if eval_accuracy > best_acc and 'dev' in file:
        print("=" * 80)
        print("Best F1", eval_accuracy)
        print("Saving Model......")
        best_acc = eval_accuracy
        # Only save the model itself, not the parallel wrapper
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)
        print("=" * 80)
    else:
        print("=" * 80)
    return best_acc


def _predict_and_export(args, model, tokenizer, device, file, flag, out_name):
    """Run inference on `file`, print its score and write the logits to `out_name`.

    The output CSV (in `args.output_dir`) holds the ``id`` column of the input
    file plus one ``label_<k>`` column per class logit.
    """
    _, eval_dataloader = _make_eval_dataloader(args,
                                               tokenizer,
                                               file,
                                               is_training=False)

    all_logits = []
    gold_labels = []
    model.eval()
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            batch_logits = model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask).detach().cpu().numpy()
        all_logits.append(batch_logits)
        gold_labels.append(label_ids.to('cpu').numpy())

    gold_labels = np.concatenate(gold_labels, 0)
    logits = np.concatenate(all_logits, 0)
    print(flag, accuracy(logits, gold_labels))

    # Save the prediction file
    df = pd.read_csv(os.path.join(args.data_dir, file))
    df['label_0'] = logits[:, 0]
    df['label_1'] = logits[:, 1]
    df['label_2'] = logits[:, 2]
    df[['id', 'label_0', 'label_1',
        'label_2']].to_csv(os.path.join(args.output_dir, out_name),
                           index=False)


def _run_training(args, model, tokenizer, device):
    """Train `model` on ``train.csv`` for `args.train_steps` micro-batches.

    The dev set is evaluated once before training and then, when `--do_eval`
    is set, every ``eval_steps * gradient_accumulation_steps`` micro-batches
    once more than ``not_do_eval_steps`` of the run has elapsed. A
    non-positive ``--eval_steps`` disables periodic reporting and evaluation.
    """
    # Prepare data loader
    train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'),
                                   is_training=True)
    train_features = convert_examples_to_features(train_examples, tokenizer,
                                                  args.max_seq_length,
                                                  args.split_num, True)
    train_data = _features_to_dataset(train_features)
    # Random sampling in single-process mode, distributed sampling otherwise
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size //
                                  args.gradient_accumulation_steps)
    num_train_optimization_steps = args.train_steps

    # Prepare optimizer: biases and LayerNorm parameters get no weight decay
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    # Optimizer and linear warmup schedule
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=args.train_steps //
                                     args.gradient_accumulation_steps)

    global_step = 0

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    best_acc = 0
    tr_loss = 0
    nb_tr_steps = 0
    bar = tqdm(range(num_train_optimization_steps),
               total=num_train_optimization_steps)
    train_dataloader = cycle(train_dataloader)  # iterate over the data endlessly

    # Run one evaluation before training starts
    for file in ['dev.csv']:
        best_acc = _evaluate_and_save_best(args, model, tokenizer, device,
                                           file, global_step, best_acc)

    model.train()

    # Number of micro-batches between two reports / evaluations
    report_interval = args.eval_steps * args.gradient_accumulation_steps

    for step in bar:
        batch = next(train_dataloader)
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss, _ = model(input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask,
                        labels=label_ids)
        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu.
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        tr_loss += loss.item()
        train_loss = round(
            tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_steps += 1

        loss.backward()

        # Update the weights once a full group of micro-batches is accumulated.
        # The boundary is tied to `step` because `nb_tr_steps` is reset by the
        # reporting branch below.
        if (step + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()  # clear gradients so they do not accumulate
            global_step += 1

        at_report_step = (report_interval > 0
                          and (step + 1) % report_interval == 0)

        # Report the running training loss and restart its accumulation
        if at_report_step:
            tr_loss = 0
            nb_tr_steps = 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

        # Evaluate on the dev set, skipping the first part of training
        if (args.do_eval and at_report_step and
                step > num_train_optimization_steps * args.not_do_eval_steps):
            for file in ['dev.csv']:
                best_acc = _evaluate_and_save_best(args,
                                                   model,
                                                   tokenizer,
                                                   device,
                                                   file,
                                                   global_step,
                                                   best_acc,
                                                   train_loss=train_loss)
def train_and_eval(args, train_dataset, dev_dataset, test_dataset, model,
                   tokenizer):
    """Train ``model`` on ``train_dataset`` and return the step count and mean loss.

    Builds the sampler and dataloader, an AdamW optimizer with a linear
    warmup/decay schedule, optionally wraps the model for apex fp16,
    ``DataParallel`` or ``DistributedDataParallel``, and then runs the
    gradient-accumulation training loop.

    In-training dev/test evaluation, early stopping and checkpoint saving are
    not performed by this function. ``dev_dataset``, ``test_dataset`` and
    ``tokenizer`` are accepted only so the call signature stays stable.

    Args:
        args: run configuration. Reads ``local_rank``, ``task_name``,
            ``num_train_epochs``, ``per_gpu_train_batch_size``, ``n_gpu``,
            ``max_steps``, ``gradient_accumulation_steps``, ``weight_decay``,
            ``learning_rate``, ``adam_epsilon``, ``warmup_steps``, ``fp16``,
            ``fp16_opt_level``, ``device``, ``model_type`` and
            ``max_grad_norm``. Writes ``train_batch_size``, ``logging_steps``
            and, when ``max_steps > 0``, ``num_train_epochs``.
        train_dataset: dataset whose items are tuples of at least five
            tensors (input ids, attention mask, token type ids, entity
            position, labels).
        dev_dataset: unused.
        test_dataset: unused.
        model: model to train; called with keyword inputs and expected to
            return a tuple whose first element is the loss.
        tokenizer: unused.

    Returns:
        tuple: ``(global_step, mean_loss)`` where ``mean_loss`` is the summed
        per-batch loss divided by the number of optimizer steps, or ``0.0``
        when no optimizer step was taken.

    Raises:
        ImportError: if ``args.fp16`` is set and apex is not installed.
    """
    is_main_process = args.local_rank in [-1, 0]
    if is_main_process:
        tb_writer = SummaryWriter(
            comment=f"_baseline_{args.task_name}_{args.num_train_epochs}"
        )  # TensorboardX

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    steps_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps
    if args.max_steps > 0:
        t_total = args.max_steps
        # Clamp to 1: with fewer batches than accumulation steps the plain
        # floor division would be a division by zero.
        args.num_train_epochs = args.max_steps // max(1, steps_per_epoch) + 1
    else:
        t_total = steps_per_epoch * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    # The logging interval is one tenth of the total optimization steps.
    args.logging_steps = t_total // 10
    logger.info(f"Evaluate at each {args.logging_steps}")

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=not is_main_process)
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=not is_main_process)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            # Move all data to device in batch
            batch = tuple(t.to(args.device) for t in batch)

            # The first four tensors carry an extra dimension 1 (described in
            # the original comments as [batch_size, source_num, *]); only
            # index 0 along that dimension is fed to the model.
            inputs = {
                'input_ids':
                batch[0].select(1, 0),
                'attention_mask':
                batch[1].select(1, 0),
                'token_type_ids':
                batch[2].select(1, 0) if args.model_type in ['bert', 'xlnet']
                else None,  # XLM and RoBERTa don't use segment_ids
                'entity_position':
                batch[3].select(1, 0),
                'labels':
                batch[4]
            }

            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # The optimizer must step before the scheduler; the reverse
                # order skips the first value of the learning-rate schedule.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if is_main_process:
        tb_writer.close()

    # Avoid ZeroDivisionError when the loop never reached an optimizer step.
    mean_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, mean_loss
Example #8
0
    def train(self):
        """Fine-tune a 3-label BERT sequence classifier, evaluate it, and optionally predict.

        Steps performed:

        1. Create ``args.output_dir`` (best effort) and load tokenizer, config
           and model from ``self.model_name_or_path``.
        2. Train for ``self.train_steps`` mini-batches drawn from
           ``train.csv`` in ``self.data_dir``, stepping the optimizer every
           ``self.gradient_accumulation_steps`` mini-batches.
        3. Every ``self.eval_steps * self.gradient_accumulation_steps``
           mini-batches, log the running training loss and, if
           ``self.do_eval`` is set, score ``dev.csv``; results are appended to
           ``eval_results.txt`` and the best-scoring weights are saved to
           ``pytorch_model.bin`` in ``args.output_dir``.
        4. If ``args.do_test`` is set, reload the saved weights, print the dev
           score and write test predictions to ``sub.csv``.

        NOTE(review): this method reads both ``self.*`` attributes and a free
        name ``args`` that is not defined in this block; presumably ``args``
        is a module-level namespace. Confirm against the rest of the file.
        """

        def build_dataset(features):
            """Pack feature objects into a TensorDataset of ids, mask, segments and label."""
            return TensorDataset(
                torch.tensor(self.select_field(features, 'input_ids'), dtype=torch.long),
                torch.tensor(self.select_field(features, 'input_mask'), dtype=torch.long),
                torch.tensor(self.select_field(features, 'segment_ids'), dtype=torch.long),
                torch.tensor([f.label for f in features], dtype=torch.long),
            )

        # Creating the directory stays best effort, but a failure is now
        # reported instead of being silently swallowed by a bare except.
        try:
            os.makedirs(args.output_dir, exist_ok=True)
        except OSError as err:
            logger.warning("Could not create output directory %s: %s", args.output_dir, err)

        tokenizer = BertTokenizer.from_pretrained(self.model_name_or_path, do_lower_case=self.do_lower_case)
        config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=3)

        # Prepare model
        model = BertForSequenceClassification.from_pretrained(self.model_name_or_path, args, config=config)
        model.to(self.device)

        train_batch_size = self.per_gpu_train_batch_size
        eval_batch_size = self.per_gpu_eval_batch_size

        # Prepare data loader
        train_examples = self.read_examples(os.path.join(self.data_dir, 'train.csv'), is_training=True)
        train_features = self.convert_examples_to_features(
            train_examples, tokenizer, self.max_seq_length)
        train_data = build_dataset(train_features)
        train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                                      batch_size=train_batch_size)

        num_train_optimization_steps = self.train_steps

        # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_steps = 0
        accumulation_steps = self.gradient_accumulation_steps
        report_every = self.eval_steps * accumulation_steps
        # NOTE(review): if `cycle` is itertools.cycle, the batches of the first
        # pass are cached and replayed, so later passes are not reshuffled.
        # Confirm which `cycle` the file imports.
        batches = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = tuple(t.to(self.device) for t in next(batches))
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
            tr_loss += loss.item()
            nb_tr_steps += 1
            # Running mean of the loss since the last report.
            train_loss = round(tr_loss / nb_tr_steps, 4)

            loss.backward()

            # Trigger on the mini-batch index itself. The previous test used
            # (nb_tr_steps + 1) after nb_tr_steps had already been incremented,
            # so with accumulation > 1 the update fired one mini-batch early.
            if (step + 1) % accumulation_steps == 0:
                # Optimizer first, then scheduler; the reverse order skips the
                # first value of the learning-rate schedule.
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % report_every != 0:
                continue

            tr_loss = 0
            nb_tr_steps = 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

            if not self.do_eval:
                continue

            for file in ['dev.csv']:
                gold_labels = []
                inference_logits = []
                eval_examples = self.read_examples(os.path.join(self.data_dir, file), is_training=True)
                eval_features = self.convert_examples_to_features(eval_examples, tokenizer, self.max_seq_length)
                eval_data = build_dataset(eval_features)

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", eval_batch_size)

                # Run prediction for full data
                eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data),
                                             batch_size=eval_batch_size)

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)

                    with torch.no_grad():
                        # Two forward passes: with labels the model returns the
                        # loss, without labels it returns the logits.
                        tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                                              attention_mask=input_mask, labels=label_ids)
                        logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                    inference_logits.append(logits.detach().cpu().numpy())
                    gold_labels.append(label_ids.to('cpu').numpy())
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1

                gold_labels = np.concatenate(gold_labels, 0)
                inference_logits = np.concatenate(inference_logits, 0)
                model.train()
                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = self.accuracy(inference_logits, gold_labels)

                result = {'eval_loss': eval_loss,
                          'eval_F1': eval_accuracy,
                          'global_step': global_step,
                          'loss': train_loss}

                output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')
                if eval_accuracy > best_acc and 'dev' in file:
                    print("=" * 80)
                    print("Best F1", eval_accuracy)
                    print("Saving Model......")
                    best_acc = eval_accuracy
                    # Save a trained model; unwrap a parallel wrapper if present.
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)

        if args.do_test:
            # Drop the in-memory model and reload the best saved weights.
            del model
            gc.collect()
            args.do_train = False
            model = BertForSequenceClassification.from_pretrained(os.path.join(args.output_dir, "pytorch_model.bin"), args,
                                                                  config=config)
            if args.fp16:
                model.half()
            model.to(self.device)
            if args.local_rank != -1:
                try:
                    from apex.parallel import DistributedDataParallel as DDP
                except ImportError:
                    raise ImportError(
                        "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

                model = DDP(model)
            elif args.n_gpu > 1:
                model = torch.nn.DataParallel(model)

            for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
                all_logits = []
                gold_labels = []
                eval_examples = self.read_examples(os.path.join(args.data_dir, file), is_training=False)
                print('exa',len(eval_examples))
                eval_features = self.convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length)
                eval_data = build_dataset(eval_features)
                # Run prediction for full data
                eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data),
                                             batch_size=eval_batch_size)

                model.eval()
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)

                    with torch.no_grad():
                        logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                       attention_mask=input_mask).detach().cpu().numpy()
                    all_logits.append(logits)
                    # Labels are only collected for scoring, so they never
                    # need to be moved to the model device.
                    gold_labels.append(label_ids.cpu().numpy())
                gold_labels = np.concatenate(gold_labels, 0)
                logits = np.concatenate(all_logits, 0)
                if flag == 'dev':
                    print(flag,self.accuracy(logits, gold_labels))
                if flag == 'test':
                    df = pd.read_csv(os.path.join(args.data_dir, file),names = ['id', 'content', 'title', 'label'])
                    predict = np.argmax(logits, axis=1).tolist()
                    print(df.shape[0])
                    print(len(predict))
                    df['labelpre'] = predict
                    df[['id','labelpre']].to_csv(os.path.join(args.output_dir, "sub.csv"),index=False,header = False)
Example #9
0
class TransformerBase(TrainableModel):
    """
    Transformers base model (for working with pytorch-transformers models)
    """
    MODEL_CONFIGURATIONS = {
        'bert': (BertConfig, BertTokenizer),
        'quant_bert': (QuantizedBertConfig, BertTokenizer),
        'xlnet': (XLNetConfig, XLNetTokenizer),
        'xlm': (XLMConfig, XLMTokenizer),
    }

    def __init__(self, model_type: str, model_name_or_path: str, labels: List[str] = None,
                 num_labels: int = None, config_name=None,
                 tokenizer_name=None, do_lower_case=False, output_path=None,
                 device='cpu', n_gpus=0):
        """
        Transformers base model (for working with pytorch-transformers models)

        Args:
            model_type (str): transformer model type
            model_name_or_path (str): model name or path to model
            labels (List[str], optional): list of labels. Defaults to None.
            num_labels (int, optional): number of labels. Defaults to None.
            config_name ([type], optional): configuration name. Defaults to None.
            tokenizer_name ([type], optional): tokenizer name. Defaults to None.
            do_lower_case (bool, optional): lower case input words. Defaults to False.
            output_path ([type], optional): model output path. Defaults to None.
            device (str, optional): backend device. Defaults to 'cpu'.
            n_gpus (int, optional): num of gpus. Defaults to 0.

        Raises:
            FileNotFoundError: [description]
        """
        assert model_type in self.MODEL_CONFIGURATIONS.keys(), "unsupported model_type"
        self.model_type = model_type
        self.model_name_or_path = model_name_or_path
        self.labels = labels
        self.num_labels = num_labels
        self.do_lower_case = do_lower_case
        if output_path is not None and not os.path.exists(output_path):
            raise FileNotFoundError('output_path is not found')
        self.output_path = output_path

        self.model_class = None
        config_class, tokenizer_class = self.MODEL_CONFIGURATIONS[model_type]
        self.config_class = config_class
        self.tokenizer_class = tokenizer_class

        self.tokenizer_name = tokenizer_name
        self.tokenizer = self._load_tokenizer(self.tokenizer_name)
        self.config_name = config_name
        self.config = self._load_config(config_name)

        self.model = None
        self.device = device
        self.n_gpus = n_gpus

        self._optimizer = None
        self._scheduler = None

    def to(self, device='cpu', n_gpus=0):
        if self.model is not None:
            self.model.to(device)
            if n_gpus > 1:
                self.model = torch.nn.DataParallel(self.model)
        self.device = device
        self.n_gpus = n_gpus

    @property
    def optimizer(self):
        """Optimizer attached to this model, or None when none has been set."""
        return self._optimizer

    @optimizer.setter
    def optimizer(self, opt):
        # Stored as given; no validation is performed.
        self._optimizer = opt

    @property
    def scheduler(self):
        """Learning-rate scheduler attached to this model, or None when none has been set."""
        return self._scheduler

    @scheduler.setter
    def scheduler(self, sch):
        # Stored as given; no validation is performed.
        self._scheduler = sch

    def setup_default_optimizer(self,
                                weight_decay: float = 0.0,
                                learning_rate: float = 5e-5,
                                adam_epsilon: float = 1e-8,
                                warmup_steps: int = 0,
                                total_steps: int = 0):
        """
        Attach an AdamW optimizer and a linear warmup/decay schedule to the model.

        Parameters whose name contains 'bias' or 'LayerNorm.weight' are put in
        a group with zero weight decay; all others use ``weight_decay``.

        Args:
            weight_decay (float, optional): decay for the regular group. Defaults to 0.0.
            learning_rate (float, optional): optimizer learning rate. Defaults to 5e-5.
            adam_epsilon (float, optional): optimizer epsilon. Defaults to 1e-8.
            warmup_steps (int, optional): scheduler warmup steps. Defaults to 0.
            total_steps (int, optional): scheduler total steps. Defaults to 0.
        """
        exempt_markers = ['bias', 'LayerNorm.weight']
        decayed, exempt = [], []
        # One pass over the parameters, keeping their original order in each group.
        for name, param in self.model.named_parameters():
            if any(marker in name for marker in exempt_markers):
                exempt.append(param)
            else:
                decayed.append(param)
        param_groups = [
            {'params': decayed, 'weight_decay': weight_decay},
            {'params': exempt, 'weight_decay': 0.0},
        ]
        self.optimizer = AdamW(param_groups, lr=learning_rate, eps=adam_epsilon)
        self.scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=warmup_steps,
                                              t_total=total_steps)

    def _load_config(self, config_name=None):
        config = self.config_class.from_pretrained(config_name if config_name
                                                   else self.model_name_or_path,
                                                   num_labels=self.num_labels)
        return config

    def _load_tokenizer(self, tokenizer_name=None):
        tokenizer = self.tokenizer_class.from_pretrained(
            tokenizer_name if tokenizer_name
            else self.model_name_or_path, do_lower_case=self.do_lower_case)
        return tokenizer

    def save_model(self, output_dir: str, save_checkpoint: bool = False, args=None):
        """
        Save model/tokenizer/arguments to given output directory

        The model weights are always written. Unless ``save_checkpoint`` is
        set, the tokenizer (when present), a ``labels.txt`` file with one
        label per line, and ``args`` (when given, as ``training_args.bin``)
        are written as well. When no labels are set, ``labels.txt`` is written
        empty.

        Args:
            output_dir (str): path to output directory; created if missing
            save_checkpoint (bool, optional): save as checkpoint, i.e. model
                weights only. Defaults to False.
            args (optional): arguments object to save. Defaults to None.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", output_dir)
        # Unwrap a parallel wrapper so the underlying model is saved.
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(output_dir)
        if save_checkpoint:
            return

        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)
        # labels defaults to None in the constructor; treat that as "no labels"
        # instead of failing with a TypeError half-way through the save.
        labels = self.labels if self.labels is not None else []
        with io.open(os.path.join(output_dir, 'labels.txt'), 'w', encoding='utf-8') as fw:
            for label in labels:
                fw.write('{}\n'.format(label))
        if args is not None:
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))

    @classmethod
    def load_model(cls, model_path: str, model_type: str, *args, **kwargs):
        """
        Create a TranformerBase deom from given path

        Args:
            model_path (str): path to model
            model_type (str): model type

        Returns:
            TransformerBase: model
        """
        # Load a trained model and vocabulary from given path
        if not os.path.exists(model_path):
            raise FileNotFoundError
        with io.open(model_path + os.sep + 'labels.txt') as fp:
            labels = [l.strip() for l in fp.readlines()]
        return cls(model_type=model_type, model_name_or_path=model_path, labels=labels, *args, **kwargs)

    @staticmethod
    def get_train_steps_epochs(max_steps: int,
                               num_train_epochs: int,
                               gradient_accumulation_steps: int,
                               num_samples: int):
        """
        get train steps and epochs

        Args:
            max_steps (int): max steps
            num_train_epochs (int): num epochs
            gradient_accumulation_steps (int): gradient accumulation steps
            num_samples (int): number of samples

        Returns:
            Tuple: total steps, number of epochs
        """
        if max_steps > 0:
            t_total = max_steps
            num_train_epochs = max_steps // (num_samples // gradient_accumulation_steps) + 1
        else:
            t_total = num_samples // gradient_accumulation_steps * num_train_epochs
        return t_total, num_train_epochs

    def get_logits(self, batch):
        self.model.eval()
        inputs = self._batch_mapper(batch)
        outputs = self.model(**inputs)
        return outputs[-1]

    def _train(self,
               data_set: DataLoader,
               dev_data_set: Union[DataLoader, List[DataLoader]] = None,
               test_data_set: Union[DataLoader, List[DataLoader]] = None,
               gradient_accumulation_steps: int = 1,
               per_gpu_train_batch_size: int = 8,
               max_steps: int = -1,
               num_train_epochs: int = 3,
               max_grad_norm: float = 1.0,
               logging_steps: int = 50,
               save_steps: int = 100):
        """Run model training.

        Args:
            data_set (DataLoader): training batches
            dev_data_set (Union[DataLoader, List[DataLoader]], optional): dev
                loader(s) evaluated every ``logging_steps`` optimization steps
            test_data_set (Union[DataLoader, List[DataLoader]], optional): test
                loader(s) evaluated every ``logging_steps`` optimization steps
            gradient_accumulation_steps (int): number of batches accumulated
                before each optimizer/scheduler step
            per_gpu_train_batch_size (int): only used for the batch-size
                figures written to the log
            max_steps (int): when positive, stop after this many optimization
                steps (overrides ``num_train_epochs``)
            num_train_epochs (int): number of passes over ``data_set``
            max_grad_norm (float): gradient-norm clipping threshold
            logging_steps (int): log and evaluate every N optimization steps;
                values <= 0 disable it
            save_steps (int): save a checkpoint every N optimization steps;
                values <= 0 disable it
        """
        t_total, num_train_epochs = self.get_train_steps_epochs(max_steps,
                                                                num_train_epochs,
                                                                gradient_accumulation_steps,
                                                                len(data_set))
        if self.optimizer is None and self.scheduler is None:
            logger.info("Loading default optimizer and scheduler")
            self.setup_default_optimizer(total_steps=t_total)

        train_batch_size = per_gpu_train_batch_size * max(1, self.n_gpus)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(data_set.dataset))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per GPU/CPU = %d", per_gpu_train_batch_size)
        logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                    train_batch_size * gradient_accumulation_steps)
        logger.info("  Gradient Accumulation steps = %d", gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        self.model.zero_grad()
        train_iterator = trange(num_train_epochs, desc="Epoch")
        for _ in train_iterator:
            epoch_iterator = tqdm(data_set, desc="Train iteration")
            for step, batch in enumerate(epoch_iterator):
                # Evaluation below switches the model to eval mode, so reset it per batch
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)
                inputs = self._batch_mapper(batch)
                outputs = self.model(**inputs)
                loss = outputs[0]  # get loss

                if self.n_gpus > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % gradient_accumulation_steps == 0:
                    # Clip once on the fully accumulated gradient, right before the update
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
                    self.optimizer.step()
                    self.scheduler.step()
                    self.model.zero_grad()
                    global_step += 1

                    if logging_steps > 0 and global_step % logging_steps == 0:
                        # Log metrics and run evaluation on dev/test
                        for ds in [dev_data_set, test_data_set]:
                            if ds is None:  # got no data loader
                                continue
                            if isinstance(ds, DataLoader):
                                ds = [ds]
                            for d in ds:
                                logits, label_ids = self._evaluate(d)
                                self.evaluate_predictions(logits, label_ids)
                        logger.info('lr = {}'.format(self.scheduler.get_lr()[0]))
                        logger.info('loss = {}'.format((tr_loss - logging_loss) / logging_steps))
                        logging_loss = tr_loss

                    if save_steps > 0 and global_step % save_steps == 0:
                        # Save model checkpoint
                        self.save_model_checkpoint(output_path=self.output_path,
                                                   name='checkpoint-{}'.format(global_step))

                # Stop as soon as max_steps updates were made (not one step later)
                if 0 < max_steps <= global_step:
                    epoch_iterator.close()
                    break
            if 0 < max_steps <= global_step:
                train_iterator.close()
                break

        # tr_loss is a running sum; divide by the number of updates to report an average
        average_loss = tr_loss / global_step if global_step > 0 else 0.0
        logger.info(" global_step = %s, average loss = %s", global_step, average_loss)

    def _evaluate(self, data_set: DataLoader):
        """Run inference over ``data_set`` and gather the model outputs.

        Args:
            data_set (DataLoader): batches to feed through the model

        Returns:
            ``(logits, label_ids)`` as CPU tensors concatenated along dim 0
            when the first batch carries labels, the logits tensor alone when
            it does not, and ``None`` when ``data_set`` yields no batches.
        """
        logger.info("***** Running inference *****")
        logger.info(" Batch size: {}".format(data_set.batch_size))
        logit_chunks = []
        label_chunks = []
        collect_labels = False
        # Eval mode only has to be set once, not on every batch
        self.model.eval()
        for batch in tqdm(data_set, desc="Inference iteration"):
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = self._batch_mapper(batch)
                outputs = self.model(**inputs)
            has_labels = 'labels' in inputs
            # With labels the model output starts with (loss, logits); without
            # labels the logits come first.
            logits = outputs[1] if has_labels else outputs[0]
            if not logit_chunks:
                # The first batch decides whether labels are returned
                collect_labels = has_labels
            logit_chunks.append(logits.detach().cpu())
            if collect_labels:
                label_chunks.append(inputs['labels'].detach().cpu())

        if not logit_chunks:
            return None
        # Concatenate once at the end instead of re-copying on every batch
        preds = torch.cat(logit_chunks, dim=0)
        if not collect_labels:
            return preds
        return preds, torch.cat(label_chunks, dim=0)

    def _batch_mapper(self, batch):
        mapping = {'input_ids': batch[0],
                   'attention_mask': batch[1],
                   # XLM don't use segment_ids
                   'token_type_ids': batch[2] if self.model_type in ['bert', 'quant_bert', 'xlnet']
                   else None}
        if len(batch) == 4:
            mapping.update({'labels': batch[3]})
        return mapping

    def evaluate_predictions(self, logits, label_ids):
        """Score model predictions against labels (to be overridden).

        ``_train`` calls this with the output of ``_evaluate`` for every
        dev/test data loader it is given.

        Args:
            logits: model outputs collected by ``_evaluate``
            label_ids: labels collected by ``_evaluate``

        Raises:
            NotImplementedError: always; this implementation is a placeholder.
        """
        # Trailing space keeps the two adjacent literals from fusing into "tobe"
        raise NotImplementedError('evaluate_predictions method must be implemented in order to '
                                  'be used for dev/test set evaluation')

    def save_model_checkpoint(self, output_path: str, name: str):
        """
        Save the model as a checkpoint under ``output_path/name``.

        Args:
            output_path (str): directory that holds the checkpoints
            name (str): sub-directory name of this checkpoint
        """
        self.save_model(os.path.join(output_path, name), save_checkpoint=True)
def main(parser):
    """Train the KobertCRF NER model with periodic evaluation and checkpoints.

    Parses the command line, builds the vocabulary and tokenizer, the train
    and validation data loaders, the model, an AdamW optimizer with a linear
    warmup schedule, and then runs the training loop.

    Args:
        parser: argument parser; the parsed arguments must provide
            ``data_dir`` and ``model_dir``.

    Returns:
        Tuple ``(global_step, average_loss, best_steps)``: the number of
        optimization steps, the accumulated training loss divided by that
        number (0.0 if no step was made), and the global step at which the
        best validation accuracy was recorded.
    """
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)  # currently unused; kept so a missing arg still fails early
    model_dir = Path(args.model_dir)

    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer()  # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx

    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token_to_idx=token_to_idx)

    print("len(token_to_idx): ", len(token_to_idx))
    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)

    # load vocab & tokenizer (reads back the file written just above)
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    tokenizer = Tokenizer(vocab=vocab,
                          split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn,
                          maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab,
                                                    tokenizer=tokenizer,
                                                    maxlen=model_config.maxlen,
                                                    model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    tr_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir,
                                          model_dir=model_dir)
    tr_ds.set_transform_fn(
        transform_source_fn=ner_formatter.transform_source_fn,
        transform_target_fn=ner_formatter.transform_target_fn)
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=False)

    val_data_dir = data_in / "NER-master" / "validation_set"
    val_ds = NamedEntityRecognitionDataset(train_data_dir=val_data_dir,
                                           model_dir=model_dir)
    val_ds.set_transform_fn(
        transform_source_fn=ner_formatter.transform_source_fn,
        transform_target_fn=ner_formatter.transform_target_fn)
    val_dl = DataLoader(val_ds,
                        batch_size=model_config.batch_size,
                        shuffle=True,
                        num_workers=4,
                        drop_last=False)

    # Model
    model = KobertCRF(config=model_config, num_classes=len(tr_ds.ner_to_index))
    model.train()

    # optim
    train_examples_len = len(tr_ds)
    val_examples_len = len(val_ds)
    print("num of train: {}, num of val: {}".format(train_examples_len,
                                                    val_examples_len))

    # Biases and LayerNorm parameters are excluded from weight decay
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    # Total number of optimizer updates over the whole run
    t_total = len(
        tr_dl
    ) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=model_config.learning_rate,
                      eps=model_config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=model_config.warmup_steps,
                                     t_total=t_total)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # The model is not wrapped in DataParallel; n_gpu only affects loss averaging below
    n_gpu = torch.cuda.device_count()
    model.to(device)

    # save
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(tr_ds))
    logger.info("  Num Epochs = %d", model_config.epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                model_config.batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                model_config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # Added here for reproducibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for epoch in train_iterator:
        epoch_iterator = tqdm(tr_dl, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = (elm.to(device) for elm in batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids,
                                                     y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood

            if n_gpu > 1:
                # mean() to average on multi-gpu parallel training
                loss = loss.mean()
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                # Clip once on the fully accumulated gradient, right before the update
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               model_config.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Accuracy of the latest mini-batch over non-padding positions
                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags).to(
                        device)
                    mb_acc = (sequence_of_tags == y_real
                              ).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                print(
                    'epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'
                    .format(epoch + 1, global_step, tr_summary['loss'],
                            tr_summary['acc']))

                # training & evaluation log
                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    if model_config.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        eval_summary, list_of_y_real, list_of_pred_tags = evaluate(
                            model, val_dl)
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalars(
                            'loss', {
                                'train': (tr_loss - logging_loss) /
                                model_config.logging_steps,
                                'val': eval_summary["eval_loss"]
                            }, global_step)
                        tb_writer.add_scalars('acc', {
                            'train': tr_acc,
                            'val': eval_summary["eval_acc"]
                        }, global_step)
                        print(
                            "eval acc: {}, loss: {}, global steps: {}".format(
                                eval_summary['eval_acc'],
                                eval_summary['eval_loss'], global_step))
                    print("Average loss: {} at global step: {}".format(
                        (tr_loss - logging_loss) / model_config.logging_steps,
                        global_step))
                    logging_loss = tr_loss

                # save model
                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    eval_summary, list_of_y_real, list_of_pred_tags = evaluate(
                        model, val_dl)

                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir,
                                              'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # print() does not interpolate logging-style arguments; format explicitly
                    print("Saving model checkpoint to %s" % output_dir)
                    state = {
                        'global_step': global_step + 1,
                        'model_state_dict': model.state_dict(),
                        'opt_state_dict': optimizer.state_dict()
                    }
                    summary = {'train': tr_summary, 'eval': eval_summary}
                    summary_manager.update(summary)
                    print("summary: ", summary)
                    summary_manager.save('summary.json')

                    # Save
                    # Selection is based on accuracy (it should be val_acc rather than train_acc)
                    is_best = eval_summary["eval_acc"] >= best_dev_acc
                    if is_best:
                        best_dev_acc = eval_summary["eval_acc"]
                        best_dev_loss = eval_summary["eval_loss"]
                        best_steps = global_step

                        checkpoint_manager.save_checkpoint(
                            state,
                            'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(
                                epoch + 1, global_step, best_dev_acc))
                        print(
                            "Saving model checkpoint as best-epoch-{}-step-{}-acc-{:.3f}.bin"
                            .format(epoch + 1, global_step, best_dev_acc))

                        # print classification report and save confusion matrix
                        cr_save_path = model_dir / 'best-epoch-{}-step-{}-acc-{:.3f}-cr.csv'.format(
                            epoch + 1, global_step, best_dev_acc)
                        cm_save_path = model_dir / 'best-epoch-{}-step-{}-acc-{:.3f}-cm.png'.format(
                            epoch + 1, global_step, best_dev_acc)
                        save_cr_and_cm(val_dl,
                                       list_of_y_real,
                                       list_of_pred_tags,
                                       cr_save_path=cr_save_path,
                                       cm_save_path=cm_save_path)
                    else:
                        torch.save(
                            state,
                            os.path.join(
                                output_dir,
                                'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(
                                    epoch + 1, global_step,
                                    eval_summary["eval_acc"])))
                        print(
                            "Saving model checkpoint as model-epoch-{}-step-{}-acc-{:.3f}.bin"
                            .format(epoch + 1, global_step,
                                    eval_summary["eval_acc"]))

    tb_writer.close()
    # Guard against a run in which no optimizer step was ever taken
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    print("global_step = {}, average loss = {}".format(global_step,
                                                       average_loss))

    return global_step, average_loss, best_steps
Example #11
0
def train(train_dataset, model, tokenizer):
    """Fine-tune ``model`` on ``train_dataset`` using the module-level ``args`` dict.

    Args:
        train_dataset: dataset whose batches are laid out as
            (input ids, attention mask, segment ids, labels)
        model: model to train; its first output is used as the loss
        tokenizer: only forwarded to ``evaluate`` when
            ``args['evaluate_during_training']`` is set

    Returns:
        Tuple ``(global_step, average_loss)``: the number of optimization
        steps and the accumulated loss divided by that number (0.0 if no
        step was made).
    """
    tb_writer = SummaryWriter()

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])
    print("len(train_dataloader) "+ str(len(train_dataloader)))
    # Total number of optimizer updates over the whole run
    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    # Parameters whose name contains one of these substrings get no weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'], t_total=t_total)

    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")

    for _ in train_iterator:
        epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            loss = outputs[0].mean()  # model outputs are always tuple in pytorch-transformers (see doc)
            print("\r%f" % loss, end='')

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                # Clip once on the fully accumulated gradient, right before the update
                if args['fp16']:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])
                # The optimizer must step before the scheduler, otherwise the
                # first value of the learning-rate schedule is skipped
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    # Log metrics
                    if args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args['logging_steps'], global_step)
                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

    # Flush and release the TensorBoard writer
    tb_writer.close()

    # Guard against a run in which no optimizer step was ever taken
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss
def train(train_dataset, model, train_params, class_weights=None):
    """Train ``model`` on ``train_dataset`` with AdamW and a linear warmup schedule.

    Args:
        train_dataset: dataset whose batches provide input ids at index 0,
            the attention mask at index 1 and labels at index 3
        model: model to train; it is called with ``input_ids``,
            ``attention_mask``, ``labels`` and ``class_weights`` and its
            first output is used as the loss
        train_params: mapping with the keys ``batch_size``, ``epochs``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_steps``, ``seed`` and ``max_grad_norm``
        class_weights: optional numpy array, converted to a float tensor on
            the training device and passed to the model

    Returns:
        Tuple ``(global_step, average_loss)``: the number of optimization
        steps and the accumulated loss divided by that number (0.0 if no
        step was made).
    """
    # TODO: magic numbers, defaults in run_glue.py
    batch_size = train_params['batch_size']
    n_epochs = train_params['epochs']
    weight_decay = train_params['weight_decay']
    learning_rate = train_params['learning_rate']
    adam_epsilon = train_params['adam_epsilon']
    warmup_steps = train_params['warmup_steps']
    seed = train_params['seed']
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_grad_norm = train_params['max_grad_norm']

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=batch_size,
    )

    if class_weights is not None:
        class_weights = torch.from_numpy(class_weights).float().to(device)

    # Parameters whose name contains one of these substrings get no weight decay
    no_decay = {'bias', 'LayerNorm.weight'}
    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            weight_decay,
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0,
        },
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=learning_rate,
        eps=adam_epsilon,
    )
    # The loop below steps once per batch for n_epochs epochs, so the schedule
    # must span batches * epochs updates (not batches // epochs).
    scheduler = WarmupLinearSchedule(
        optimizer=optimizer,
        warmup_steps=warmup_steps,
        t_total=len(train_dataloader) * n_epochs,
    )

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    train_iter = trange(n_epochs, desc='Epoch')
    set_seed(seed=seed)
    for _ in train_iter:
        epoch_iter = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iter):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            # batch[2] (token type ids) is intentionally not passed to the model
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3],
                'class_weights': class_weights,
            }

            outputs = model(**inputs)
            loss = outputs[0]

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            tr_loss += loss.item()

            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

    # Guard against an empty dataset / zero epochs
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss
def main(args):
    """Train the MGDST dialogue-state-tracking model, then test the best checkpoint.

    The data utilities are imported according to ``args.dataset``; the model
    is trained with separate AdamW optimizers for encoder and decoder,
    evaluated on the dev set every ``args.eval_epoch`` epochs, and the
    checkpoint with the best dev joint accuracy is finally evaluated on the
    test set.

    Args:
        args: parsed command-line namespace (dataset name, data/vocab/config
            paths, optimisation hyper-parameters, ``save_dir``, ...).

    Raises:
        ValueError: if ``args.dataset`` is not one of the supported names.
    """
    def worker_init_fn(worker_id):
        # Give every DataLoader worker its own deterministic numpy seed
        np.random.seed(args.random_seed + worker_id)

    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    elif args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    elif args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    elif args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    elif args.dataset == 'MultiWOZ2.1':
        from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta
        # Close the ontology file deterministically instead of leaking the handle
        with open(args.ontology_data_path) as ontology_file:
            ontology = json.load(ontology_file)
        SLOT, ontology = make_slot_meta(ontology)
    else:
        # Without this the function fails later with an obscure unbound-name error
        raise ValueError("Unsupported dataset: %r" % (args.dataset,))

    n_gpu = 0
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()

    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # makedirs also creates missing parent directories and tolerates an existing one
    os.makedirs(args.save_dir, exist_ok=True)

    slot_meta = SLOT
    op2id = OP
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_data_raw = prepare_dataset(data_scale=args.train_scale,
                                     data_path=args.train_data_path,
                                     tokenizer=tokenizer,
                                     slot_meta=slot_meta,
                                     size_window=args.train_size_window,
                                     max_seq_length=args.max_seq_length,
                                     multi_granularity=args.train_MG,
                                     data_type='train')

    train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta,
                                 args.max_seq_length, rng, args.word_dropout)
    print("# train examples %d" % len(train_data_raw))

    dev_data_raw = prepare_dataset(data_scale=1.0,
                                   data_path=args.dev_data_path,
                                   tokenizer=tokenizer,
                                   slot_meta=slot_meta,
                                   size_window=args.test_size_window,
                                   max_seq_length=args.max_seq_length,
                                   multi_granularity=args.test_MG,
                                   data_type='dev')
    print("# dev examples %d" % len(dev_data_raw))

    test_data_raw = prepare_dataset(data_scale=1.0,
                                    data_path=args.test_data_path,
                                    tokenizer=tokenizer,
                                    slot_meta=slot_meta,
                                    size_window=args.test_size_window,
                                    max_seq_length=args.max_seq_length,
                                    multi_granularity=args.test_MG,
                                    data_type='test')
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    model = MGDST(model_config, len(op2id), len(slot_meta))

    # Load the pretrained BERT weights: drop the 'cls.' heads and rename the
    # keys ('bert.' prefix, gamma/beta) to match the encoder's parameter names
    ckpt = torch.load(args.bert_ckpt_path, map_location='cpu')
    ckpt1 = {
        k.replace('bert.', '').replace('gamma',
                                       'weight').replace('beta', 'bias'): v
        for k, v in ckpt.items() if 'cls.' not in k
    }
    model.encoder.bert.load_state_dict(ckpt1)

    # NOTE(review): `device` is not defined in this function; presumably a
    # module-level global - confirm.
    model.to(device)

    num_train_steps = int(
        len(train_data_raw) / args.batch_size * args.n_epochs)

    # Encoder: biases and LayerNorm parameters are excluded from weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [{
        'params': [
            p for n, p in enc_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = WarmupLinearSchedule(enc_optimizer,
                                         int(num_train_steps *
                                             args.enc_warmup),
                                         t_total=num_train_steps)

    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer,
                                         int(num_train_steps *
                                             args.dec_warmup),
                                         t_total=num_train_steps)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    # The same file is written for every new best model and read back for testing
    best_ckpt_path = os.path.join(
        args.save_dir, 'model_best_gran[%s]_scale[%s]_seed[%s].bin' %
        (str(args.train_size_window), str(args.train_scale), args.random_seed))

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}
    total_step = 0
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = [
                b.to(device) if not isinstance(b, int) else b for b in batch
            ]
            input_ids, input_mask, segment_ids, op_ids, gen_ids = batch

            state_scores, span_scores = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask)

            loss_state = loss_fnc(
                state_scores.contiguous().view(-1, len(op2id)),
                op_ids.contiguous().view(-1))
            try:
                loss_span = masked_cross_entropy_for_value(
                    span_scores.contiguous(), gen_ids.contiguous(),
                    tokenizer.vocab['[PAD]'])
            except Exception as e:
                # Swallowing the error would leave loss_span undefined (or stale
                # from the previous batch); report it and propagate instead.
                print(e)
                raise
            # Weighted sum of the state-operation loss and the span loss
            loss = loss_state * 0.8 + loss_span * 0.2
            batch_loss.append(loss.item())

            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()

            total_step += 1

            if step % 100 == 0:
                print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, span_loss : %.3f"
                      % (epoch + 1, args.n_epochs, step,
                         len(train_dataloader), np.mean(batch_loss),
                         loss_state.item(), loss_span.item()))
                batch_loss = []

        if (epoch + 1) % args.eval_epoch == 0:
            print('total_step: ', total_step)
            eval_res = model_evaluation(make_turn_label, postprocessing,
                                        state_equal, OP, model, dev_data_raw,
                                        tokenizer, slot_meta, epoch + 1,
                                        args.test_size_window, args.test_MG)
            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                # Unwrap DataParallel so the saved keys carry no 'module.' prefix
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                torch.save(model_to_save.state_dict(), best_ckpt_path)
            print("Best Score : ", best_score)
            print("\n")

            # Early stopping once the best epoch is more than `patience` epochs old
            if epoch > args.patience_start_epoch and best_score[
                    'epoch'] + args.patience < epoch:
                print("out of patience...")
                break

    print("Test using best model...")
    best_epoch = best_score['epoch']
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(best_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)

    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.test_size_window, args.test_MG)
Example #14
0
def _save_vae_component(component, output_dir, args):
    """Save one VAE sub-model (encoder or decoder) plus the run args to ``output_dir``.

    The directory is created if missing. ``component.save_pretrained`` writes
    the model and ``args`` is pickled next to it as ``training_args.bin``.

    When ``args.use_philly`` is set, a failed save is retried until it
    succeeds (the original best-effort behaviour); each failure is logged
    instead of being silently discarded. Otherwise the error propagates.
    """
    os.makedirs(output_dir, exist_ok=True)
    while True:
        try:
            component.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
            return
        except Exception:
            # ``Exception`` (not a bare except) so Ctrl-C / SystemExit can
            # still interrupt the retry loop.
            if not args.use_philly:
                raise
            logger.warning("Saving checkpoint to %s failed; retrying", output_dir, exc_info=True)


def train(args, train_dataloader, model_vae, encoder_tokenizer, decoder_tokenizer, table_name):
    """Train the VAE model with a scheduled KL weight (beta).

    Args:
        args: Run configuration. Fields read here include ``local_rank``,
            ``n_gpu``, ``device``, ``per_gpu_train_batch_size``, ``max_steps``,
            ``num_train_epochs``, ``gradient_accumulation_steps``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_steps``, ``fp16``, ``fp16_opt_level``, ``max_grad_norm``,
            ``beta``, ``ratio_increase``, ``ratio_zero``, ``mlm``,
            ``use_deterministic_connect``, ``use_philly``, ``logging_steps``,
            ``save_steps``, ``evaluate_during_training`` and ``output_dir``.
            ``train_batch_size`` (and ``num_train_epochs`` when ``max_steps``
            is positive) are overwritten.
        train_dataloader: Sized iterable with a ``num_examples`` attribute;
            each batch unpacks to ``(tokenized_text0, tokenized_text1,
            tokenized_text_lengths)``.
        model_vae: Model called as ``model_vae(inputs, labels)`` returning
            ``(loss_rec, loss_kl, loss)``. It must expose ``args``,
            ``encoder`` and ``decoder`` attributes.
        encoder_tokenizer: Tokenizer for the encoder input; its ``len`` is
            used to sanity-check token ids.
        decoder_tokenizer: Tokenizer for the decoder input; same use.
        table_name: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(global_step, mean_loss)`` where ``mean_loss`` is the summed
        training loss divided by the number of optimiser steps (``0.0`` when
        no optimiser step was taken).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model_vae.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model_vae.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model_vae, optimizer = amp.initialize(model_vae, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model_vae = torch.nn.DataParallel(model_vae, device_ids=range(args.n_gpu)).to(args.device)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model_vae = torch.nn.parallel.DistributedDataParallel(model_vae, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # The parallel wrappers expose the real model as ``.module``; on a single
    # device there is no wrapper, so fall back to the model itself. All
    # attribute access on the VAE (args / encoder / decoder) goes through this.
    vae_core = model_vae.module if hasattr(model_vae, 'module') else model_vae

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_dataloader.num_examples)
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0

    model_vae.zero_grad()

    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])

    # One beta value per training iteration, indexed by the global iteration number.
    n_iter = int(args.num_train_epochs) * len(train_dataloader)
    beta_t_list = frange_cycle_zero_linear(n_iter, start=0.0, stop=args.beta,  n_cycle=1, ratio_increase=args.ratio_increase, ratio_zero=args.ratio_zero)

    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            tokenized_text0, tokenized_text1, tokenized_text_lengths = batch

            # Skip batches containing token ids beyond the tokenizer size.
            # NOTE(review): the comparison is ``>``, so an id equal to
            # len(tokenizer) is let through -- confirm whether ``>=`` is intended.
            if (tokenized_text0>len(encoder_tokenizer)).sum().item()>0.0 or (tokenized_text1>len(decoder_tokenizer)).sum().item()>0.0:
                logger.warning("Skipping batch %d of epoch %d: token id exceeds tokenizer size", step, epoch)
                continue

            # prepare input-output data for reconstruction; the labels are
            # always the decoder-side tokens, whatever mask_tokens returned.
            inputs, labels = mask_tokens(tokenized_text0, encoder_tokenizer, args) if args.mlm else (tokenized_text0, tokenized_text1)
            labels = tokenized_text1

            inputs = inputs.to(args.device)
            labels = labels.to(args.device)

            model_vae.train()

            global_iter = step + epoch * len(epoch_iterator)
            beta_t = beta_t_list[global_iter]
            vae_core.args.beta = beta_t

            # fb_mode: 0 while beta is zero, 1 otherwise, 2 when the
            # deterministic connection is requested.
            if beta_t == 0.0:
                vae_core.args.fb_mode = 0
            else:
                vae_core.args.fb_mode = 1

            if args.use_deterministic_connect:
                vae_core.args.fb_mode = 2

            loss_rec, loss_kl, loss = model_vae(inputs, labels)

            # Chunyuan: loss_rec size is [4], while latent_z size is [12]
            if args.n_gpu > 1:
                loss_rec = loss_rec.mean()  # mean() to average on multi-gpu parallel training
                loss_kl = loss_kl.mean()
                loss = loss.mean()

            if args.use_philly:
                print("PROGRESS: {}%".format(round(100 * (step +  epoch*len(epoch_iterator) ) /(int(args.num_train_epochs) *  len(epoch_iterator)) , 4)))
                print("EVALERR: {}%".format(loss_rec))

            epoch_iterator.set_description(
                (
                    f'iter: {global_iter}; loss: {loss.item():.3f}; '
                    f'loss_rec: {loss_rec.item():.3f}; loss_kl: {loss_kl.item():.3f}; '
                    f'beta: {vae_core.args.beta:.3f}'
                )
            )

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model_vae.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model_vae.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model_vae, encoder_tokenizer, decoder_tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Encoder and decoder are checkpointed into separate directories.
                    _save_vae_component(
                        vae_core.encoder,
                        os.path.join(args.output_dir, 'checkpoint-encoder-{}'.format(global_step)),
                        args)
                    _save_vae_component(
                        vae_core.decoder,
                        os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step)),
                        args)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against a run that never reached an optimiser step.
    mean_loss = tr_loss / global_step if global_step else 0.0
    return global_step, mean_loss
def main():
    """Fine-tune a RoBERTa sequence classifier and periodically evaluate it.

    Parses command-line arguments, seeds the RNGs, builds the train/test
    datasets and loaders, loads ``RobertaForSequenceClassification`` with a
    two-row token-type embedding, and (when ``--do_train``) runs the training
    loop. Every ``save_checkpoints_steps`` batches within an epoch the model
    is evaluated on the test loader: logits are written to
    ``logits_test.txt``, metrics appended to ``eval_results_test.txt``, and
    the state dict is saved as ``model.pt`` whenever ``R10@1 + R10@2``
    improves.

    Raises:
        ValueError: on an invalid ``gradient_accumulation_steps``, when
            neither training nor testing is requested, when
            ``max_seq_length`` exceeds the model's position embeddings, or
            when training is requested into a non-empty output directory.

    NOTE(review): ``--do_lower_case``, ``--do_train`` and ``--do_test`` are
    ``store_true`` flags with ``default=True``, so they cannot be switched
    off from the command line -- confirm whether that is intended.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/',
        type=str,
        required=False,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default='ubuntu',
                        type=str,
                        required=False,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default='/hdd/lujunyu/model/chatbert/ubuntu_roberta_si_aug/',
        type=str,
        required=False,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--dialog_augmentation_path",
        default=
        '/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/ubuntu_train_augment_largethan_13.txt',
        type=str,
        help="Whether to use augmentation")
    ## Other parameters
    parser.add_argument(
        "--init_model_name",
        default='roberta-base',
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size",
                        default=600,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=200,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_steps",
        default=0.0,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=1e-3,
                        type=float,
                        help="weight_decay")
    parser.add_argument("--save_checkpoints_steps",
                        default=3000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=10,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # --train_batch_size is the total batch per optimiser update; each
    # forward/backward pass sees this fraction of it.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # The parser defines --do_test (there is no --do_eval), so check that flag.
    if not args.do_train and not args.do_test:
        raise ValueError(
            "At least one of `do_train` or `do_test` must be True.")

    bert_config = RobertaConfig.from_pretrained(args.init_model_name,
                                                num_labels=2)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".
                format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(
        args.init_model_name, do_lower_case=args.do_lower_case)
    # Train on the augmented corpus when a path is given, else on train.txt.
    if args.dialog_augmentation_path:
        train_dataset = UbuntuDatasetForRoberta(
            file_path=args.dialog_augmentation_path,
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer)
    else:
        train_dataset = UbuntuDatasetForRoberta(
            file_path=os.path.join(args.data_dir, "train.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer)
    eval_dataset = UbuntuDatasetForRoberta(file_path=os.path.join(
        args.data_dir, "test.txt"),
                                           max_seq_length=args.max_seq_length,
                                           tokenizer=tokenizer)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        sampler=RandomSampler(train_dataset),
        num_workers=4)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=args.eval_batch_size,
        sampler=SequentialSampler(eval_dataset),
        num_workers=8)

    model = RobertaForSequenceClassification.from_pretrained(
        args.init_model_name, config=bert_config)
    # Swap in a 2-row token-type embedding so segment ids 0/1 can be used; its
    # weight is the pretrained weight tiled twice along dim 0 (presumably the
    # pretrained table has a single row -- TODO confirm).
    model.config.type_vocab_size = 2
    single_emb = model.roberta.embeddings.token_type_embeddings
    model.roberta.embeddings.token_type_embeddings = torch.nn.Embedding(
        2, single_emb.embedding_dim)
    model.roberta.embeddings.token_type_embeddings.weight = torch.nn.Parameter(
        single_emb.weight.repeat([2, 1]))
    model.to(device)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
        # Prepare optimizer: every named parameter is optimised; those whose
        # name contains a `no_decay` substring get no weight decay.
        param_optimizer = list(model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=num_train_steps)
    else:
        optimizer = None
        scheduler = None

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    best_metric = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        logits_path = os.path.join(args.output_dir, 'logits_test.txt')

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                # Periodic evaluation, keyed on the batch index within the
                # epoch (not on global_step).
                if (step + 1) % args.save_checkpoints_steps == 0:
                    model.eval()
                    eval_loss = 0
                    nb_eval_steps = 0
                    logits_all = []
                    # Context manager so the logits file is closed (and
                    # flushed) even if evaluation raises part-way through.
                    with open(logits_path, 'w') as logits_file:
                        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                            input_ids = input_ids.to(device)
                            input_mask = input_mask.to(device)
                            segment_ids = segment_ids.to(device)
                            label_ids = label_ids.to(device)

                            with torch.no_grad():
                                tmp_eval_loss, logits = model(
                                    input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)

                            logits = logits.detach().cpu().numpy()
                            logits_all.append(logits)
                            label_ids = label_ids.cpu().numpy()

                            # One line per example: "_\t<logit0>,<logit1>\t<label>".
                            for logit, label in zip(logits, label_ids):
                                logit = '{},{}'.format(logit[0], logit[1])
                                logits_file.write('_\t{}\t{}\n'.format(logit, label))

                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_steps += 1

                    logits_all = np.concatenate(logits_all, axis=0)
                    eval_loss = eval_loss / nb_eval_steps

                    result = evaluate(logits_path)
                    result.update({'eval_loss': eval_loss})

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results_test.txt")
                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    ### Save the best checkpoint
                    if best_metric < result['R10@1'] + result['R10@2']:
                        try:  ### Remove 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict,
                                   os.path.join(args.output_dir, "model.pt"))
                        best_metric = result['R10@1'] + result['R10@2']
                        logger.info('Saving the best model in {}'.format(
                            os.path.join(args.output_dir, "model.pt")))

                        ### visualize bad cases of the best model
                        logger.info('Saving Bad cases...')
                        visualize_bad_cases(logits=logits_all,
                                            input_file_path=os.path.join(
                                                args.data_dir, 'test.txt'),
                                            output_file_path=os.path.join(
                                                args.output_dir,
                                                'test_bad_cases.txt'))

                    # Back to training mode after the evaluation pass.
                    model.train()
def train(args, train_dataset, model, tokenizer, label_2test_array):
    """ Train the model """

    num_labels = len(label_2test_array)

    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])

    ## track best loss on eval set ??
    eval_loss = np.inf
    last_best = 0

    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch_counter in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # inputs, labels, attention_mask = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            ## !!!  WE ARE NOT GOING TO TRAIN MASKED-LM

            ## also, the batch will have this ordering
            # return (torch.tensor(self.attention_mask[item]), torch.tensor(self.examples[item]),
            #       torch.LongTensor(self.label1hot[item]), torch.LongTensor(self.label_mask[item]),
            #       torch.tensor(self.token_type[item]) )

            max_len_in_batch = int(torch.max(torch.sum(
                batch[0], 1)))  ## only need max len
            attention_mask = batch[0][:, 0:max_len_in_batch].to(args.device)
            inputs = batch[1][:, 0:max_len_in_batch].to(args.device)
            labels = batch[2].to(
                args.device)  ## already in batch_size x num_label
            labels_mask = batch[3][:, 0:max_len_in_batch].to(
                args.device
            )  ## extract out labels from the array input... probably doesn't need this to be in GPU
            token_type = batch[4][:, 0:max_len_in_batch].to(args.device)

            ppi_vec = batch[5].unsqueeze(1).expand(
                inputs.shape[0], max_len_in_batch,
                256).to(args.device)  ## make 3D batchsize x 1 x dim

            model.train()

            # call to the @model
            # def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
            #   position_ids=None, head_mask=None, attention_mask_label=None):

            outputs = model(inputs,
                            token_type_ids=token_type,
                            attention_mask=attention_mask,
                            labels=labels,
                            position_ids=None,
                            attention_mask_label=labels_mask,
                            prot_vec=ppi_vec
                            )  # if args.mlm else model(inputs, labels=labels)

            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer,
                                           label_2test_array)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)

                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        ## end 1 epoch
        results = evaluate(args, model, tokenizer, label_2test_array)
        if results['eval_loss'] < eval_loss:
            eval_loss = results['eval_loss']
            last_best = epoch_counter
            break_early = False
            print(
                '\nupdate lowest loss on epoch {}, {}\nreset break_early to False, see break_early variable {}'
                .format(epoch_counter, eval_loss, break_early))
        else:
            if epoch_counter - last_best > 5:  ## break counter after 5 epoch
                # break ## break early
                break_early = True
                print(
                    'epoch {} set break_early to True, see break_early variable {}'
                    .format(epoch_counter, break_early))

        if break_early:
            train_iterator.close()
            print("**** break early ****")
            break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #17
0
def main():
    """Fine-tune a GPT-2 classification-head model and save it under ``../models/``.

    Parses the command-line options, loads the model and tokenizer from
    ``../models/<model_dir>``, builds either the SNLI or the MI dataset, and
    splits it into train / test / validation parts (roughly 85% / 10% / 5%).
    With ``--eval`` the test split is evaluated and the function returns.
    Otherwise the model is trained for ``--num_train_epochs`` epochs; after
    every epoch the validation accuracy is computed and the weights, config
    and vocabulary are written to ``../models/<output_dir>``.  Finally the
    run statistics are pickled to ``../models/exp_info.txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir",
                        default='345M_origin',
                        type=str,
                        required=False,
                        help="The directory of the model to be tuned.")
    parser.add_argument(
        "--output_dir",
        default='mi_tuned',
        type=str,
        required=False,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.1)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--snli', action='store_true')
    parser.add_argument('--eval', action='store_true')
    args = parser.parse_args()
    print(args)

    # ====== Set random seed =========
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # ======= Prepare ==========
    logging.basicConfig(level=logging.INFO)
    USE_CUDA = torch.cuda.is_available()

    # ====== Load GPT2 model ========
    model_dir = "../models/" + args.model_dir
    model = GPT2ClassHeadsModel.from_pretrained(model_dir)
    if USE_CUDA:
        model.cuda()
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    print('Model loaded.')

    # =============== Load & process data ==============
    # NOTE: pickle.load can execute arbitrary code; only point this at a
    # trusted, locally produced file.  The context manager makes sure the
    # handle is closed (the file used to stay open for the whole run).
    with open('../data_processed/x_y_meta_10turn', 'rb') as pickle_handler:
        x_y_meta = pickle.load(pickle_handler)
    if args.snli:
        print("Using SNLI data.")
        gpt_data = SnliDataset(tokenizer)
    else:
        print("Using mi data.")
        gpt_data = GptDataset_nli(x_y_meta,
                                  tokenizer,
                                  augment=True,
                                  num_turns=10)

    print("Dataset initialized.")
    print("samples:", len(gpt_data))
    test_size = int(len(gpt_data) * 0.10)
    val_size = int(len(gpt_data) * 0.05)
    gpt_train, gpt_test, gpt_val = torch.utils.data.random_split(
        gpt_data, [len(gpt_data) - test_size - val_size, test_size, val_size])

    data_loader = DataLoader(dataset=gpt_train,
                             batch_size=args.train_batch_size,
                             shuffle=True,
                             drop_last=True,
                             collate_fn=collate_fn_nli)
    test_loader = DataLoader(dataset=gpt_test,
                             batch_size=1,
                             shuffle=False,
                             drop_last=False,
                             collate_fn=collate_fn_nli)
    val_loader = DataLoader(dataset=gpt_val,
                            batch_size=1,
                            shuffle=False,
                            drop_last=False,
                            collate_fn=collate_fn_nli)
    if args.eval:
        # `eval` here is the project's evaluation routine (it shadows the
        # builtin); it is defined elsewhere in the file.
        print(eval(test_loader, model))
        return

    # ========== Prepare optimizer =============
    # Two optimizers are used: one for the transformer body and one for the
    # classifier head.  Biases and LayerNorm parameters get no weight decay.
    # The decay value now honours --weight_decay (default 0.01, which is the
    # value that used to be hard-coded here).
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and 'classifier' not in n
        ],
        'weight_decay': args.weight_decay,
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and 'classifier' not in n
        ],
        'weight_decay': 0.0,
    }]

    optimizer_grouped_parameters_classifier = [{
        'params': [
            p for n, p in param_optimizer
            if 'classifier' in n and 'bias' not in n
        ],
        'weight_decay': args.weight_decay,
        'lr': args.learning_rate,
    }, {
        'params':
        [p for n, p in param_optimizer if 'classifier' in n and 'bias' in n],
        'weight_decay': 0.00,
        'lr': args.learning_rate,
    }]
    num_train_optimization_steps = len(
        gpt_train) * args.num_train_epochs // args.train_batch_size
    # Honour --warmup_proportion (default 0.1 == the former hard-coded value).
    num_warmup_steps = int(
        num_train_optimization_steps) * args.warmup_proportion

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=True)
    optimizer_classifier = AdamW(optimizer_grouped_parameters_classifier,
                                 lr=args.learning_rate,
                                 correct_bias=True)
    # Only the body optimizer is attached to the LR schedule; the classifier
    # optimizer keeps a constant learning rate.
    scheduler = pytorch_transformers.optimization.WarmupLinearSchedule(
        optimizer,
        warmup_steps=num_warmup_steps,
        t_total=num_train_optimization_steps)

    # Where checkpoints and the experiment summary are written.  Bound before
    # the loop so the final dump also works when no epoch is run.
    output_dir = '../models/'
    save_dir = output_dir + args.output_dir

    # Training
    print("Start training.")
    exp_average_loss = None
    eval_accuracy = None
    train_losses = []
    eval_losses = []
    for epo in trange(int(args.num_train_epochs), desc="Epoch"):
        # Re-assert training mode every epoch in case the evaluation routine
        # switched the model to eval mode (its body is not visible here).
        model.train()
        tqdm_bar = tqdm(data_loader, desc="Training")
        correct = 0
        seen = 0
        for x, type_x, pos_x, lm_x, label in tqdm_bar:
            loss, logits = model(x,
                                 position_ids=pos_x,
                                 token_type_ids=type_x,
                                 labels=label)
            pred = torch.argmax(logits, dim=1)
            # Count matches in one tensor op instead of one .item() per row.
            correct += (pred.view(-1) == label.view(-1)).sum().item()
            seen += x.shape[0]

            loss.backward()
            optimizer.step()
            optimizer_classifier.step()
            scheduler.step()
            optimizer.zero_grad()
            optimizer_classifier.zero_grad()

            # Exponential moving average of the batch loss for display.
            batch_loss = loss.item()
            if exp_average_loss is None:
                exp_average_loss = batch_loss
            else:
                exp_average_loss = 0.7 * exp_average_loss + 0.3 * batch_loss
            tqdm_bar.set_postfix(loss=exp_average_loss, correct=correct)

        # drop_last=True skips the trailing partial batch, so normalise by the
        # number of samples actually seen rather than len(gpt_train).
        accuracy = correct / max(seen, 1)
        print("Accuracy for epoch {} is {}.\t Average loss:{}".format(
            epo, accuracy, exp_average_loss))
        train_losses.append([accuracy, exp_average_loss])

        eval_accuracy = eval(val_loader, model)
        print("Eval accuracy: {}".format(eval_accuracy))
        eval_losses.append(eval_accuracy)

        # Early stopping on decreasing eval accuracy was disabled in the
        # original (`if False:`), so the model is saved after every epoch.
        # ==== Save the model ====
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        os.makedirs(save_dir, exist_ok=True)
        output_model_file = os.path.join(save_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(save_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(save_dir)

    with open(output_dir + 'exp_info.txt', 'wb') as f:
        d = {
            'args': args,
            'train_info': train_losses,
            # Accuracy of the last evaluated epoch (None if no epoch ran).
            'eval_info': eval_accuracy,
            # Per-epoch validation accuracies (additional key).
            'eval_history': eval_losses,
        }
        pickle.dump(d, f)
def train(args, train_dataloader, model_vae, encoder_tokenizer,
          decoder_tokenizer, table_name):
    """Train the VAE model over the files served by ``train_dataloader``.

    Args:
        args: Run configuration.  This function reads (among others)
            ``local_rank``, ``max_steps``, ``gradient_accumulation_steps``,
            ``fp16``, ``beta``, ``logging_steps`` and ``save_steps``, and it
            overwrites ``n_gpu``, ``train_batch_size`` and, when
            ``max_steps > 0``, ``num_train_epochs``.
        train_dataloader: Iterable yielding ``(tokenized_text0,
            tokenized_text1, tokenized_text_lengths)`` batches; it must also
            provide ``reset()``, ``num_examples`` and ``file_idx``.
        model_vae: Model called as ``model_vae(inputs, labels)`` and returning
            ``(loss_rec, loss_kl, loss)``.  Its settings are reached through
            ``model_vae.module``, so it must already be wrapped by the caller
            (presumably in a distributed/data-parallel wrapper -- confirm).
        encoder_tokenizer: Passed to ``mask_tokens`` and ``evaluate``.
        decoder_tokenizer: Passed to ``evaluate``.
        table_name: Not used inside this function.

    Returns:
        Tuple ``(global_step, average_loss, optimizer)`` where
        ``average_loss`` is the accumulated loss divided by the number of
        optimizer steps (``0.0`` if no step was taken).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.n_gpu = (torch.distributed.get_world_size()
                  if args.local_rank != -1 else 1)
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps  #* args.num_train_epochs

    if args.distributed:
        t_total = t_total // ompi_size()

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model_vae.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model_vae.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model_vae, optimizer = amp.initialize(model_vae,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)

    files = Path(args.train_data_file)
    num_files = len(list(files.glob('*seq64*.json')))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num files = %d", num_files)
    logger.info("  Num examples of first file = %d",
                train_dataloader.num_examples)
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0

    model_vae.zero_grad()

    # Estimated number of iterations, used to size the cyclical beta schedule.
    n_iter_per_file = train_dataloader.num_examples / args.train_batch_size
    n_iter = int(args.num_train_epochs * n_iter_per_file * num_files)
    beta_t_list = frange_cycle_zero_linear(n_iter,
                                           start=0.0,
                                           stop=args.beta,
                                           n_cycle=10,
                                           ratio_increase=args.ratio_increase,
                                           ratio_zero=args.ratio_zero)
    logger.info(
        f"Total iters (estimated): {n_iter}; Length of beta schedule: {len(beta_t_list)}; #Iter per file {n_iter_per_file}"
    )

    beta_t = 0.0

    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in range(int(args.num_train_epochs)):
        train_dataloader.reset()
        for idx_file in range(num_files - 1):

            logger.info(
                f"Rank {ompi_rank()}, Epoch {epoch}, File idx {train_dataloader.file_idx}"
            )
            for step, batch in enumerate(train_dataloader):
                tokenized_text0, tokenized_text1, tokenized_text_lengths = batch

                # Prepare input-output data for reconstruction.  The labels
                # are always the second tokenization, even when args.mlm is
                # set (the masked-LM labels are discarded).
                inputs, labels = mask_tokens(
                    tokenized_text0, encoder_tokenizer,
                    args) if args.mlm else (tokenized_text0, tokenized_text1)
                labels = tokenized_text1

                tokenized_text1 = tokenized_text1.to(args.device)
                inputs = inputs.to(args.device)
                labels = labels.to(args.device)

                model_vae.train()

                if args.use_beta_schedule:
                    # Past the end of the precomputed schedule beta is 1.0.
                    if global_step >= len(beta_t_list):
                        beta_t = 1.0
                    else:
                        beta_t = beta_t_list[global_step]

                model_vae.module.args.beta = beta_t

                # fb_mode is 0 while beta is zero, 1 otherwise, and forced to
                # 2 when the deterministic connection is requested.
                if beta_t == 0.0:
                    model_vae.module.args.fb_mode = 0
                else:
                    model_vae.module.args.fb_mode = 1

                if args.use_deterministic_connect:
                    model_vae.module.args.fb_mode = 2

                loss_rec, loss_kl, loss = model_vae(inputs, labels)

                loss_rec = loss_rec.mean(
                )  # mean() to average on multi-gpu parallel training
                loss_kl = loss_kl.mean()
                loss = loss.mean()

                if args.use_philly:
                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        logger.info(
                            "Steps {}, Rank {}, File {}, Epoch: [{}/{}][{}/{}], Beta: {}, Loss: {}"
                            .format(global_step, ompi_rank(),
                                    train_dataloader.file_idx, epoch,
                                    args.num_train_epochs, step,
                                    n_iter_per_file,
                                    model_vae.module.args.beta, loss_rec))
                        logger.info("PROGRESS: {}%".format(
                            round(100 * global_step / n_iter, 4)))
                        logger.info("EVALERR: {}%".format(loss_rec))

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model_vae.parameters(),
                                                       args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model_vae.zero_grad()

                    global_step += 1

                    if args.local_rank in [
                            -1, 0
                    ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                            results = evaluate(args, model_vae,
                                               encoder_tokenizer,
                                               decoder_tokenizer)
                            for key, value in results.items():
                                tb_writer.add_scalar('eval_{}'.format(key),
                                                     value, global_step)
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                        logging_loss = tr_loss

                    if args.local_rank in [
                            -1, 0
                    ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                        save_checkpoint(model_vae, optimizer, global_step,
                                        args)

                if args.max_steps > 0 and global_step > args.max_steps:
                    break

            # Propagate the max_steps stop out of the file loop; previously
            # only the batch loop was left and training simply resumed with
            # the next file.
            if args.max_steps > 0 and global_step > args.max_steps:
                break

        # ... and out of the epoch loop as well.
        if args.max_steps > 0 and global_step > args.max_steps:
            break

    # Flush and release the TensorBoard writer created at the top.
    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against a run in which no optimizer step was taken.
    return global_step, tr_loss / max(global_step, 1), optimizer
Example #19
0
def train(cfg, model: JointSentiGPT2Model, train_dataloader: DataLoader,
          val_dataloader: DataLoader):
    """Train ``model`` for ``cfg.num_train_epochs`` epochs, validating and saving each epoch.

    Args:
        cfg: Run configuration.  ``cfg.calc_hid_dist_step`` and
            ``cfg.only_nll_step`` are given in epochs and are converted to
            global steps in place; ``cfg.whole_step`` is set to the total
            number of training steps.
        model: Model called with the keyword inputs built from each batch.
        train_dataloader: Training batches; elements 0-3, 5 and 6 of a batch
            feed the model and elements 4 and 7 are passed to ``calc_loss``.
        val_dataloader: Passed to ``validation`` after every epoch.

    Raises:
        ValueError: If ``cfg.dist_loss`` is neither ``'mse'`` nor ``'cos'``.

    Side effects:
        Writes ``record.log``, ``dist_record.log`` and one ``epo<N>.pt``
        state dict per epoch into ``cfg.save_dir``.
    """
    steps_per_epoch = len(train_dataloader)
    t_total = steps_per_epoch * cfg.num_train_epochs
    warmup_steps = int(cfg.warmup_proportion * t_total)

    # Convert epoch-based thresholds into global-step thresholds.
    cfg.calc_hid_dist_step = cfg.calc_hid_dist_step * steps_per_epoch
    cfg.only_nll_step = cfg.only_nll_step * steps_per_epoch
    cfg.whole_step = t_total

    print("***** Running training *****")
    print(f"temperature = {cfg.temperature}")
    print(f"alpha = {cfg.alpha}")
    print(f"beta = {cfg.beta}")
    print(f"leak_emotion_step = {cfg.leak_emotion_step}")
    print(f"temperature_update = {cfg.adapt}")
    print(f"parallel = {cfg.parallel}")
    print(f"seed = {cfg.seed}")
    print(f"max_seq_length = {cfg.max_sequence_length}")
    print(f"emotion type = {cfg.emotion_cls}")
    print(f"num epochs = {cfg.num_train_epochs}")
    print(f"whole training steps = {t_total}")
    print(f"warmup steps = {warmup_steps}")
    print(f"train batch size = {cfg.train_batch_size}")
    print(f"learning rate = {cfg.learning_rate}")

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        cfg.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=cfg.learning_rate,
                      eps=cfg.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=t_total)

    # Build the criteria.  An unsupported dist_loss used to leave
    # loss_dist_fct unbound and surface as a NameError at the first batch;
    # fail early with a clear message instead.
    if cfg.dist_loss == 'mse':
        dist_criterion = torch.nn.MSELoss()
    elif cfg.dist_loss == 'cos':
        dist_criterion = torch.nn.CosineEmbeddingLoss()
    else:
        raise ValueError(
            f"unsupported cfg.dist_loss {cfg.dist_loss!r}; expected 'mse' or 'cos'"
        )
    cls_criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)
    if cfg.parallel:
        loss_cls_fct = DataParallelCriterion(cls_criterion)
        loss_dist_fct = DataParallelCriterion(dist_criterion)
    else:
        loss_cls_fct = cls_criterion
        loss_dist_fct = dist_criterion

    record_path = os.path.join(cfg.save_dir, "record.log")
    dist_record_path = os.path.join(cfg.save_dir, "dist_record.log")

    model.zero_grad()
    global_step = 0
    # Context managers guarantee both logs are flushed and closed, including
    # when training aborts with an exception.
    with open(record_path, 'w', encoding='utf-8') as record_txt, \
            open(dist_record_path, 'w',
                 encoding='utf-8') as distribution_distance_record:
        for epo in range(1, cfg.num_train_epochs + 1):
            model.train()
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            avg_train_loss, avg_train_loss_nll, avg_train_loss_emotion, avg_train_loss_dist, step = 0, 0, 0, 0, 0
            # Number of steps in which the emotion / distance losses were
            # active; used as denominators for their running averages.
            step_calc_dist, step_calc_emotion = 0, 0
            for batch in tqdm_bar:
                batch = tuple(t.to(cfg.device) for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'token_type_ids': batch[1],
                    'attention_mask': batch[2],
                    'cls_mask': batch[3],
                    'emotion_ids': batch[5],
                    'gold_response_emotion_masks': batch[6],
                    'decoding': False,
                    'step': global_step
                }

                outputs = model(**inputs)
                if cfg.parallel:
                    # make the right input for parallel criterion
                    lm_logits, emotion_logits, src_hidden_states, target_hidden_states = list(
                        zip(*outputs))[:4]
                    lm_logits = [(element, ) for element in lm_logits]
                    emotion_logits = [(element, )
                                      for element in emotion_logits]
                    if cfg.normalize:
                        src_hidden_states = [
                            (F.normalize(element, p=2, dim=-1), )
                            for element in src_hidden_states
                        ]
                        target_hidden_states = F.normalize(
                            torch.cat(target_hidden_states, dim=0),
                            p=2,
                            dim=-1).detach()
                    else:
                        src_hidden_states = [(element, )
                                             for element in src_hidden_states]
                        target_hidden_states = torch.cat(target_hidden_states,
                                                         dim=0).detach()
                else:
                    lm_logits, emotion_logits, src_hidden_states, target_hidden_states = outputs[:4]
                    if cfg.normalize:
                        src_hidden_states = F.normalize(src_hidden_states,
                                                        p=2,
                                                        dim=-1)
                        target_hidden_states = F.normalize(
                            target_hidden_states, p=2, dim=-1).detach()

                loss, loss_nll, loss_emotion, loss_hid_dist = calc_loss(
                    cfg, global_step, lm_logits, emotion_logits,
                    src_hidden_states, target_hidden_states, batch[4],
                    batch[7], loss_cls_fct, loss_dist_fct)
                loss.backward()
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()

                # calc_loss may return a plain number instead of a tensor for
                # the auxiliary losses; such values are counted as 0.
                emotion_value = loss_emotion.item() if isinstance(
                    loss_emotion, torch.Tensor) else 0
                hid_dist_value = loss_hid_dist.item() if isinstance(
                    loss_hid_dist, torch.Tensor) else 0

                if global_step >= cfg.only_nll_step:
                    step_calc_emotion += 1
                if global_step >= cfg.calc_hid_dist_step:
                    distribution_distance_record.write(
                        str(hid_dist_value) + '\n')
                    step_calc_dist += 1

                avg_train_loss += loss.item()
                avg_train_loss_nll += loss_nll.item()
                avg_train_loss_emotion += emotion_value
                avg_train_loss_dist += hid_dist_value
                step += 1
                global_step += 1

                running_dist = 0 if step_calc_dist == 0 else avg_train_loss_dist / step_calc_dist
                running_emotion = 0 if step_calc_emotion == 0 else avg_train_loss_emotion / step_calc_emotion
                tqdm_bar.desc = f"epoch:[{epo}/{cfg.num_train_epochs}],step:[{step}/{steps_per_epoch}],loss:{avg_train_loss / step:.4f},nll:{avg_train_loss_nll / step:.4f}," \
                    f"emotion:{running_emotion:.4f},dist:{running_dist:.4f}"

            # Blank line separates epochs in the distance log.
            if global_step >= cfg.calc_hid_dist_step:
                distribution_distance_record.write("\n")

            avg_train_loss = avg_train_loss / step
            avg_train_loss_nll = avg_train_loss_nll / step
            avg_train_loss_emotion = 0 if step_calc_emotion == 0 else avg_train_loss_emotion / step_calc_emotion
            avg_train_loss_dist = 0 if step_calc_dist == 0 else avg_train_loss_dist / step_calc_dist

            dev_loss_nll, dev_loss_emotion_hist_resp, dev_loss_hid_dist = validation(
                cfg, model, val_dataloader)
            print(
                f"in epoch {epo}, dev_loss_nll: {dev_loss_nll:.4f}, dev_loss_emotion: {dev_loss_emotion_hist_resp:.4f}, dev_loss_hid_dist: {dev_loss_hid_dist:.4f}, train_loss: {avg_train_loss:.4f}, train_loss_dist: {avg_train_loss_dist:.4f}"
            )

            # Save the unwrapped weights so the checkpoint loads without the
            # parallel wrapper.
            torch.save(
                model.module.state_dict()
                if hasattr(model, 'module') else model.state_dict(),
                os.path.join(cfg.save_dir, f"epo{epo}.pt"))
            record_txt.write(
                f"epoch: {epo}\t\t"
                f"train_loss: {avg_train_loss:.4f}\t\ttrain_hid_dist: {avg_train_loss_dist:.4f}\t\ttrain_emotion: {avg_train_loss_emotion:.4f}\t\ttrain_nll: {avg_train_loss_nll:.4f}\t\t"
                f"dev_emotion: {dev_loss_emotion_hist_resp:.4f}\t\tdev_hid_dist: {dev_loss_hid_dist:.4f}\t\tdev loss nll: {dev_loss_nll}\n"
            )
def train(args, train_dataset, model, tokenizer, sample=False):
    """Train ``model`` on data produced by the ``train_dataset`` factory.

    Each batch is expected to be a tuple of tensors indexed as
    ``(input_ids, attention_mask, token_type_ids, labels)``; the first element
    of the model output is used as the loss.

    Args:
        args: Run configuration namespace. ``train_batch_size`` is written
            here, and ``num_train_epochs`` is overwritten when
            ``args.max_steps > 0``.
        train_dataset: Callable invoked as
            ``train_dataset(args, args.task_name, tokenizer, evaluate=False)``
            that returns the dataset to iterate over.
        model: Model called as ``model(**inputs)``.
        tokenizer: Forwarded unchanged to ``train_dataset``.
        sample: When true, the dataset and dataloader are rebuilt at the
            start of every epoch.

    Returns:
        ``(global_step, mean_loss)`` where ``mean_loss`` is the accumulated
        loss divided by the number of optimizer steps (or by 1 when no
        optimizer step was taken).
    """
    if args.local_rank in [-1, 0]:
        # Opened on the main process only and closed at the end; nothing in
        # this function writes to it.
        tb_writer = SummaryWriter()

    def _build_dataloader():
        """Instantiate the dataset and a shuffling dataloader over it."""
        data = train_dataset(args, args.task_name, tokenizer, evaluate=False)
        args.train_batch_size = args.per_gpu_train_batch_size * max(
            1, args.n_gpu)
        # The sampler has to wrap the dataset instance; ``train_dataset`` is
        # only the factory that produces it.
        if args.local_rank == -1:
            sampler = RandomSampler(data)
        else:
            sampler = DistributedSampler(data)
        loader = DataLoader(data,
                            sampler=sampler,
                            batch_size=args.train_batch_size)
        return data, loader

    dataset, train_dataloader = _build_dataloader()

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Parameters whose name contains one of these substrings get no decay.
    no_decay = ['bias', 'LayerNorm.weight']
    decayed_params = [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ]
    undecayed_params = [
        p for n, p in model.named_parameters()
        if any(nd in n for nd in no_decay)
    ]
    optimizer_grouped_parameters = [
        {'params': decayed_params, 'weight_decay': args.weight_decay},
        {'params': undecayed_params, 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        if sample:
            # Rebuild the dataset (and its loader) for this epoch.
            dataset, train_dataloader = _build_dataloader()
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                # XLM don't use segment_ids
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                'labels': batch[3],
            }
            outputs = model(**inputs)
            loss = outputs[0]

            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Apply the update first, then advance the learning-rate
                # schedule, so the first scheduled value is not skipped.
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against the case where no optimizer step was ever taken.
    return global_step, tr_loss / max(global_step, 1)
def train(model_filename='model_{loss}.pt',
          lr=6.5e-5,
          epochs=1000,
          inferencehook=None,
          load_model=None,
          inference_verses=2,
          batch_size=64):
    """Train a ``GPT2Generator`` on ``BibleCommentaryDataset`` batches.

    After every epoch the mean batch loss is compared with the best value
    saved so far; on improvement the whole model object is written under
    ``modeldata/``. The dataset's ``current_sequence_length`` grows by one
    per epoch and wraps back to ``min_sequence_length`` after reaching
    ``max_sequence_length``.

    Args:
        model_filename: File-name template; ``{loss}`` is filled with the
            epoch's mean loss.
        lr: Learning rate passed to ``AdamW``.
        epochs: Number of passes over the dataloader.
        inferencehook: Optional callable invoked every 100 batches as
            ``inferencehook(dataset, model, tfidf_model,
            inference_verses=..., words2add=150, k=20)``.
        load_model: Optional file name inside ``modeldata`` to resume from;
            a fresh ``GPT2Generator`` is created when falsy.
        inference_verses: Forwarded to ``inferencehook``.
        batch_size: Dataloader batch size.
    """
    dataset = BibleCommentaryDataset(dir_='trainingdata',
                                     filenames=['Beal.txt'],
                                     min_sequence_length=20,
                                     max_sequence_length=300)
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1)

    # creates tfidf model ONLY ON SEQUENCE SIZE 30 (for now)
    dataset.current_sequence_length = 30
    knowledge_utterances = [
        dataset.tokenizer.decode(dataset[i][0].tolist())
        for i in range(len(dataset))
    ]
    tfidf_model = CPULinear(number_of_sentences_as_seed=2,
                            knowledge_utterances=knowledge_utterances)
    dataset.current_sequence_length = dataset.min_sequence_length

    if load_model:
        print('loading model {} ...'.format(load_model))
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        model = torch.load(os.path.join('modeldata', load_model))
    else:
        model = GPT2Generator()
    model = model.to('cuda')

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(params=model.parameters(), lr=lr, correct_bias=False)

    optimizer.zero_grad()

    last_saved_epoch_loss = None

    for epoch in range(epochs):
        epoch_losses = []

        for i, batch in enumerate(dataloader):

            X, y = batch

            # push X and y to cuda
            X = X.to('cuda')
            y = y.to('cuda')

            predictions = model(X)
            loss = criterion(predictions, y)
            batch_loss = loss.item()
            epoch_losses.append(batch_loss)
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), 1.)

            optimizer.step()
            optimizer.zero_grad()
            # len(dataloader) is the real number of batches per epoch.
            print(
                'EPOCH {}, current_sequence_length {}, Batch {} of {}: loss == {:.8f}'
                .format(epoch, dataset.current_sequence_length, i,
                        len(dataloader), batch_loss))

            if inferencehook and i % 100 == 0:
                inferencehook(dataset,
                              model,
                              tfidf_model,
                              inference_verses=inference_verses,
                              words2add=150,
                              k=20)

        # An epoch that produced no batches has no mean loss to compare.
        if epoch_losses:
            this_epoch_loss = sum(epoch_losses) / len(epoch_losses)
            if (last_saved_epoch_loss is None
                    or this_epoch_loss < last_saved_epoch_loss):
                print('Saving {} with loss {:.8f}'.format(
                    model_filename, this_epoch_loss))
                torch.save(
                    model,
                    os.path.join('modeldata',
                                 model_filename.format(loss=this_epoch_loss)))
                last_saved_epoch_loss = this_epoch_loss

        # Step the sequence length, wrapping around at the maximum.
        if dataset.current_sequence_length == dataset.max_sequence_length:
            dataset.current_sequence_length = dataset.min_sequence_length
        else:
            dataset.current_sequence_length += 1
        neg_net_output = net(neg_ids, attn_masks=neg_mask, type_ids=neg_type_ids)
        # # TODO: do i need a softmax or not ?

        # Computing loss
        # loss = criterion(net_output, label.float())
        loss = criterion(pos_net_output, neg_net_output, label.float())
        batch_loss = loss.item()
        # total_train_loss += loss.item()

        # Back propagating the gradients
        loss.backward()
        if config.training['gradient_clipping']['use']:
            torch.nn.utils.clip_grad_norm_(net.parameters(), config.training['gradient_clipping']['clip_value'])

        # Optimization step
        opti.step()

        # Progress update every display_step batches.
        # if batch_idx % display_step == 0 and not batch_idx == 0:
        #     elapsed = format_time(time.time() - t0)
        #     # print('  Batch {:>5,}  of  {:>5,}  :  loss - {:>5,.2f}    Elapsed: {:}.'.format(batch_idx,
        #     #                                                                                 len(train_dataloader),
        #     #                                                                                 loss, elapsed))
        #     print('  Epoch {:>5,}  of  {:>5,}  :  Batch {:>5,}  of  {:>5,}  :  \
        #     loss - {:>5,.2f}    Elapsed: {:}.'.format(epoch_idx + 1, num_epochs,
        #                                               batch_idx + 1, len(train_dataloader),
        #                                               loss, elapsed))
        #     training_stats.append(
        #         {
        #             'epoch': epoch_idx + 1,
        #             'batch': batch_idx + 1,
def train(args, train_dataset, model, tokenizer, context_model):
    """Train ``model`` together with ``context_model``, evaluating each epoch.

    The loss comes from ``ContextModel(model, context_model).concat(...)``.
    When ``args.do_eval`` is set, the main process evaluates on
    ``args.predict_file`` after every epoch, appends the metrics to
    ``eval_results.txt`` in ``args.output_dir`` and saves ``model`` into a
    sub-directory named after the epoch index.

    Args:
        args: Run configuration namespace. ``train_batch_size`` is written
            here, and ``num_train_epochs`` is overwritten when
            ``args.max_steps > 0``.
        train_dataset: Dataset yielding tuples indexed as
            ``(input_ids, attention_mask, token_type_ids, labels)``.
        model: Main model; the only one whose parameters are given to the
            optimizer and the only one checkpointed.
        tokenizer: Used to build evaluation features.
        context_model: Second model combined with ``model``; must be truthy.

    Returns:
        ``(global_step, mean_loss)`` where ``mean_loss`` is the accumulated
        loss divided by the number of optimizer steps (or by 1 when no
        optimizer step was taken).
    """
    assert context_model

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Parameters whose name contains one of these substrings get no decay.
    no_decay = ['bias', 'LayerNorm.weight']
    decayed_params = [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ]
    undecayed_params = [
        p for n, p in model.named_parameters()
        if any(nd in n for nd in no_decay)
    ]
    # NOTE(review): only ``model``'s parameters are handed to the optimizer.
    # ``context_model``'s gradients are clipped and zeroed below, but its
    # weights are not stepped here unless they are also reachable through
    # ``model`` -- confirm this is intended.
    optimizer_grouped_parameters = [
        {'params': decayed_params, 'weight_decay': args.weight_decay},
        {'params': undecayed_params, 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    context_model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    cat_context_model = ContextModel(model, context_model)

    for epoch_num in train_iterator:
        # Evaluation at the end of the previous epoch switched to eval mode.
        model.train()
        context_model.train()
        for step, batch in enumerate(train_dataloader):

            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                # XLM don't use segment_ids
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                'labels': batch[3],
            }
            # With labels supplied, the return value is used directly as the
            # loss.
            loss = cat_context_model.concat(**inputs)

            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                torch.nn.utils.clip_grad_norm_(context_model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Apply the update first, then advance the learning-rate
                # schedule, so the first scheduled value is not skipped.
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                context_model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    logger.info("Step {0}, Loss: {1}".format(
                        global_step, loss.item()))

        if args.do_eval and (args.local_rank == -1
                             or torch.distributed.get_rank() == 0):
            eval_examples = read_cmsqa_examples(args.predict_file,
                                                is_training=True,
                                                num_choices=args.num_choices)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, True)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(
                eval_data,
                sampler=eval_sampler,
                batch_size=args.per_gpu_eval_batch_size)

            # Put both sub-models in eval mode; they are switched back to
            # train mode at the top of the next epoch.
            model.eval()
            context_model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                device = args.device
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                # Segment ids from the dataset are deliberately discarded.
                segment_ids = None
                label_ids = label_ids.to(device)

                # NOTE(review): these calls are positional in the order
                # (input_ids, segment_ids, input_mask[, labels]) while
                # training passes keywords input_ids / attention_mask /
                # token_type_ids / labels -- confirm against
                # ContextModel.concat's signature.
                with torch.no_grad():
                    tmp_eval_loss = cat_context_model.concat(
                        input_ids, segment_ids, input_mask, label_ids)
                    logits = cat_context_model.concat(input_ids, segment_ids,
                                                      input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_eval_accuracy = accuracy(logits, label_ids)

                eval_loss += tmp_eval_loss.mean().item()
                eval_accuracy += tmp_eval_accuracy

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            # An empty evaluation set yields zeros instead of dividing by 0.
            eval_loss = eval_loss / max(nb_eval_steps, 1)
            eval_accuracy = eval_accuracy / max(nb_eval_examples, 1)

            result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}

            if args.do_train:
                result['global_step'] = global_step

            os.makedirs(args.output_dir, exist_ok=True)

            output_eval_file = os.path.join(args.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                writer.write('Epoch' + str(epoch_num) + '\n')
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            output_dir = os.path.join(args.output_dir, str(epoch_num))
            # exist_ok so that re-running into the same output directory does
            # not abort at the first checkpoint.
            os.makedirs(output_dir, exist_ok=True)
            logger.info("Saving model checkpoint to %s", output_dir)
            # Save a trained model, configuration and tokenizer using `save_pretrained()`.
            # They can then be reloaded using `from_pretrained()`
            # Take care of distributed/parallel training
            model_to_save = model.module if hasattr(model,
                                                    'module') else model
            model_to_save.save_pretrained(output_dir)

    # Guard against the case where no optimizer step was ever taken.
    return global_step, tr_loss / max(global_step, 1)
Example #24
0
def main():
    """Command-line entry point: fine-tune a GPT-2 LM head model.

    Reads ``dataset_config.json`` from the ``--train_dataset`` directory,
    then trains on every other file in that directory (each loaded as JSON
    and turned into tensors by ``pre_process_datasets``). Every
    ``--save_step`` steps the weights, config and vocabulary are written to
    ``--output_dir``; every ``--gen_step`` steps a sample continuation is
    printed. After training the model is saved once more and reloaded from
    ``--output_dir``.

    Raises:
        ValueError: If neither ``--do_train`` nor ``--do_eval`` is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='gpt2',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='../Data/debug_data.txt')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10000000)
    parser.add_argument('--save_step', type=int, default=10000)
    parser.add_argument('--gen_step', type=int, default=20000)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    os.makedirs(args.output_dir, exist_ok=True)

    # Read dataset config
    config_name = "dataset_config.json"
    with open(os.path.join(args.train_dataset, config_name), 'r', encoding='utf-8') as f_config:
        config = json.load(f_config)

    # Load tokenizer and model; the special tokens listed in the dataset
    # config are registered with the tokenizer.
    special_tokens = config["special_tokens"]
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = GPT2LMHeadModel.from_pretrained(args.model_name)
    model.to(device)

    if not args.do_train:
        # Nothing below applies without training: this function contains no
        # evaluation path, and the schedule needs the training step count.
        return

    def _save_checkpoint(current_model, current_tokenizer, output_dir):
        """Write weights, model config and vocabulary under predefined names.

        Using the predefined names lets the result be loaded again with
        ``from_pretrained``.
        """
        model_to_save = current_model.module if hasattr(current_model, 'module') else current_model
        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        current_tokenizer.save_vocabulary(output_dir)

    # Prepare optimizer and schedule.
    # NOTE(review): max_steps and gradient_accumulation_steps only shape the
    # schedule length and epoch count; the loops below step the optimizer on
    # every batch and have no early exit at max_steps.
    updates_per_epoch = config["num_of_batch"] // args.gradient_accumulation_steps
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // updates_per_epoch + 1
    else:
        t_total = updates_per_epoch * args.num_train_epochs

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # Every entry of the dataset directory except the config is a data file.
    # os.listdir returns entries in arbitrary order, so the config is
    # excluded by name and the rest sorted for a reproducible order.
    list_files = sorted(name for name in os.listdir(args.train_dataset)
                        if name != config_name)

    # Prompts used for the periodic generation preview.
    sample_prompts = ("anybody who has ever been a fan of",
                      "i am wanting to make",
                      "this is the only movie i",
                      "okay first of all",
                      "can any movie become more naive than this ?",
                      "there is",
                      "there isn't much of one .",
                      "this movie really shows its age",
                      "shop",
                      "i go to",
                      "the grandchildren",
                      "i",
                      "today",
                      "finally ,",
                      "this")

    exp_average_loss = None
    model.train()
    global_step = 0
    for _ in tqdm(range(int(args.num_train_epochs)), desc="EPOCHS: ", unit=" Epoch"):
        tqdm_bar = tqdm(list_files, desc="    Training")
        for count_file, file_name in enumerate(tqdm_bar):
            tqdm_bar.desc = "   FILE {}/{}: ".format(count_file, len(list_files))
            tqdm_bar.unit = "Training loss: {} lr: {}".format(exp_average_loss, scheduler.get_lr()[0])
            with open(os.path.join(args.train_dataset, file_name), 'r', encoding='utf-8') as f_train:
                encoded_datasets = json.load(f_train)
            # Prepare inputs tensors and dataloaders
            tensor_datasets = pre_process_datasets(encoded_datasets,
                                                   config["input_lenght"],
                                                   config["max_length"],
                                                   *special_tokens_ids)
            train_tensor_dataset = tensor_datasets[-1]
            train_data = TensorDataset(*train_tensor_dataset)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

            tqdm_batch_bar = tqdm(train_dataloader, desc="    FILE:{}".format(file_name))
            for batch in tqdm_batch_bar:
                global_step += 1

                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                outputs = model(input_ids, labels=lm_labels)
                loss = args.lm_coef * outputs[0]
                loss_value = loss.item()

                tqdm_batch_bar.unit = " Training loss: {} lr: {}".format(loss_value, scheduler.get_lr()[0])
                loss.backward()
                optimizer.step()
                scheduler.step()

                optimizer.zero_grad()
                # Exponential moving average shown in the per-file progress bar.
                if exp_average_loss is None:
                    exp_average_loss = loss_value
                else:
                    exp_average_loss = 0.7*exp_average_loss+0.3*loss_value

                if not global_step % args.save_step:
                    _save_checkpoint(model, tokenizer, args.output_dir)

                if not global_step % args.gen_step:
                    raw_text = random.choice(sample_prompts)
                    context_tokens = tokenizer.encode(raw_text)
                    model.eval()
                    out = sample_sequence(
                        model=model,
                        context=context_tokens,
                        length=100,
                        temperature=1,
                        top_k=0,
                        top_p=0.9,
                        device=device,
                        is_xlnet=False
                    )
                    # Drop the prompt tokens so only the continuation is decoded.
                    out = out[0, len(context_tokens):].tolist()
                    text = raw_text+" " + tokenizer.decode(out, clean_up_tokenization_spaces=True)
                    print("\n Generate:  ")
                    print(text)
                    print("\n Finished!")
                    model.train()

    # Save the trained model, configuration and tokenizer.
    _save_checkpoint(model, tokenizer, args.output_dir)

    # Reload from the files just written; the reloaded objects are not used
    # further in this function.
    model = GPT2LMHeadModel.from_pretrained(args.output_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
    model.to(device)
Example #25
0
def main():
    """Fine-tune and/or evaluate a GPT-2 language model on query data.

    Flow, driven entirely by the parsed command-line ``args``:

    * ``--resume``      load a saved state dict into the model first.
    * ``--test_eval``   interactive prompt: sample from the model for each
                        line typed, until EOF / Ctrl-C, then exit.
    * no ``--do_train`` evaluate once on the test split and exit.
    * otherwise         train for ``num_train_epochs`` epochs, evaluate on the
                        dev split after every epoch, save the weights to
                        ``args.save`` whenever the dev score improves, and
                        finally evaluate on the test split.
    """
    def evaluate(data_source, split_encode=False):
        """Score the model on ``data_source`` and log the result.

        Reads ``model``, ``tokenizer``, ``sos_idx``, ``criterion``,
        ``optimizer``, ``start_time`` and ``epoch`` from the enclosing
        ``main`` scope.

        Args:
            data_source: iterable of ``(_, queries)`` batches.
            split_encode: forwarded to ``gpt_encode``.

        Returns:
            The summed masked loss divided by the summed ``total_chars``
            reported by ``gpt_encode``, converted from nats to bits (the
            value logged as ``bpc``).
        """
        model.eval()
        total_loss = 0
        total_words = 0
        total_n = 0
        for batch in data_source:
            _, queries = batch
            # Count words before `queries` is rebound to the encoded ids.
            n_words = sum(len(space_tokenize(x)) for x in queries)
            try:
                queries, mask, total_chars = gpt_encode(
                    tokenizer, queries, sos_idx, split_encode=split_encode,
                    condition_model=args.conditioned_model)
            except KeyError:
                # Batch cannot be encoded; leave it out of every statistic.
                continue
            # Only batches that contribute loss may contribute words,
            # otherwise the word-level perplexity is biased downwards.
            total_words += n_words
            mask = torch.Tensor(mask).cuda()
            queries = torch.LongTensor(queries).cuda()

            with torch.no_grad():
                # Model output [0] is permuted so the class dimension comes
                # second, as nn.CrossEntropyLoss requires.
                output = model(queries[:, :-1])[0].permute(0, 2, 1)
            targets = queries[:, 1:]
            crit = criterion(output, targets)
            mask_tot = mask[:, 1:].sum()
            raw_loss = (crit * mask[:, 1:]).sum() / mask_tot

            # Undo the per-batch mean so the running total is a plain sum.
            total_loss += raw_loss.item() * mask_tot.item()
            total_n += total_chars

        cur_loss = total_loss / total_n
        elapsed = time.time() - start_time
        word_ppl = math.exp(total_loss / total_words)
        dual_print('-' * 89)
        dual_print('| end of epoch {:3d} | lr {:05.5f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
            epoch, optimizer.param_groups[0]['lr'],
            elapsed * 1000 / args.log_interval, cur_loss, word_ppl, cur_loss / math.log(2)))
        dual_print('-' * 89)
        return cur_loss / math.log(2)

    parser = argparse.ArgumentParser()
    add_dict_options(parser, ARGS)
    args = parser.parse_args()
    set_seed(args.seed)
    sd = torch.load(args.cache_file)

    tokenizer = GPT2Tokenizer.from_pretrained(args.gpt2_model, cache_dir=args.cache_dir)
    model = GPT2LMHeadModel.from_pretrained(args.gpt2_model, cache_dir=args.cache_dir)
    if args.reset:
        model.apply(model.init_weights)
    sos_idx = init_sos(model)
    if not args.use_sos:
        sos_idx = None
    train_ds, dev_ds, test_ds = sd['splits']
    # Per-element losses are needed so they can be masked before averaging.
    criterion = nn.CrossEntropyLoss(reduction='none')

    train_loader = tud.DataLoader(train_ds, batch_size=args.train_batch_size, shuffle=True, drop_last=args.drop_last)
    dev_loader = tud.DataLoader(dev_ds, batch_size=args.eval_batch_size, shuffle=False, drop_last=args.drop_last)
    test_loader = tud.DataLoader(test_ds, batch_size=args.eval_batch_size, shuffle=False, drop_last=args.drop_last)

    # Parameters whose name contains 'bias' are excluded from weight decay.
    no_decay = ['bias']
    params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_train_optimization_steps = args.num_train_epochs * len(train_loader)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=int(args.warmup_proportion * num_train_optimization_steps),
                                     t_total=num_train_optimization_steps)

    if args.resume:
        model.load_state_dict(torch.load(args.resume, map_location=lambda s, l: s))
    if args.test_eval:
        while True:
            try:
                query = input("> ")
            except (EOFError, KeyboardInterrupt):
                # End of input: leave the prompt loop instead of crashing.
                break
            print(sample_query(model, tokenizer, query))
        return

    model = nn.DataParallel(model).cuda()
    start_time = time.time()
    best_bpc = 1000000
    # `evaluate` reads `epoch` from this scope; bind it up front so the
    # evaluation-only path below does not hit an unbound variable.
    epoch = 0

    if not args.do_train:
        evaluate(test_loader, split_encode=False)
        return

    for epoch in range(1, args.num_train_epochs + 1):
        total_loss = 0
        total_words = 0
        total_n = 0
        batch_idx = 0
        for batch in train_loader:
            model.train()
            _, queries = batch
            n_words = sum(len(space_tokenize(x)) for x in queries)
            try:
                queries, mask, total_chars = gpt_encode(
                    tokenizer, queries, sos_idx, split_encode=args.split_encode,
                    condition_model=args.conditioned_model)
            except KeyError:
                dual_print('Skipped batch')
                continue
            total_words += n_words
            mask = torch.Tensor(mask).cuda()
            queries = torch.LongTensor(queries).cuda()
            optimizer.zero_grad()

            output = model(queries[:, :-1])[0].permute(0, 2, 1)
            targets = queries[:, 1:]
            crit = criterion(output, targets)
            mask_tot = mask[:, 1:].sum()
            raw_loss = (crit * mask[:, 1:]).sum() / mask_tot

            raw_loss.backward()
            # Apply the update first, then advance the schedule; stepping the
            # scheduler first skips the initial learning-rate value.
            optimizer.step()
            scheduler.step()

            total_loss += raw_loss.item() * mask_tot.item()
            total_n += total_chars
            if batch_idx % args.log_interval == 0 and batch_idx > 0:
                cur_loss = total_loss / total_n
                word_ppl = math.exp(total_loss / total_words)
                total_words = 0
                elapsed = time.time() - start_time
                dual_print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                    epoch, batch_idx, len(train_loader), optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss, word_ppl, cur_loss / math.log(2)))
                # Restart the running statistics for the next logging window.
                total_loss = 0
                total_n = 0
                start_time = time.time()
            batch_idx += 1
        bpc = evaluate(dev_loader)
        if bpc < best_bpc:
            best_bpc = bpc
            # Save the unwrapped module so the checkpoint loads without DataParallel.
            torch.save(model.module.state_dict(), args.save)
    evaluate(test_loader)
def main():
    """Fine-tune a masked-LM seq2seq model with per-epoch checkpoints.

    Parses the command line, builds the config/tokenizer/model triple selected
    by ``--model_type`` from ``MODEL_CLASSES``, and trains on
    ``<data_dir>/<src_file or 'train.tgt'>``. After every epoch the model,
    optimizer, scheduler (and amp state when ``--fp16`` is set) are written to
    ``output_dir`` as ``model.N.bin`` / ``optim.N.bin`` / ``sched.N.bin`` /
    ``amp.N.bin``; when ``--do_eval`` is set the loss on
    ``<data_dir>/dev_data.json`` is printed as well. If ``output_dir`` already
    holds epoch checkpoints, training resumes from the latest one reported by
    ``_get_max_epoch_model``.

    Raises:
        ValueError: if ``gradient_accumulation_steps`` is below 1, if neither
            ``--do_train`` nor ``--do_eval`` is given, or if ``--do_eval`` is
            given without ``--do_train`` (evaluation only runs inside the
            training loop).
        ImportError: if ``--fp16`` is requested and apex is not installed.
    """
    my_parser = argparse.ArgumentParser()

    # Required parameters
    my_parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    my_parser.add_argument("--src_file",
                           default=None,
                           type=str,
                           help="The input data file name.")
    my_parser.add_argument("--model_type",
                           default=None,
                           type=str,
                           required=True,
                           help="Model type selected in the list: " +
                           ", ".join(MODEL_CLASSES.keys()))
    my_parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    my_parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    my_parser.add_argument(
        "--log_dir",
        default='',
        type=str,
        help="The output directory where the log will be written.")
    my_parser.add_argument("--model_recover_path",
                           default=None,
                           type=str,
                           help="The file of fine-tuned pretraining model.")
    my_parser.add_argument("--optim_recover_path",
                           default=None,
                           type=str,
                           help="The file of pretraining optimizer.")
    my_parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    my_parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")

    # Other parameters
    my_parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    my_parser.add_argument('--max_position_embeddings',
                           type=int,
                           default=None,
                           help="max position embeddings")
    my_parser.add_argument("--do_train",
                           action='store_true',
                           help="Whether to run training.")
    my_parser.add_argument("--do_eval",
                           action='store_true',
                           help="Whether to run eval on the dev set.")
    my_parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    my_parser.add_argument("--train_batch_size",
                           default=32,
                           type=int,
                           help="Total batch size for training.")
    my_parser.add_argument("--eval_batch_size",
                           default=64,
                           type=int,
                           help="Total batch size for eval.")
    my_parser.add_argument("--learning_rate",
                           default=5e-5,
                           type=float,
                           help="The initial learning rate for Adam.")
    my_parser.add_argument("--label_smoothing",
                           default=0.1,
                           type=float,
                           help="Label smoothing value passed to the model config.")
    my_parser.add_argument("--weight_decay",
                           default=0.01,
                           type=float,
                           help="The weight decay rate for Adam.")
    my_parser.add_argument("--adam_epsilon",
                           default=1e-8,
                           type=float,
                           help="Epsilon for Adam optimizer.")
    my_parser.add_argument("--max_grad_norm",
                           default=1.0,
                           type=float,
                           help="Max gradient norm.")
    my_parser.add_argument("--num_train_epochs",
                           default=3.0,
                           type=float,
                           help="Total number of training epochs to perform.")
    my_parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    my_parser.add_argument("--hidden_dropout_prob",
                           default=0.1,
                           type=float,
                           help="Dropout rate for hidden states.")
    my_parser.add_argument("--attention_probs_dropout_prob",
                           default=0.1,
                           type=float,
                           help="Dropout rate for attention probabilities.")
    my_parser.add_argument("--no_cuda",
                           action='store_true',
                           help="Whether not to use CUDA when available")
    my_parser.add_argument("--local_rank",
                           type=int,
                           default=-1,
                           help="local_rank for distributed training on gpus")
    my_parser.add_argument('--seed',
                           type=int,
                           default=42,
                           help="random seed for initialization")
    my_parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    my_parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    my_parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    my_parser.add_argument('--tokenized_input',
                           action='store_true',
                           help="Whether the input is tokenized.")
    my_parser.add_argument(
        '--max_len_a',
        type=int,
        default=0,
        help="Truncate_config: maximum length of segment A.")
    my_parser.add_argument(
        '--max_len_b',
        type=int,
        default=0,
        help="Truncate_config: maximum length of segment B.")
    my_parser.add_argument(
        '--trunc_seg',
        default='',
        help="Truncate_config: first truncate segment A/B (option: a, b).")
    my_parser.add_argument(
        '--always_truncate_tail',
        action='store_true',
        help="Truncate_config: Whether we should always truncate tail.")
    my_parser.add_argument(
        "--mask_prob",
        default=0.20,
        type=float,
        help=
        "Number of prediction is sometimes less than max_pred when sequence is short."
    )
    my_parser.add_argument(
        "--mask_prob_eos",
        default=0,
        type=float,
        help=
        "Number of prediction is sometimes less than max_pred when sequence is short."
    )
    my_parser.add_argument('--max_pred',
                           type=int,
                           default=69,
                           help="Max tokens of prediction.")
    my_parser.add_argument("--num_workers",
                           default=0,
                           type=int,
                           help="Number of workers for the data loader.")

    my_parser.add_argument('--mask_source_words',
                           action='store_true',
                           help="Whether to mask source words for training")
    my_parser.add_argument('--skipgram_prb',
                           type=float,
                           default=0.0,
                           help='prob of ngram mask')
    my_parser.add_argument('--skipgram_size',
                           type=int,
                           default=1,
                           help='the max size of ngram mask')
    my_parser.add_argument('--mask_whole_word',
                           action='store_true',
                           help="Whether masking a whole word.")

    args = my_parser.parse_args()

    # Ignore a recover path that was not given or does not exist on disk.
    if not (args.model_recover_path
            and Path(args.model_recover_path).exists()):
        args.model_recover_path = None

    # Expand the '[PT_OUTPUT_DIR]' placeholder from the environment.
    args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]',
                                              os.getenv('PT_OUTPUT_DIR', ''))
    args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]',
                                        os.getenv('PT_OUTPUT_DIR', ''))

    os.makedirs(args.output_dir, exist_ok=True)
    if args.log_dir:
        os.makedirs(args.log_dir, exist_ok=True)
    # Record the effective options next to the checkpoints; the context
    # manager guarantees the file is flushed and closed.
    with open(os.path.join(args.output_dir, 'opt.json'), 'w') as opt_file:
        json.dump(args.__dict__, opt_file, sort_keys=True, indent=2)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        dist.init_process_group(backend='nccl')
    my_logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # From here on train_batch_size is the per-forward-pass (micro) batch size.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    if not args.do_train:
        # The data loaders and the evaluation pass only exist inside the
        # training path; fail clearly instead of hitting an unbound
        # `train_dataloader` further down.
        raise ValueError(
            "`do_eval` runs at the end of each training epoch, so `do_train` "
            "must also be set.")

    if args.local_rank not in (-1, 0):
        # Make sure only the first process in distributed training will download model & vocab
        dist.barrier()
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        max_position_embeddings=args.max_position_embeddings,
        label_smoothing=args.label_smoothing)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    data_tokenizer = WhitespaceTokenizer(
    ) if args.tokenized_input else tokenizer
    if args.local_rank == 0:
        dist.barrier()

    if args.do_train:
        print("Loading Train Dataset", args.data_dir)
        bi_uni_pipeline = [
            utils_seq2seq.Preprocess4Seq2seq(
                args.max_pred,
                args.mask_prob,
                list(tokenizer.vocab.keys()),
                tokenizer.convert_tokens_to_ids,
                args.max_seq_length,
                mask_source_words=False,
                skipgram_prb=args.skipgram_prb,
                skipgram_size=args.skipgram_size,
                mask_whole_word=args.mask_whole_word,
                tokenizer=data_tokenizer)
        ]

        train_file = os.path.join(
            args.data_dir, args.src_file if args.src_file else 'train.tgt')
        train_dataset = utils_seq2seq.Seq2SeqDataset(
            train_file,
            args.train_batch_size,
            data_tokenizer,
            args.max_seq_length,
            bi_uni_pipeline=bi_uni_pipeline)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset, replacement=False)
            _batch_size = args.train_batch_size
        else:
            train_sampler = DistributedSampler(train_dataset)
            _batch_size = args.train_batch_size // dist.get_world_size()
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=_batch_size,
            sampler=train_sampler,
            num_workers=args.num_workers,
            collate_fn=utils_seq2seq.batch_list_to_batch_tensors,
            pin_memory=False)
        print("Loading dev dataset")
        dev_file = os.path.join(args.data_dir, 'dev_data.json')
        dev_dataset = utils_seq2seq.Seq2SeqDataset(
            dev_file,
            args.eval_batch_size,
            data_tokenizer,
            args.max_seq_length,
            bi_uni_pipeline=bi_uni_pipeline)
        dev_dataloader = torch.utils.data.DataLoader(
            dev_dataset,
            batch_size=args.eval_batch_size,
            collate_fn=utils_seq2seq.batch_list_to_batch_tensors,
            pin_memory=False,
            num_workers=args.num_workers)

    # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps)
    # t_total = int(math.ceil(len(train_dataset.ex_list) / args.train_batch_size)
    # Total number of optimizer updates over the whole run.
    t_total = int(
        len(train_dataloader) * args.num_train_epochs /
        args.gradient_accumulation_steps)

    # Prepare model
    recover_step = _get_max_epoch_model(args.output_dir)
    if args.local_rank not in (-1, 0):
        # Make sure only the first process in distributed training will download model & vocab
        dist.barrier()
    global_step = 0
    if (recover_step is None) and (args.model_recover_path is None):
        model_recover = None
    else:
        # An epoch checkpoint in output_dir takes precedence over
        # --model_recover_path.
        if recover_step:
            my_logger.info("***** Recover model: %d *****", recover_step)
            model_recover = torch.load(os.path.join(
                args.output_dir, "model.{0}.bin".format(recover_step)),
                                       map_location='cpu')
            # recover_step == number of epochs
            global_step = math.floor(recover_step * t_total /
                                     args.num_train_epochs)
        elif args.model_recover_path:
            my_logger.info("***** Recover model: %s *****",
                           args.model_recover_path)
            model_recover = torch.load(args.model_recover_path,
                                       map_location='cpu')
    model = model_class.from_pretrained(args.model_name_or_path,
                                        state_dict=model_recover,
                                        config=config)
    if args.local_rank == 0:
        dist.barrier()

    model.to(device)

    # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        # Honour --weight_decay (its default matches the former constant 0.01).
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(args.warmup_proportion * t_total),
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.local_rank != -1:
        try:
            from torch.nn.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("DistributedDataParallel")
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if recover_step:
        my_logger.info("***** Recover optimizer: %d *****", recover_step)
        optim_recover = torch.load(os.path.join(
            args.output_dir, "optim.{0}.bin".format(recover_step)),
                                   map_location='cpu')
        # The file may hold either an optimizer object or its state dict.
        if hasattr(optim_recover, 'state_dict'):
            optim_recover = optim_recover.state_dict()
        optimizer.load_state_dict(optim_recover)

        # `amp` is only imported when --fp16 is set, so only restore its
        # state in that case even if an amp checkpoint file is present.
        amp_recover_file = os.path.join(args.output_dir,
                                        "amp.{0}.bin".format(recover_step))
        if args.fp16 and os.path.exists(amp_recover_file):
            my_logger.info("***** Recover amp: %d *****", recover_step)
            amp_recover = torch.load(amp_recover_file, map_location='cpu')
            amp.load_state_dict(amp_recover)

        my_logger.info("***** Recover scheduler: %d *****", recover_step)
        scheduler_recover = torch.load(os.path.join(
            args.output_dir, "sched.{0}.bin".format(recover_step)),
                                       map_location='cpu')
        scheduler.load_state_dict(scheduler_recover)

    my_logger.info("***** CUDA.empty_cache() *****")
    torch.cuda.empty_cache()

    if args.do_train:
        my_logger.info("***** Running training *****")
        my_logger.info("  Batch size = %d", args.train_batch_size)
        my_logger.info("  Num steps = %d", t_total)

        model.train()
        if recover_step:
            start_epoch = recover_step + 1
        else:
            start_epoch = 1
        for i_epoch in trange(start_epoch,
                              int(args.num_train_epochs) + 1,
                              desc="Epoch",
                              disable=args.local_rank not in (-1, 0)):
            if args.local_rank != -1:
                train_sampler.set_epoch(i_epoch)
            iter_bar = tqdm(train_dataloader,
                            desc='Iter (loss=X.XXX)',
                            disable=args.local_rank not in (-1, 0))
            # Loss of the most recent training step, reported after the epoch.
            final_loss = 0
            for step, batch in enumerate(iter_bar):
                batch = [
                    t.to(device) if t is not None else None for t in batch
                ]
                input_ids, segment_ids, answer_tag, input_mask, lm_label_ids, masked_pos, masked_weights, _ = batch
                if answer_tag is None:
                    print("answer tag is none")
                masked_lm_loss = model(input_ids,
                                       segment_ids,
                                       answer_tag,
                                       input_mask,
                                       lm_label_ids,
                                       masked_pos=masked_pos,
                                       masked_weights=masked_weights)
                if n_gpu > 1:  # mean() to average on multi-gpu.
                    masked_lm_loss = masked_lm_loss.mean()
                loss = masked_lm_loss
                final_loss = loss.item()

                # logging for each step (i.e., before normalization by args.gradient_accumulation_steps)
                iter_bar.set_description('Iter (loss=%5.3f)' % final_loss)

                # ensure that accumulated gradients are normalized
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1
            # Save a trained model
            if (args.local_rank == -1 or torch.distributed.get_rank() == 0):
                my_logger.info(
                    "** ** * Saving fine-tuned model and optimizer ** ** * ")
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(
                    args.output_dir, "model.{0}.bin".format(i_epoch))
                torch.save(model_to_save.state_dict(), output_model_file)
                output_optim_file = os.path.join(
                    args.output_dir, "optim.{0}.bin".format(i_epoch))
                torch.save(optimizer.state_dict(), output_optim_file)
                if args.fp16:
                    output_amp_file = os.path.join(
                        args.output_dir, "amp.{0}.bin".format(i_epoch))
                    torch.save(amp.state_dict(), output_amp_file)
                output_sched_file = os.path.join(
                    args.output_dir, "sched.{0}.bin".format(i_epoch))
                torch.save(scheduler.state_dict(), output_sched_file)

                my_logger.info("***** CUDA.empty_cache() *****")
                torch.cuda.empty_cache()

            if args.do_eval:
                # do_eval
                # Switch off dropout while measuring the validation loss.
                model.eval()
                iter_dev = tqdm(dev_dataloader,
                                desc='Iter (loss=X.XXX)',
                                disable=args.local_rank not in (-1, 0))
                val_losses = []
                for batch in iter_dev:
                    with torch.no_grad():
                        batch = [
                            t.to(device) if t is not None else None
                            for t in batch
                        ]
                        input_ids, segment_ids, answer_tag, input_mask, lm_label_ids, masked_pos, masked_weights, _ = batch
                        masked_dev_loss = model(input_ids,
                                                segment_ids,
                                                answer_tag,
                                                input_mask,
                                                lm_label_ids,
                                                masked_pos=masked_pos,
                                                masked_weights=masked_weights)
                        # Under DataParallel the loss holds one value per GPU;
                        # mean() reduces it (and is a no-op on a scalar).
                        val_losses.append(masked_dev_loss.mean().item())
                # Back to training mode for the next epoch.
                model.train()
                val_loss = np.mean(val_losses)
                print(
                    "Epoch {} - final loss : {:.4f} - val loss :{:.4f}".format(
                        i_epoch, final_loss, val_loss))
Example #27
0
def main():
    """Fine-tune and/or evaluate a BERT-based NER tagger from the command line.

    The function parses the CLI arguments, selects the device (single process,
    ``DataParallel`` or distributed), builds the ``Ner`` model together with an
    AdamW optimizer and a linear warmup schedule, and then:

    * with ``--do_train``: trains on the examples returned by the task
      processor and writes the model, the tokenizer and a
      ``model_config.json`` file to ``--output_dir``;
    * without ``--do_train``: reloads model and tokenizer from
      ``--output_dir``;
    * with ``--do_eval`` (only on the main process when distributed):
      evaluates on the dev or test split selected by ``--eval_on``, logs the
      classification report and writes it to ``eval_results.txt``.

    Raises:
        ValueError: if ``--gradient_accumulation_steps`` is below 1, if
            neither ``--do_train`` nor ``--do_eval`` is given, if training is
            requested into an existing non-empty output directory, if the
            task name is unknown, or if ``--eval_on`` is neither ``dev`` nor
            ``test``.
        ImportError: if ``--fp16`` is requested but apex cannot be imported.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval or not.")
    parser.add_argument("--eval_on",
                        default="dev",
                        help="Whether to run eval on the dev set or test set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    # Device selection: local_rank == -1 means a single (possibly multi-GPU)
    # process; any other value means one process per GPU.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # --train_batch_size is the effective batch size; each forward/backward
    # pass only sees this per-step share of it.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    # Label ids start at 1 (see label_map below), so one extra class is
    # needed for id 0.
    num_labels = len(label_list) + 1
    label_map = {i: label for i, label in enumerate(label_list, 1)}

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        steps_per_epoch = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps)
        # --num_train_epochs is a float; keep the step count an integer so the
        # scheduler and the log line receive an int rather than e.g. 300.0.
        num_train_optimization_steps = int(steps_per_epoch *
                                           args.num_train_epochs)
        if args.local_rank != -1:
            num_train_optimization_steps = (
                num_train_optimization_steps //
                torch.distributed.get_world_size())

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name)
    model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    # Parameters whose name contains one of these substrings are excluded
    # from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_valid_ids, all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                # Note the argument order: segment ids are passed before the
                # attention mask.
                loss = model(input_ids, segment_ids, input_mask, label_ids,
                             valid_ids, l_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                # Only update the weights once enough micro-batches have been
                # accumulated.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        # Use a context manager so the config file is flushed and closed
        # deterministically.
        config_path = os.path.join(args.output_dir, "model_config.json")
        with open(config_path, "w") as config_file:
            json.dump(model_config, config_file)
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    # In distributed mode only the rank-0 process evaluates.
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        y_true = []
        y_pred = []
        # The highest label id acts as the end-of-sequence marker below
        # (presumably the "[SEP]" label - confirm against NerProcessor).
        end_label_id = len(label_map)
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids,
                               segment_ids,
                               input_mask,
                               valid_ids=valid_ids,
                               attention_mask_label=l_mask)

            # Predicted label id per token position.
            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                gold_tags = []
                pred_tags = []
                for j, gold_id in enumerate(label):
                    if j == 0:
                        # Position 0 is skipped (presumably the "[CLS]" slot).
                        continue
                    elif gold_id == end_label_id:
                        # A sequence is only recorded once its end marker is
                        # reached.
                        y_true.append(gold_tags)
                        y_pred.append(pred_tags)
                        break
                    else:
                        gold_tags.append(label_map[gold_id])
                        pred_tags.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
def main():
    """Fine-tune and/or evaluate an OpenAI GPT double-heads model from the CLI.

    The function parses the CLI arguments, loads the tokenizer and
    ``OpenAIGPTDoubleHeadsModel`` named by ``--model_name``, registers the
    ``<CLS>``/``<SEP>``/``<PAD>``/``<EOS>`` special tokens, encodes the train
    and eval datasets and then:

    * with ``--do_train``: trains with AdamW and a linear warmup schedule,
      saves weights, config and vocabulary to ``--output_dir`` and reloads
      the model from there;
    * with ``--do_eval``: computes the multiple-choice loss and accuracy on
      the eval set and writes the results to ``eval_results.txt``.

    Raises:
        ValueError: if neither ``--do_train`` nor ``--do_eval`` is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # The special tokens added here get new embeddings (the embedding matrix
    # is resized below), which are then fine-tuned together with the model.
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_special_tokens({
        'cls_token': '<CLS>',
        'sep_token': '<SEP>',
        'pad_token': '<PAD>',
        'eos_token': '<EOS>'
    })
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    # Order matters: these ids are forwarded positionally to
    # pre_process_datasets below.
    special_tokens_ids = [
        tokenizer.convert_tokens_to_ids(special_token)
        for special_token in ['<PAD>', '<CLS>', '<SEP>', '<EOS>']
    ]
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        # NOTE(review): the downloaded path is never used afterwards; the
        # loaders below still receive the (empty) CLI values - confirm intent.
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    # Longest (truncated) story plus its longer continuation, plus 3 extra
    # positions (presumably for the special tokens - confirm against
    # pre_process_datasets).
    input_length = max(
        len(story[:max_length]) +
        max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets
        for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        # NOTE(review): --gradient_accumulation_steps and --max_steps only
        # influence t_total / the epoch count here; the training loop below
        # steps the optimizer on every batch and never stops at max_steps.
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps //\
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader)\
                // args.gradient_accumulation_steps * args.num_train_epochs

        # Parameters whose name contains one of these substrings are excluded
        # from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            # Reset per epoch: the reported train loss covers the last epoch.
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                # Weighted sum of the language-modeling loss and the
                # multiple-choice loss.
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                # The optimizer must step before the scheduler; the reverse
                # order skips the first value of the learning rate schedule.
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                step_loss = loss.item()
                tr_loss += step_loss
                # Exponential moving average, used only for the progress bar.
                exp_average_loss = (step_loss if exp_average_loss is None else
                                    0.7 * exp_average_loss + 0.3 * step_loss)
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids,
                                                 lm_labels, mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Averages are undefined (reported as None) when nothing was
        # processed, instead of failing with ZeroDivisionError.
        eval_loss = eval_loss / nb_eval_steps if nb_eval_steps else None
        eval_accuracy = (eval_accuracy / nb_eval_examples
                         if nb_eval_examples else None)
        train_loss = (tr_loss / nb_tr_steps
                      if args.do_train and nb_tr_steps else None)
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def _train(args):
    """Fine-tune a sequence classifier, then export the best checkpoint.

    The function runs in three phases:

    1. Build config, tokenizer and model from ``args['model_name']`` and
       train for ``args['num_train_epochs']`` epochs, writing a
       ``checkpoint-<global_step>`` directory under ``args['output_dir']``
       every ``args['save_steps']`` optimizer steps.
    2. Reload every checkpoint found (recursively) under
       ``args['output_dir']`` and score it with ``evaluate``; the checkpoint
       with the highest ``'matthews_corrcoef'`` wins.
    3. Save the winning model to ``args['model_dir']``. If no checkpoint was
       found, the in-memory model from phase 1 is saved instead.

    Args:
        args: dict of settings. Keys read here: ``model_type``,
            ``model_name``, ``task_name``, ``train_batch_size``,
            ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_steps``, ``fp16``, ``max_grad_norm``, ``logging_steps``,
            ``save_steps``, ``output_dir`` and ``model_dir``.

    Returns:
        None. Results are the files written to the checkpoint directories
        and to ``args['model_dir']``.
    """
    # initialization
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        args['model_type']]

    config = config_class.from_pretrained(args['model_name'],
                                          num_labels=2,
                                          finetuning_task=args['task_name'])
    tokenizer = tokenizer_class.from_pretrained(args['model_name'])
    # Hand the config to the model; otherwise the num_labels/finetuning_task
    # settings built above are silently ignored.
    model = model_class.from_pretrained(args['model_name'], config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("Training: use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model)
    model.to(device)

    logger.info("Loading  dataset")
    train_dataset = load_and_cache_examples(args, tokenizer, False)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args['train_batch_size'])
    print("len(train_dataloader) " + str(len(train_dataloader)))
    # Number of optimizer updates over the whole run (used by the schedule).
    t_total = len(train_dataloader) // args[
        'gradient_accumulation_steps'] * args['num_train_epochs']

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args['weight_decay']
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args['learning_rate'],
                      eps=args['adam_epsilon'])
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args['warmup_steps'],
                                     t_total=t_total)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d",
                args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)
    print("args ", args)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()

    for _ in range(args['num_train_epochs']):
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                'token_type_ids':
                batch[2] if args['model_type'] in ['bert', 'xlnet'] else
                None,  # XLM don't use segment_ids
                'labels':
                batch[3]
            }
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see
            # doc); .mean() collapses the per-GPU losses under DataParallel.
            loss = outputs[0].mean()
            print("\r%f" % loss, end='')

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                # NOTE(review): `amp` is not imported or initialized inside
                # this function; this branch presumably relies on module-level
                # apex setup -- confirm before enabling fp16.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args['max_grad_norm'])

            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args['max_grad_norm'])

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                # Apply the update first, then advance the learning-rate
                # schedule; the reverse order skips the first scheduled value.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args['logging_steps'] > 0 and global_step % args[
                        'logging_steps'] == 0:

                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args[
                        'save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args['output_dir'],
                        'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

    logger.info("starting evaluating ")
    # Every directory below output_dir that contains a weights file counts as
    # a checkpoint candidate.
    checkpoints = list(
        os.path.dirname(c) for c in sorted(
            glob.glob(args['output_dir'] + '/**/' + WEIGHTS_NAME,
                      recursive=True)))
    logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
        logging.WARN)  # Reduce logging
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    print("Evaluate the following checkpoints: ", checkpoints)
    best_result = None
    best_checkpoint = None
    best_model = None
    results = []
    for checkpoint in checkpoints:
        # The step suffix only disambiguates metric names when several
        # checkpoints are compared.
        step_tag = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        candidate = model_class.from_pretrained(checkpoint)
        candidate.to(device)
        result = evaluate(candidate, tokenizer, prefix=step_tag)

        logger.info(" result,{%s}", result)
        if best_result is None or result['matthews_corrcoef'] > best_result[
                'matthews_corrcoef']:
            best_result = result
            best_checkpoint = checkpoint
            logger.info("best result, Saving model checkpoint to %s",
                        best_checkpoint)
            print('best checkpoint ', best_checkpoint, ' eval result ', result)

        result = dict(
            (k + '_{}'.format(step_tag), v) for k, v in result.items())
        results.append(result)

    # save best model
    if best_checkpoint is None:
        # No checkpoint was written (e.g. save_steps disabled or never
        # reached): keep the model that was just trained instead of crashing
        # on from_pretrained(None).
        logger.warning(
            "No checkpoint found under %s; saving the final trained model",
            args['output_dir'])
        best_model = model
    else:
        best_model = model_class.from_pretrained(best_checkpoint)
    model_to_save = best_model.module if hasattr(
        best_model,
        'module') else best_model  # Take care of distributed/parallel training
    # Create the target directory up front so saving does not depend on it
    # already existing.
    os.makedirs(args['model_dir'], exist_ok=True)
    model_to_save.save_pretrained(args['model_dir'])
Example #30
0
def train(args, train_dataset, model, tokenizer):
    """Train the model.

    Runs a fine-tuning loop with AdamW and a linear warmup/decay schedule,
    optional apex fp16, optional multi-GPU ``DataParallel``, TensorBoard
    logging and periodic checkpointing.

    Args:
        args: namespace of settings. Attributes read here include
            ``per_gpu_train_batch_size``, ``n_gpu``, ``max_steps``,
            ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_steps``, ``fp16``, ``fp16_opt_level``, ``device``,
            ``max_grad_norm``, ``logging_steps``, ``save_steps``,
            ``evaluate_during_training`` and ``output_dir``.
            ``train_batch_size`` (and ``num_train_epochs`` when
            ``max_steps > 0``) are written back onto ``args``.
        train_dataset: dataset whose batches unpack into nine tensors, in the
            order consumed by the ``inputs`` dict below.
        model: the model to train; called with the keyword arguments listed
            in ``inputs``.
        tokenizer: forwarded to ``evaluate`` when evaluating during training.

    Returns:
        A ``(global_step, average_loss)`` tuple, where ``average_loss`` is the
        accumulated training loss divided by the number of optimizer steps
        (``0.0`` if no optimizer step was taken).

    Raises:
        ImportError: if ``args.fp16`` is set and apex is not installed.
    """
    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        # A step budget overrides the epoch count: derive enough epochs to
        # cover max_steps optimizer updates.
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use fp16 training.'
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info('***** Running training *****')
    logger.info('   Num examples = %d', len(train_dataset))
    logger.info('   Num Epochs = %d', args.num_train_epochs)
    logger.info('   Instantaneous batch size per GPU = %d',
                args.per_gpu_train_batch_size)
    logger.info('   Total train batch size (w. parallel & accumulation) = %d',
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info('   Gradient Accumulation steps = %d',
                args.gradient_accumulation_steps)
    logger.info('   Total optimization steps = %d', t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc='Epoch')
    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc='Iteration')
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3],
                'ct_clf_input_ids': batch[4],
                'ct_clf_attention_mask': batch[5],
                'ct_clf_token_type_ids': batch[6],
                'categories': batch[7],
                'hand_features': batch[8]
            }
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch_transformers (see doc)
            loss, clf_loss = outputs[0][0], outputs[1][0]

            # The two losses are optimized jointly as an unweighted sum.
            total_loss = loss + clf_loss
            if args.n_gpu > 1:
                # DataParallel returns one loss per GPU; average them.
                total_loss = total_loss.mean()
            if args.gradient_accumulation_steps > 1:
                total_loss = total_loss / args.gradient_accumulation_steps

            if args.fp16:
                # amp.scale_loss takes the loss and the optimizer as two
                # separate arguments.
                with amp.scale_loss(total_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += total_loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:
                        result = evaluate(args, model, tokenizer)
                        for key, value in result.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    # Mean loss over the steps since the previous log point.
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Unwrap DataParallel so the bare model is serialized.
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    # Keep the training arguments next to the weights they
                    # produced rather than in the current working directory.
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info('Saving model checkpoint to %s', output_dir)

            # Stop after exactly max_steps optimizer updates, matching the
            # t_total the schedule was built with.
            if args.max_steps > 0 and global_step >= args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step >= args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()
    # Guard against a run in which no optimizer step was taken.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss