Example #1
def main(args):
    init_logger()
    tokenizer = load_tokenizer(args)
    train_dataset = None if args.do_predict else load_and_cache_examples(
        args, tokenizer, mode="train")
    dev_dataset = None
    test_dataset = None if args.do_predict else load_and_cache_examples(
        args, tokenizer, mode="test")

    if args.do_train:
        trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
        trainer.train()

    if args.do_eval:
        trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
        trainer.load_model()
        trainer.evaluate("test")

    if args.do_predict:
        predict = Predict(args, tokenizer)
        predict.load_model()

        sentences = [args.sentence]
        result_json = dict()
        result_json['result'] = int(predict.predict(sentences))
        print(json.dumps(result_json, ensure_ascii=False))
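# --- Illustrative driver (an assumption, not part of the example): a minimal argparse setup
# --- supplying the flags Example #1 reads. Only the flags visible above are shown; the real
# --- script would define many more (model paths, batch sizes, data directories, etc.).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--do_train", action="store_true", help="run training")
    parser.add_argument("--do_eval", action="store_true", help="evaluate on the test split")
    parser.add_argument("--do_predict", action="store_true", help="predict a single sentence")
    parser.add_argument("--sentence", type=str, default="", help="input sentence for --do_predict")
    main(parser.parse_args())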
Example #2
def main(args):
    init_logger()
    set_seed(args)

    if args.logger:
        neptune.init("wjdghks950/NumericHGN")
        neptune.create_experiment(name="({}) NumHGN_{}_{}_{}".format(
            args.task, args.train_batch_size, args.max_seq_len,
            args.train_file))
        neptune.append_tag("BertForSequenceClassification", "finetuning",
                           "num_augmented_HGN")

    tokenizer = load_tokenizer(args)
    train_dataset = dev_dataset = test_dataset = None
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    # test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = ParaSelectorTrainer(args, train_dataset, dev_dataset)

    if args.do_train:
        trainer.train()
        trainer.save_model()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("dev")

    if args.logger:
        neptune.stop()
Example #3
def main(args):
    init_logger()
    set_seed(args)

    tokenizer = load_tokenizer(args)

    train_dataset = None
    dev_dataset = None
    test_dataset = None

    if args.do_train or args.do_eval:
        test_dataset = load_and_cache_examples(args, tokenizer, mode="test")
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, mode="train")

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
    predictor = Predict(args)

    if args.do_train:
        trainer.train()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test", "eval")

    if args.do_predict:
        predictor.predict()
Example #4
def main(args):
# def main():
    init_logger()  # set up logging
    # With a fixed random seed, training results remain reproducible across runs
    set_seed(args)
    tokenizer = load_tokenizer(args)  # load the pretrained tokenizer
    device_ = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")  # build cached features
    dev_dataset   = load_and_cache_examples(args, tokenizer, mode="dev")
    test_dataset  = load_and_cache_examples(args, tokenizer, mode="test")

    # snips_tensors is a 2-D matrix holding the CLS output for each sentence in the SNIPS dataset
    '''
        Data format: TensorDataset(all_corpus, all_input_ids, all_attention_mask,
                                   all_token_type_ids, all_intent_label_ids, all_slot_labels_ids)
    '''

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
    # trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    if args.do_train:
        trainer.train()
        trainer.evaluate("test")
Example #5
def main(CFG, args):
    random.seed(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)
    torch.cuda.manual_seed_all(CFG.seed)
    os.environ["PYTHONHASHSEED"] = str(CFG.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # benchmark=True would reintroduce nondeterminism

    #Set Tokenizer and Model
    tokenizer = transformers.AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
    tokenizer.add_tokens(['<e1>', '</e1>', '<e2>', '</e2>'],
                         special_tokens=True)
    # model.resize_token_embeddings(tokenizer.vocab_size + 4)
    print(tokenizer)
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    print(train_dataset)
    print(test_dataset)

    trainer = Trainer(CFG,
                      args,
                      train_dataset=train_dataset,
                      test_dataset=test_dataset)

    if args.do_train:
        trainer.train()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")
Example #6
def main(args):
    init_logger()
    tokenizer = load_tokenizer(args)

    train_dataset = None
    dev_dataset = None
    test_dataset = None

    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
        dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")

    if args.do_eval:
        test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    if args.do_train:
        trainer.train()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")

    if args.do_pred:
        trainer.load_model()
        texts = read_prediction_text(args)
        trainer.predict(texts, tokenizer)
Example #7
def main():
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # ------------------ Select CUDA / CPU mode ----------------------
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        n_gpu = 1

    # Produce data
    train_batch_size = args.per_gpu_train_batch_size * max(1, n_gpu)
    eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu)

    train_iter = load_and_cache_examples(mode='train',
                                         train_batch_size=train_batch_size,
                                         eval_batch_size=eval_batch_size)
    eval_iter = load_and_cache_examples(mode='dev',
                                        train_batch_size=train_batch_size,
                                        eval_batch_size=eval_batch_size)

    #epoch_size = num_train_steps * train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs

    # pbar = ProgressBar(epoch_size=epoch_size,
    #                    batch_size=train_batch_size)

    if args.model_type == 'bert':
        model = Bert_SenAnalysis.from_pretrained(args.bert_model, num_tag=len(args.labels))
    elif args.model_type == 'xlnet':
        config = XLNetConfig.from_pretrained(args.xlnet_model, num_labels=len(args.labels))
        model = XLNet_SenAnalysis.from_pretrained(args.xlnet_model, config=config)
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    train_iter = cycle(train_iter)
    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        #train_steps=args.train_steps,
        #pbar=pbar,
        num_train_steps=args.train_steps,
        device=device,
        n_gpu=n_gpu,
        verbose=1)
Example #8
def main(args):
    if os.path.exists(args.model_dir) and len(os.listdir(args.model_dir)) > 0:
        print("The model output path '%s' already exists and is not empty." %
              args.model_dir)
        return

    init_logger(args)
    set_seed(args)
    tokenizer = load_tokenizer(args.model_name_or_path)
    logger.info("******* Running with the following arguments *********")
    for a in vars(args):
        logger.info(a + " = " + str(getattr(args, a)))
    logger.info("***********************************************")
    train_dataset, train_examples = load_and_cache_examples(args,
                                                            tokenizer,
                                                            mode="train")
    train_examples = dict([(example.guid, example)
                           for example in train_examples])
    dev_dataset, dev_examples = load_and_cache_examples(args,
                                                        tokenizer,
                                                        mode="dev")
    dev_examples = dict([(example.guid, example) for example in dev_examples])
    test_dataset, test_examples = load_and_cache_examples(args,
                                                          tokenizer,
                                                          mode="test")
    test_examples = dict([(example.guid, example)
                          for example in test_examples])

    if args.align_languages:
        alignment_dataset = generate_alignment_pairs(args=args)
    else:
        alignment_dataset = None

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset,
                      train_examples, dev_examples, test_examples, tokenizer,
                      alignment_dataset)

    if args.do_train:
        trainer.load_model(final_eval=False)
        logger.info(trainer.model)
        trainer.train()
        if args.task == Tasks.PAWS_X.value:
            trainer.evaluate_pair("dev", exp_name=args.model_dir)
        else:
            trainer.evaluate_xnlu("dev", exp_name=args.model_dir)
        if args.save_model:
            trainer.save_model()

    if args.do_eval:
        if not args.do_train:
            trainer.load_model(final_eval=True)
        if args.task == Tasks.PAWS_X.value:
            trainer.evaluate_pair("dev", exp_name=args.model_dir)
            trainer.evaluate_pair("test", exp_name=args.model_dir)
        else:
            trainer.evaluate_xnlu("dev", exp_name=args.model_dir)
            trainer.evaluate_xnlu("test", exp_name=args.model_dir)
Example #9
def main(args):
    init_logger()  #输出信息
    tokenizer = load_tokenizer(args)  # 加载预训练模型

    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    # if args.do_train:
    # trainer.train()
    # if args.do_eval:
    # trainer.load_model()
    # trainer.evaluate("test")

    @app.route('/pred_term', methods=['GET', 'POST'])
    def get_data():
        if request.method == 'POST':
            argsJson = request.data.decode('utf-8')
            argsJson = json.loads(argsJson)
            (title, texts), = argsJson.items()
            # Tokenize with jieba
            jieba_text = " ".join(jieba.cut(texts, cut_all=False))
            jieba_text = jieba_text.split()
            jieba_word_dict = {}
            for i in jieba_text:
                if i not in jieba_word_dict:
                    jieba_word_dict[i] = 1
                else:
                    jieba_word_dict[i] += 1

            # Terminology recognition
            texts = " ".join(texts)
            texts = texts.split('。')
            if len(texts[-1]) == 0:
                texts = texts[:-1]
            slot_preds_list = trainer.predict(texts, tokenizer)
            new_texts = []
            for t in texts:
                new_texts.append(t.strip().split())
            # print(new_texts)
            term_weight = get_tf_idf(new_texts, slot_preds_list,
                                     jieba_word_dict)
            term_weight = json.dumps(term_weight, ensure_ascii=False)
            return term_weight
        else:
            return " 'it's not a POST operation! "

    if args.do_pred:
        trainer.load_model()
        # texts = read_prediction_text(args)
        app.run(host='0.0.0.0', port=5001)
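# --- Illustrative client for the /pred_term service above (an assumption, not part of the
# --- original example). The handler expects a JSON body with exactly one {title: text} pair
# --- and returns a term -> TF-IDF weight mapping as JSON; host and port follow app.run() above.
import json
import requests

payload = {"示例标题": "深度学习模型可以识别文本中的术语。"}
resp = requests.post("http://localhost:5001/pred_term", data=json.dumps(payload))
print(resp.text)  # JSON mapping of recognized terms to their TF-IDF weights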
Example #10
def main():
    config = Config('config.ini')
    init_logger()
    tokenizer = load_tokenizer(config)
    train_dataset = load_and_cache_examples(config, tokenizer, evaluate=False)
    test_dataset = load_and_cache_examples(config, tokenizer, evaluate=True)
    trainer = Trainer(config, train_dataset, test_dataset)

    if config.do_train:
        trainer.train()

    if config.do_eval:
        trainer.evaluate()
Example #11
def main(args):
    init_logger()
    tokenizer = load_tokenizer(args)
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = None
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")
    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    if args.do_train:
        trainer.train()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")
Example #12
    def predict(self):
        if self.args.local_rank in [-1, 0]:
            model, tokenizer = self._prepare_model(args=self.args, labels=self.labels,
                                                   num_labels=self.num_labels, mode='predict',
                                                   model_dir=self.args.output_dir)
            test_dataset = load_and_cache_examples(args=self.args,
                                                   tokenizer=tokenizer,
                                                   labels=self.labels,
                                                   pad_token_label_id=self.pad_token_label_id,
                                                   mode='test')
            result, predictions = self._evaluate(self.args, model, test_dataset, self.labels,
                                                 self.pad_token_label_id, mode='test', prefix="")

            output_test_results_file = os.path.join(self.args.output_dir, "test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(result.keys()):
                    writer.write("{} = {}\n".format(key, str(result[key])))
            # Save predictions
            output_test_predictions_file = os.path.join(self.args.output_dir, "test_predictions.txt")
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(self.args.data_dir, "test.txt"), "r") as f:
                    example_id = 0
                    for line in f:
                        if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                            writer.write(line)
                            if not predictions[example_id]:
                                example_id += 1
                        elif predictions[example_id]:
                            output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
                            writer.write(output_line)
                        else:
                            logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
Example #13
def main(args):
    '''
    the main process of SRGLHRE
    '''
    init_logger()
    tokenizer = load_tokenizer(args)

    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args,
                      train_dataset=train_dataset,
                      test_dataset=test_dataset)

    if args.do_train:
        trainer.train()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate('test')
Example #14
def main(args):
    init_logger(args)
    set_seed(args)
    tokenizer = load_tokenizer(args)

    train_dataset = load_and_cache_examples(args, tokenizer)

    trainer = Trainer(args, train_dataset)

    if args.do_train:
        trainer.train()
Example #15
def main(args):
    init_logger()
    set_seed(args)
    tokenizer = load_tokenizer(args)

    print("PREPROCESSING TRAIN DATA")
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")

    print("PREPROCESSING TRAIN DATA")
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    # test_dataset = load_and_cache_examples(args, tokenizer, mode="test")
    print("LOAD TRAINER")
    trainer = Trainer(args, train_dataset, dev_dataset)
    print("================TRAIN==============")
    if args.do_train:
        trainer.train()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("dev")
Example #16
def main(args):
    init_logger()
    set_seed(args)
    tokenizer = load_tokenizer(args)

    wandb.init(project='R-BERT', name='R-BERT w/ One-Hot')

    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args,
                      train_dataset=train_dataset,
                      test_dataset=test_dataset)

    if args.do_train:
        trainer.train()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")
Example #17
 def train(self):
     model, tokenizer = self._prepare_model(args=self.args, labels=self.labels,
                                            num_labels=self.num_labels, mode='train',
                                            model_dir=self.args.model_name_or_path)
     train_dataset = load_and_cache_examples(args=self.args,
                                             tokenizer=tokenizer,
                                             labels=self.labels,
                                             pad_token_label_id=self.pad_token_label_id,
                                             mode="train")
     global_step, tr_loss = self._train(self.args, train_dataset, model, tokenizer,
                                        self.labels, self.pad_token_label_id)
     logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
Example #18
def main(args):
    init_logger()
    set_seed(args)
    tokenizer = load_tokenizer(args)
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    test_dataset, dataset_id = load_and_cache_examples(args, tokenizer, mode="test")
    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    if args.do_train:
        trainer.train(mode="train")
    if args.do_dev:
        trainer.train(mode="dev")

    if args.do_eval:
        trainer.load_model()
        results = trainer.evaluate("test")
        print("dataset_id : ", dataset_id)
        print("results : ", results)
        results = [[data_id, result] for (data_id, result) in zip(dataset_id, results)]
        print(results)
        write_csvFile(os.path.join(args.data_dir, "result.csv"), results)
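# --- write_csvFile is not shown in this example; a minimal sketch of such a helper
# --- (hypothetical, named only to match the call above) using the standard csv module:
import csv

def write_csvFile(path, rows):
    """Write an iterable of [data_id, result] rows to a CSV file."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)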
Example #19
 def evaluate(self):
     # TODO: evaluating all checkpoints will be implemented later
     if self.args.local_rank in [-1, 0]:
         model, tokenizer = self._prepare_model(args=self.args, labels=self.labels,
                                                num_labels=self.num_labels, mode='eval',
                                                model_dir=self.args.output_dir)
         eval_dataset = load_and_cache_examples(args=self.args,
                                                tokenizer=tokenizer,
                                                labels=self.labels,
                                                pad_token_label_id=self.pad_token_label_id,
                                                mode='dev')
         result, _ = self._evaluate(self.args, model, eval_dataset, self.labels, self.pad_token_label_id,
                                    mode='dev', prefix="")
         self.results.update(result)
         output_eval_file = os.path.join(self.args.output_dir, "eval_results.txt")
         with open(output_eval_file, "w") as writer:
             for key in sorted(self.results.keys()):
                 writer.write("{} = {}\n".format(key, str(self.results[key])))
Example #20
    def evaluate(self):
        args = self.args
        eval_dataset, examples, features = load_and_cache_examples(args, self.tokenizer, evaluate=True, output_examples=True)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu evaluate
        if args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        if not os.path.exists(args.model_dir):
            raise Exception("Model doesn't exists! Train first!")
            
        model_name = (args.model_dir).split("/")[-1]
        print("model name: {}".format(model_name))
        
        logger.info("***** Running evaluation {} *****".format(model_name))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_results = []
        start_time = timeit.default_timer()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            self.model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }
                example_indices = batch[3]
                
                outputs = self.model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [output[i].detach().cpu().tolist() for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        evalTime = timeit.default_timer() - start_time
        logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))

        # Compute predictions
        output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(model_name))
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(model_name))

        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            self.tokenizer,
            output_prediction_file,
            output_nbest_file,
        )

        # Compute the F1 and exact scores.
        results = squad_evaluate(examples, predictions)
        logger.info(results)
        
        return results
Example #21
def main(cli_args):
    # Read from config file and make args
    config_filename = "{}.json".format(cli_args.taxonomy)
    with open(os.path.join("config", config_filename)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

    init_logger()
    set_seed(args)

    processor = GoEmotionsProcessor(args)
    label_list = processor.get_labels()

    config = BertConfig.from_pretrained(
        args.model_name_or_path,
        num_labels=len(label_list),
        finetuning_task=args.task,
        id2label={str(i): label
                  for i, label in enumerate(label_list)},
        label2id={label: i
                  for i, label in enumerate(label_list)})
    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name_or_path)
    model = BertForMultiLabelClassification.from_pretrained(
        args.model_name_or_path, config=config)

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available(
    ) and not args.no_cuda else "cpu"
    model.to(args.device)

    # Load dataset
    train_dataset = load_and_cache_examples(
        args, tokenizer, mode="train") if args.train_file else None
    dev_dataset = load_and_cache_examples(
        args, tokenizer, mode="dev") if args.dev_file else None
    test_dataset = load_and_cache_examples(
        args, tokenizer, mode="test") if args.test_file else None

    if dev_dataset is None:
        args.evaluate_test_during_training = True  # If there is no dev dataset, only use test dataset

    if args.do_train:
        global_step, tr_loss = train(args, model, tokenizer, train_dataset,
                                     dev_dataset, test_dataset)
        logger.info(" global_step = {}, average loss = {}".format(
            global_step, tr_loss))

    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + "/**/" + "pytorch_model.bin",
                          recursive=True)))
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN)  # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1]
            model = BertForMultiLabelClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args,
                              model,
                              test_dataset,
                              mode="test",
                              global_step=global_step)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))
Example #22
    def _train(self, args, train_dataset, model, tokenizer, labels, pad_token_label_id):
        """ Train the model """
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
        train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
        )

        # Check if saved optimizer or scheduler states exist
        if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
            scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Distributed training (should be after apex fp16 initialization)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
            )

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            args.train_batch_size
            * args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if os.path.exists(args.model_name_or_path):
            # set global_step to global_step of last saved checkpoint from model path
            try:
                global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
            except ValueError:
                global_step = 0
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(
            epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
        )
        set_seed(args)  # Added here for reproducibility
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
            for step, batch in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                model.train()
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "xlnet"] else None
                    )  # XLM and RoBERTa don't use segment_ids

                outputs = model(**inputs)
                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        if (
                                args.local_rank == -1 and args.evaluate_during_training
                        ):  # Only evaluate when single GPU otherwise metrics may not average well

                            # TODO: decide whether to cache this dataset or reload it every time
                            eval_dataset = load_and_cache_examples(args=self.args,
                                                                   tokenizer=tokenizer,
                                                                   labels=self.labels,
                                                                   pad_token_label_id=self.pad_token_label_id,
                                                                   mode='dev')

                            results, _ = self._evaluate(args, model, eval_dataset, labels, pad_token_label_id, mode="dev", prefix="")
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                        logging_loss = tr_loss

                    if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = (
                            model.module if hasattr(model, "module") else model
                        )  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)

                        torch.save(args, os.path.join(output_dir, "training_args.bin"))
                        logger.info("Saving model checkpoint to %s", output_dir)

                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)

                if args.max_steps > 0 and global_step > args.max_steps:
                    epoch_iterator.close()
                    break
            if args.max_steps > 0 and global_step > args.max_steps:
                train_iterator.close()
                break

        if args.local_rank in [-1, 0]:
            tb_writer.close()

        # Saves the last model when ended.
        self._save_model(args, model, tokenizer)

        return global_step, tr_loss / global_step
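# --- Minimal, self-contained sketch of the gradient-accumulation pattern used in _train above
# --- (illustrative only: a toy linear model and random data stand in for BERT and the real loader).
import torch
from torch.utils.data import DataLoader, TensorDataset

accumulation_steps, max_grad_norm = 4, 1.0
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 1.0)
loader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,))), batch_size=8)

model.zero_grad()
for step, (x, y) in enumerate(loader):
    # Scale the loss so gradients accumulated over several mini-batches average correctly
    loss = torch.nn.functional.cross_entropy(model(x), y) / accumulation_steps
    loss.backward()
    if (step + 1) % accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # advance the LR schedule once per optimizer update
        model.zero_grad()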
Example #23
    def train(self):
        args = self.args
        
        train_dataset = load_and_cache_examples(args, self.tokenizer, evaluate=False, output_examples=False)

        args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        num_warmup_steps = t_total * args.warmup_proportion
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total
        )

        # Check if saved optimizer or scheduler states exist
        if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
            scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

        # multi-gpu training
        if args.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            args.train_batch_size * args.gradient_accumulation_steps,
        )
        logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 1
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if os.path.exists(args.model_name_or_path):
            try:
                # set global_step to global_step of last saved checkpoint from model path
                checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
                steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

                logger.info("  Continuing training from checkpoint, will skip to saved global_step")
                logger.info("  Continuing training from epoch %d", epochs_trained)
                logger.info("  Continuing training from global step %d", global_step)
                logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
            except ValueError:
                logger.info("  Starting fine-tuning.")

        tr_loss, logging_loss = 0.0, 0.0
        self.model.zero_grad()
        train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch")
        
        # Added here for reproducibility
        set_seed(args)

        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                self.model.train()
                batch = tuple(t.to(args.device) for t in batch)

                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "start_positions": batch[3],
                    "end_positions": batch[4],
                }


                outputs = self.model(**inputs)
                # model outputs are always tuple in transformers (see doc)
                loss = outputs[0]

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1
                        
                    if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                        self.save_model(global_step)

                if args.max_steps > 0 and global_step > args.max_steps:
                    epoch_iterator.close()
                    break
            if args.max_steps > 0 and global_step > args.max_steps:
                train_iterator.close()
                break
        
        self.args.model_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
        self.save_model(global_step)
        return global_step, tr_loss / global_step
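# --- For reference, the linear warmup/decay schedule built above (get_linear_schedule_with_warmup
# --- with num_warmup_steps = t_total * warmup_proportion) multiplies the base LR by roughly the
# --- factor below. This is a sketch of the schedule's shape, not the transformers implementation.
def linear_warmup_decay_factor(step, num_warmup_steps, num_training_steps):
    """LR multiplier: ramps 0 -> 1 over the warmup steps, then decays linearly to 0 at num_training_steps."""
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    return max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))

# e.g. linear_warmup_decay_factor(50, 100, 1000) == 0.5 (halfway through warmup)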