Code Example #1
        data_collator=collator,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluate each checkpoint
    dev_results = {}
    test_results = {}
    checkpoints_list = trainer._sorted_checkpoints()

    logger.info("Saved Checkpoints:")
    logger.info(str(checkpoints_list))

    cnt = 0
    max_dev_f1 = -0.1
    max_f1_checkpoint_name = None

    for checkpoint_name in checkpoints_list:
        # Each checkpoint name is already a full path, so
        # os.path.join(training_args.output_dir, ...) is not needed.
        path = checkpoint_name
        model_new = SeqClassModel.from_pretrained(
            path,
            params_dict=config_dict,
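
Code Example #1 breaks off mid-call, but the surrounding variables (dev_results, max_dev_f1, max_f1_checkpoint_name) make the intent clear: reload each saved checkpoint and keep the one with the best dev-set F1. The following is a minimal, self-contained sketch of that selection step; select_best_checkpoint, evaluate_checkpoint, and the "eval_f1" metric key are illustrative assumptions, not part of the original script.

def select_best_checkpoint(checkpoints, evaluate_checkpoint):
    """Pick the checkpoint with the highest dev F1 (hypothetical helper)."""
    dev_results = {}
    max_dev_f1 = -0.1  # sentinel below any real F1 score
    max_f1_checkpoint_name = None
    for path in checkpoints:
        metrics = evaluate_checkpoint(path)  # e.g. wraps Trainer.evaluate()
        dev_results[path] = metrics
        if metrics["eval_f1"] > max_dev_f1:
            max_dev_f1 = metrics["eval_f1"]
            max_f1_checkpoint_name = path
    return max_f1_checkpoint_name, max_dev_f1, dev_results
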
Code Example #2
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    hf_logging.set_verbosity_info()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        logger.info(
            f"Output dir ({training_args.output_dir}) is not empty, will try to reload from there."
        )
        model_args.model_name_or_path = training_args.output_dir
        # raise ValueError(
        #     f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        # )

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from "
            "another script, save it, and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logger.info(model)
    num_params = sum(p.numel() for p in model.parameters())
    logger.info("Model has %d parameters", num_params)
    num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info("Model has %d trainable parameters", num_trainable)

    # ADD special tokens
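    # GPT-2-style tokenizers ship without a pad token, so EOS is reused as
    # padding here; the attention mask keeps pad positions from affecting
    # the model.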
    tokenizer.pad_token = tokenizer.eos_token
    special_tokens_dict = {
        'additional_special_tokens':
        ['<STORY>', '<QUERY>', '<PROOF>', '<ANSWER>']
    }
    # NOTE: should also have added "ent_1", "ent_2", ..., "ent_20" :/
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    logger.info(f'We have added {num_added_toks} tokens')
    '''
    if tokenizer.pad_token_id is None and data_args.line_by_line:
    # See PR 3388. Some tokenizers don't have pad tokens, which causes errors at the encoding step in the collate_fn.
        # We give here the option to force the addition of a pad token. The attention mask is used to ignore this token
        # when feeding to the model.
        # tokenizer.pad_token = tokenizer.eos_token
        num_added_toks = tokenizer.add_special_tokens({"pad_token": "<pad>"})
    '''

    # Resize the embedding matrix so the newly added special tokens get
    # embedding rows; their ids fall beyond the original vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    if (config.model_type in ["bert", "roberta", "distilbert", "camembert"]
            and not data_args.mlm):
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.model_max_length
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size,
                                   tokenizer.model_max_length)

    # Get datasets
    train_dataset = (get_dataset(
        data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args,
                                tokenizer=tokenizer,
                                evaluate=True,
                                cache_dir=model_args.cache_dir) if
                    (training_args.do_eval
                     or training_args.evaluate_during_training) else None)
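    # XLNet is pretrained with permutation language modeling, so it needs a
    # dedicated collator; all other model types use the standard LM collator
    # (masked or causal, depending on --mlm).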
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # start by saving tokenizer so that we can restart training!
    # if trainer.is_world_master():
    #     tokenizer.save_pretrained(training_args.output_dir)

    results = {}
    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        logger.info(f"model_path: {model_path}")
        if model_path is not None:
            # Grab the most recent checkpoint
            checkpoints_sorted = trainer._sorted_checkpoints(use_mtime=True)
            assert len(checkpoints_sorted) > 0
            checkpoint_most_recent = checkpoints_sorted[-1]
            logger.info(
                f"Most recent checkpoint: {checkpoint_most_recent}; setting model_path to it."
            )
            # TODO: find a way to set:
            # - patience_best_eval_loss = None
            # - patience_evals_without_improvement = 0
            # - patience_should_stop = False
            model_path = checkpoint_most_recent
        train_results = trainer.train(model_path=model_path)
        results["train_step"] = train_results.global_step
        results["train_loss"] = train_results.training_loss
        results["train_ppl"] = math.exp(train_results.training_loss)

        # trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        # if trainer.is_world_master():
        #     tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()
        results["valid_loss"] = eval_output["eval_loss"]
        results["valid_ppl"] = math.exp(eval_output["eval_loss"])

    output_eval_file = os.path.join(training_args.output_dir, "results_lm.txt")
    if trainer.is_world_master():
        with open(output_eval_file, "w") as writer:
            logger.info("***** results *****")
            for key in sorted(results.keys()):
                logger.info("  %s = %s", key, str(results[key]))
                writer.write("%s = %s\n" % (key, str(results[key])))

    return results
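
The excerpt ends at the return statement. A script in this style would normally close with the standard entry-point guard; a minimal sketch follows, with an illustrative command line in the comment (the script filename and data paths are placeholders; the flags correspond to the dataclass arguments parsed above).

if __name__ == "__main__":
    # Illustrative invocation (filename and paths are hypothetical):
    #   python run_lm.py --model_name_or_path gpt2 \
    #       --train_data_file train.txt --eval_data_file valid.txt \
    #       --output_dir out --do_train --do_eval
    main()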