Example #1
    @dataclass
    class MyTrainingArguments(TrainingArguments):
        evaluation_strategy: EvaluationStrategy = field(
            default=EvaluationStrategy.STEPS)
        learning_rate: float = field(default=1e-4)
        per_device_train_batch_size: int = field(default=32)
        per_device_eval_batch_size: int = field(default=32)
        num_train_epochs: float = field(default=10.0)
        save_total_limit: int = field(default=5)
        masking_probability: float = field(default=None)
        replacement_probability: float = field(default=None)
        select_labels: bool = field(default=False)

    parser = HfArgumentParser((MyTrainingArguments),
                              description="Traing script.")
    parser.add_argument("data_config_name",
                        nargs="?",
                        default="NER",
                        choices=["NER", "ROLES", "BORING", "PANELIZATION"],
                        help="Name of the dataset configuration to use.")
    parser.add_argument("--dataset_path",
                        help="The dataset to use for training.")
    parser.add_argument(
        "--no_cache",
        action="store_true",
        help="Flag that forces re-downloading the dataset rather than re-using it from the cache."
    )
    training_args, args = parser.parse_args_into_dataclasses()
    no_cache = args.no_cache
    data_config_name = args.data_config_name
    dataset_path = args.dataset_path
    output_dir_path = Path(training_args.output_dir) / data_config_name
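
What comes after this snippet is not shown. Below is a minimal sketch of how the parsed values could be used, assuming the dataset is loaded with datasets.load_dataset and that --no_cache maps to a forced re-download; the download_mode strings and the mkdir call are assumptions, not part of the original code.

    from datasets import load_dataset

    # Sketch only: force a fresh download when --no_cache was passed,
    # otherwise reuse the locally cached dataset.
    download_mode = "force_redownload" if no_cache else "reuse_dataset_if_exists"
    dataset = load_dataset(dataset_path, data_config_name, download_mode=download_mode)
    output_dir_path.mkdir(parents=True, exist_ok=True)
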
Example #2
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataProcessingArguments, TrainingArguments))
    model_args, dataprocessing_args, training_args = parser.parse_args_into_dataclasses()

    # For now, let's merge all the sets of args into one,
    # but soon, we'll keep distinct sets of args, with a cleaner separation of concerns.
    args = argparse.Namespace(**vars(model_args), **vars(dataprocessing_args),
                              **vars(training_args))

    parser.add_argument('--freeze_bert', action='store_true')
    parser.add_argument('--prune_train', type=float, default=0.0)
    parser.add_argument('--prune_eval', type=float, default=0.0)
    parser.add_argument('--prune',
                        type=str,
                        default='random',
                        help="default=random, global, l1")
    parser.add_argument('--prune_layers', type=str, default='')
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    args = parser.parse_args()

    print('Args:', args)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    # config = AutoConfig.from_pretrained(
    #     args.config_name if args.config_name else args.model_name_or_path,
    #     num_labels=num_labels,
    #     finetuning_task=args.task_name,
    #     cache_dir=args.cache_dir,
    # )
    # tokenizer = AutoTokenizer.from_pretrained(
    #     args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
    # )
    # model = AutoModelForSequenceClassification.from_pretrained(
    #     args.model_name_or_path,
    #     from_tf=bool(".ckpt" in args.model_name_or_path),
    #     config=config,
    #     cache_dir=args.cache_dir,
    # )
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)

    print('Model Size:')
    for mod_name, module in list(model.named_modules()):
        size = sum([
            np.prod(p.size())
            for p in filter(lambda p: p.requires_grad, module.parameters())
        ])
        print(mod_name, size)
        # for name, value in list(module.named_parameters()):
        #     print(mod_name, name)

    if args.freeze_bert:
        print('Freezing bert weights')
        for name, param in model.bert.named_parameters():
            if param.requires_grad:
                param.requires_grad = False
                print(name)

    if args.local_rank == 0:
        # End of barrier: the other ranks blocked above can now load the cached model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        # model = AutoModelForSequenceClassification.from_pretrained(args.output_dir)
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                "/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            # model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)

            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
    return results
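
The set_seed helper called near the top of main() is not shown in the snippet. In the Hugging Face GLUE example scripts it typically seeds Python, NumPy, and PyTorch from args.seed; a sketch under that assumption:

import random

import numpy as np
import torch


def set_seed(args):
    # Seed every RNG in play so training runs are reproducible across CPU and GPU.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
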
Example #3
    @dataclass
    class MyTrainingArguments(TrainingArguments):
        output_dir: str = field(default=LM_MODEL_PATH)
        overwrite_output_dir: bool = field(default=True)
        logging_steps: int = field(default=2000)
        evaluation_strategy: EvaluationStrategy = field(
            default=EvaluationStrategy.STEPS)
        per_device_train_batch_size: int = field(default=16)
        per_device_eval_batch_size: int = field(default=16)
        save_total_limit: int = field(default=5)

    parser = HfArgumentParser((MyTrainingArguments),
                              description="Traing script.")
    parser.add_argument("data_config_name",
                        nargs="?",
                        default="MLM",
                        choices=["MLM", "DET", "VERB", "SMALL"],
                        help="Name of the dataset configuration to use.")
    parser.add_argument("--dataset_path",
                        help="The dataset to use for training.")
    parser.add_argument(
        "--no_cache",
        action="store_true",
        help="Flag that forces re-downloading the dataset rather than re-using it from the cache."
    )
    training_args, args = parser.parse_args_into_dataclasses()
    no_cache = args.no_cache
    dataset_path = args.dataset_path
    data_config_name = args.data_config_name
    output_dir_path = Path(training_args.output_dir)
    else:

        def compute_metrics_function(eval_pred: EvalPrediction) -> Dict:
            predictions, labels = eval_pred
            predictions = predictions[:, 0]
            return metric.compute(predictions=predictions, references=labels)

    return compute_metrics_function
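
# Hedged sketch, not part of the original snippet: num_labels_from_task (used in the
# __main__ block below) is assumed to map a GLUE task name to its number of labels.
def num_labels_from_task(task: str) -> int:
    if task == "stsb":
        return 1  # STS-B is a regression task with a single output
    if task.startswith("mnli"):
        return 3  # MNLI: entailment / neutral / contradiction
    return 2  # the remaining GLUE tasks are binary classification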


if __name__ == "__main__":

    parser = HfArgumentParser(TrainingArguments)
    parser.add_argument("--task",
                        default="cola",
                        help="name of GLUE task to compute")
    parser.add_argument("--model_checkpoint",
                        default="distilbert-base-uncased")
    training_args, args = parser.parse_args_into_dataclasses()

    transformers.logging.set_verbosity_debug()

    task: str = args.task.lower()

    num_labels = num_labels_from_task(task)

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_checkpoint, num_labels=num_labels)

    tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint,