Example #1
def load_args():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    wandb = False  # default so `wandb` is defined even when args come from JSON or the CLI
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    elif len(sys.argv) == 1:
        # If we pass no args to the script then load args from .yaml
        with open('config/seq2seq/seq2seq_base.yaml', 'r') as f:
            all_args = yaml.load(f, Loader=yaml.FullLoader)
        # Also load user-specified args and override base args
        with open('config/seq2seq/train.yaml', 'r') as f:
            user_args = yaml.load(f, Loader=yaml.FullLoader)
        all_args.update(user_args)
        wandb = all_args.pop('wandb')
        model_args, data_args, training_args = parser.parse_dict(all_args)
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if 'tmp' in training_args.output_dir:
        training_args.overwrite_output_dir = True
        wandb = False

    training_args.learning_rate = float(training_args.learning_rate)
    os.environ["WANDB_DISABLED"] = "" if wandb else "true"
    
    return model_args, data_args, training_args
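The helper above merges a base YAML config with user overrides, strips the extra `wandb` flag, and hands the flat dict to `HfArgumentParser.parse_dict`. A minimal sketch of that pattern, assuming hypothetical keys and a single dataclass (the real keys come from the three dataclasses passed to the parser):

import yaml
from transformers import HfArgumentParser, Seq2SeqTrainingArguments

with open("config/seq2seq/seq2seq_base.yaml") as f:
    all_args = yaml.safe_load(f)        # e.g. {"output_dir": "out", "learning_rate": "5e-5", "wandb": False}
with open("config/seq2seq/train.yaml") as f:
    all_args.update(yaml.safe_load(f))  # user values override the base values

all_args.pop("wandb", None)             # keys unknown to the dataclasses must be removed first
(training_args,) = HfArgumentParser(Seq2SeqTrainingArguments).parse_dict(all_args)
# YAML 1.1 reads "5e-5" as a string, which is why the function above casts learning_rate to float.
training_args.learning_rate = float(training_args.learning_rate)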
Example #2
    def train(
        self,
        examples: List[InputExample],
        epochs=3,
        batch_size=16,
        seed=42,
    ):
        features, f_word_maps = TokenClassificationTask.parse_examples(
            examples,
            tokenizer=self.tokenizer,
            label2id=self.label2id,
            model_type=self.config.model_type,
            max_seq_length=self.max_seq_length,
            ignore_sub_tokens_labes=self.ignore_sub_tokens_labes,
            spliting_strategy=self.spliting_strategy,
            sentence_strategy=self.sentence_strategy,
        )

        train_dataset = TokenClassificationDataset(features)

        parser = HfArgumentParser(TrainingArguments)
        training_args = parser.parse_dict({
            "output_dir": self.output_dir,
            "num_train_epochs": epochs,
            "per_device_train_batch_size": bath_size,
            "seed": seed,
            "save_total_limit": 0,
        })[0]

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            compute_metrics=self.metric_function(),
        )

        training_result = trainer.train()
        logger.debug(training_result)

        trainer.save_model()
        self.tokenizer.save_pretrained(self.output_dir)

        with open(f'{self.output_dir}/settings.json', 'w') as outfile:
            json.dump(
                {
                    "labels": self.labels,
                    "ignore_sub_tokens_labes": self.ignore_sub_tokens_labes,
                    "spliting_strategy": self.spliting_strategy,
                    "sentence_strategy": self.sentence_strategy,
                    "prediction_strategy": self.prediction_strategy,
                }, outfile)
        self.model.to('cpu')

        return training_result
Example #3
    def test_parse_dict(self):
        parser = HfArgumentParser(BasicExample)

        args_dict = {
            "foo": 12,
            "bar": 3.14,
            "baz": "42",
            "flag": True,
        }

        parsed_args = parser.parse_dict(args_dict)[0]
        args = BasicExample(**args_dict)
        self.assertEqual(parsed_args, args)
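The test compares the tuple element returned by `parse_dict` against a directly constructed dataclass, so `BasicExample` only needs fields matching the dict. A plausible definition (hypothetical; the real one lives elsewhere in the test module):

from dataclasses import dataclass

@dataclass
class BasicExample:
    foo: int
    bar: float
    baz: str
    flag: bool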
Example #4
def convert_to_prunable_checkpoint(checkpoint_folder, experiment):
    """
    This loads a dense model's weights and a prunable model of similar architecture (one
    with SparseWeightsBase layers), copies the weights of the former into the latter,
    and then saves a new checkpoint at `{checkpoint_folder}_prunable`.

    :param checkpoint_folder: path to dense checkpoint
    :param experiment: name of experiment config with a prunable architecture
    """

    # We'll use `sparsity=0` to ensure it's a dense but prunable model.
    exp_config = CONFIGS[experiment]
    exp_config["config_kwargs"]["sparsity"] = 0
    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]

    # Initialize prunable model and dense model.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    prunable_model = AutoModelForMaskedLM.from_config(config)
    prunable_model.resize_token_embeddings(len(tokenizer))

    dense_model = AutoModelForMaskedLM.from_pretrained(checkpoint_folder)

    # Determine which parameters belong to SparseWeightsBase classes.
    sparse_params = filter_params(prunable_model,
                                  include_modules=[SparseWeightsBase])
    sparse_dataptrs = [p.data_ptr() for p in sparse_params.values()]

    # Load the dense params into the prunable params.
    for n2, p2 in prunable_model.named_parameters():

        # e.g. replace `linear.module.weight` with `linear.weight` when appropriate.
        if p2.data_ptr() in sparse_dataptrs:
            n1 = n2.replace(".module", "")
        else:
            n1 = n2

        p1 = get_module_attr(dense_model, n1)
        p2.data[:] = p1

    # Save the prunable model.
    new_folder_name = checkpoint_folder + "_prunable"
    prunable_model.save_pretrained(new_folder_name)
    print(f"Saved prunable model to:\n{new_folder_name}")
Example #5
    def train(self):
        """
        Train the model. This method must be implemented.
        :return:
        """
        config = HyperParametersConfig(epochs=self.args.EPOCHS,
                                       batch_size=self.args.BATCH)

        parser = HfArgumentParser(
            (ModelArguments, DataArguments, TrainingArguments))
        # config_dict = HyperParametersConfig().__dict__
        # print(config_dict)
        model_args, data_args, training_args = parser.parse_dict(
            config.__dict__)

        logger.info("Load pre-training model.")
        tokenizer = BertTokenizer.from_pretrained(
            model_args.model_name_or_path)
        model = CustomGPTGeneration.from_pretrained(
            model_args.model_name_or_path)

        # Get datasets
        logger.info("Loading dataset.")
        train_dataset = PsychologicalQADataset(
            data_args.dataset_path,
            tokenizer=tokenizer,
            max_sequence_len=data_args.max_sequence_len)

        logger.info("Initialize Trainer.")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=None,
        )

        logger.info("Training start.")
        if training_args.do_train:
            trainer.train()
            trainer.save_model()
            if trainer.is_world_process_zero():
                tokenizer.save_pretrained(training_args.output_dir)
Example #6
    def __init__(self, param_dict):
        # See all possible arguments in src/transformers/training_args.py
        # or by passing the --help flag to this script.
        # We now keep distinct sets of args, for a cleaner separation of concerns.
        arguments = copy.deepcopy(self.ARGUMENTS)
        self.arguments_names = list(arguments.keys())
        parser = HfArgumentParser(arguments.values())
        parse_results = parser.parse_dict(param_dict)  #, strict=True)

        assert self.arguments_names[0] == "model"
        assert self.arguments_names[1] == "data"
        assert self.arguments_names[2] == "training"

        # Explicitly assign the args so the IDE does not flag the members as unknown
        self.model_args = parse_results[0]
        self.data_args = parse_results[1]
        self.training_args = parse_results[2]

        for i, (k, v) in enumerate(arguments.items()):
            if i < 3:
                continue
            setattr(self, k + "_args", parse_results[i])
Example #7
def main():
    cmd_parser = argparse.ArgumentParser()
    cmd_parser.add_argument("experiments",
                            nargs="+",
                            choices=list(CONFIGS.keys()),
                            help="Available experiments")
    cmd_parser.add_argument("--local_rank",
                            default=None,
                            help="added by torch.distributed.launch")

    cmd_args = cmd_parser.parse_args()

    for experiment in cmd_args.experiments:
        config_dict = CONFIGS[experiment]
        local_rank = int(cmd_args.local_rank or -1)
        config_dict["local_rank"] = local_rank

        # See all possible arguments in transformers/training_args.py and ./run_args.py
        exp_parser = HfArgumentParser(
            (ModelArguments, DataTrainingArguments, CustomTrainingArguments))
        model_args, data_args, training_args = exp_parser.parse_dict(
            config_dict)

        # Overrides default behavior of TrainingArguments of setting run name
        # equal to output_dir when not available
        if training_args.run_name == training_args.output_dir:
            training_args.run_name = experiment
        # Run name (or experiment name) is added to the output_dir
        training_args.output_dir = os.path.join(training_args.output_dir,
                                                training_args.run_name)

        # Initialize wandb now to include the logs that follow.
        # For now, only support early wandb logging when running one experiment.
        distributed_initialized = torch.distributed.is_initialized()
        rank = -1 if not distributed_initialized else torch.distributed.get_rank()
        if is_wandb_available() and len(cmd_args.experiments) == 1:
            CustomWandbCallback.early_init(training_args, rank)

        # Detecting last checkpoint.
        last_checkpoint = None
        if (os.path.isdir(training_args.output_dir) and training_args.do_train
                and not training_args.overwrite_output_dir):
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
            logging.warning(f"Loading from checkpoint: {last_checkpoint} ")
            if (last_checkpoint is None
                    and len(os.listdir(training_args.output_dir)) > 0):
                raise ValueError(
                    f"Output directory ({training_args.output_dir}) already exists and "
                    "is not empty. Use --overwrite_output_dir to overcome.")
            elif last_checkpoint is not None:
                logging.info(
                    f"Checkpoint detected, resuming training at {last_checkpoint}. To "
                    "avoid this behavior, change the `--output_dir` or add "
                    "`--overwrite_output_dir` to train from scratch.")

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            handlers=[logging.StreamHandler(sys.stdout)],
            level=(logging.INFO if is_main_process(training_args.local_rank)
                   else logging.WARN))

        # Log config.
        logging.info(f"Running with config:\n{pformat(config_dict, indent=4)}")

        # Log on each process the small summary:
        logging.warning(
            f"Process rank: {training_args.local_rank}, "
            f"device: {training_args.device}, n_gpu: {training_args.n_gpu} "
            f"distributed training: {bool(training_args.local_rank != -1)}, "
            f"16-bits training: {training_args.fp16}")
        # Set the verbosity to info of the Transformers logging (on main process only):
        if is_main_process(training_args.local_rank):
            transformers.utils.logging.set_verbosity_info()
            transformers.utils.logging.enable_default_handler()
            transformers.utils.logging.enable_explicit_format()
        logging.info("Training/evaluation parameters %s", training_args)
        logging.info("Model parameters: %s", model_args)
        logging.info("Data parameters: %s", data_args)

        # Set seed before initializing model.
        set_seed(training_args.seed)
        logging.info(f"Seed to reproduce: {training_args.seed}")

        if model_args.finetuning:
            run_finetuning_multiple_tasks(model_args,
                                          data_args,
                                          training_args,
                                          last_checkpoint=last_checkpoint)
        else:
            run_pretraining(model_args,
                            data_args,
                            training_args,
                            last_checkpoint=last_checkpoint)

        # destroy process group before launching another experiment
        if cmd_args.local_rank:
            torch.distributed.destroy_process_group()
Example #8
def run(args=None, training_args=None):
    if args is not None and training_args is not None:
        parser = HfArgumentParser(TrainScriptArguments)
        args = parser.parse_dict(args)[0]
        parser = HfArgumentParser(TrainingArguments)
        training_args = parser.parse_dict(training_args)[0]
    else:
        parser = HfArgumentParser((TrainScriptArguments, TrainingArguments))
        args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )

    set_seed(training_args.seed)

    # Setup wandb
    os.environ["WANDB_PROJECT"] = args.wandb_project
    if args.is_dryrun:
        os.environ["WANDB_MODE"] = "dryrun"

    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    tokenizer_name = args.model_name if args.tokenizer_name is None else args.tokenizer_name

    # TODO: Fix this hard-coded stuff
    if args.model_name == "bert-base-cased":
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            args.model_name, args.model_name)
    else:
        model = NAME_TO_MODEL[args.model_name].from_pretrained(args.model_name)
    tokenizer = NAME_TO_TOK[tokenizer_name].from_pretrained(tokenizer_name)

    logger.info(f"Path {args.train_data_path}")
    train_data = torch.load(
        args.train_data_path) if training_args.do_train else None
    test_data = torch.load(
        args.test_data_path) if training_args.do_eval else None

    # Resizes the train set to args.data_size percentage
    logger.info(f"Train-data pre-size: {train_data.num_rows}")
    if args.absolute_data_size:
        num_rows = args.absolute_data_size
    else:
        num_rows = int(train_data.num_rows * (args.data_size / 100))

    # The select takes time even when selecting all rows. Do this check first.
    if args.absolute_data_size or args.data_size < 100:
        # Shuffle train_data before re-sizing. Controlling with seed
        # Make sure to override the cache file as it does not care about data size
        train_data = train_data.shuffle(seed=training_args.seed,
                                        keep_in_memory=True,
                                        load_from_cache_file=False)
        train_data = train_data.select(torch.arange(0, num_rows),
                                       keep_in_memory=True,
                                       load_from_cache_file=False)

        # For some reason, Dataset.select() and Dataset.shuffle() resets format
        fields = ["source_ids", "target_ids", "attention_mask"]
        train_data.set_format(type="torch", columns=fields)
        test_data.set_format(type="torch", columns=fields)

    logger.info(f"Train-data size: {train_data.num_rows}")

    collator = DataCollator(tokenizer=tokenizer,
                            is_training=training_args.do_train,
                            tpu=training_args.tpu_num_cores is not None)

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_data,
                      eval_dataset=test_data,
                      data_collator=collator,
                      prediction_loss_only=True)

    if training_args.do_train:
        trainer.train()
        trainer.save_model()
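Since `run` falls back to `parse_args_into_dataclasses` only when no dicts are given, it can also be driven programmatically. A hypothetical call (field names taken from the attributes used above; values are illustrative, and any remaining required fields of TrainScriptArguments/TrainingArguments would have to be supplied too):

run(
    args={
        "model_name": "t5-small",
        "wandb_project": "my-project",
        "is_dryrun": True,
        "train_data_path": "data/train.pt",
        "test_data_path": "data/test.pt",
        "data_size": 100,
        "absolute_data_size": None,
    },
    training_args={"output_dir": "out", "do_train": True, "do_eval": False},
)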
Example #9
def main():
    # See all possible arguments in src/transformers/training_args.py or by passing
    # the --help flag to this script. We now keep distinct sets of args, for a cleaner
    # separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments,
         AdapterTrainingArguments))

    # When running on multiple GPUs with torch.distributed.launch, a local_rank parameter is added.
    # To allow the parser to still use the config file, we add the local_rank to the config file.
    if len(sys.argv) == 3 and sys.argv[1].startswith(
            "--local_rank") and sys.argv[2].endswith(".json"):
        args_dict = json.loads(Path(sys.argv[2]).read_text())
        args_dict.update({'local_rank': int(sys.argv[1].split('=')[-1])})
        model_args, data_args, training_args, adapter_args = parser.parse_dict(
            args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses(
        )

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = T5Config.from_pretrained(
        model_args.config_name if model_args.config_name else \
            model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout", "fixed_length_emb",
                          "encoder_projection", "encoder_pooling",
                          "projection_length", "only_projection_bottleneck",
                          "concat_projection_token", "train_adapters")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    # Gets the adapter config and updates the specified parameters.
    if training_args.train_adapters:
        adapter_config = AutoAdapterConfig.get(
            adapter_args.adapter_config_name)
        adapter_config.input_dim = config.d_model
        adapter_config.tasks = data_args.tasks
        extra_adapter_params = (
            "task_embedding_dir", "task_embedding_dim",
            "add_layer_norm_before_adapter", "add_layer_norm_after_adapter",
            "reduction_factor", "hidden_dim", "non_linearity",
            "train_task_embeddings", "projected_task_embedding_dim",
            "add_adapters_in_decoder", "add_adapter_in_feed_forward",
            "add_adapter_in_self_attention", "task_hidden_dim",
            "conditional_layer_norm", "one_layer_adapter_hyper_net",
            "adapter_hyper_net_with_bias",
            "one_layer_adapter_hyper_net_with_linear",
            "parametric_task_embedding", "conditional_layer_norm_for_T5",
            "train_adapters_blocks", "remove_original_layer_norms",
            "unique_hyper_net", "unique_hyper_net_layer_norm")
        for p in extra_adapter_params:
            if hasattr(adapter_args, p) and hasattr(adapter_config, p):
                setattr(adapter_config, p, getattr(adapter_args, p))
            else:
                logger.warning(
                    f"({adapter_config.__class__.__name__}) doesn't have a `{p}` attribute"
                )
        adapter_config.device = training_args.device
    else:
        adapter_config = None

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else \
            model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    if model_args.not_load_t5_checkpoint:
        model = T5ForConditionalGeneration(config=config,
                                           adapter_config=adapter_config)
    else:
        model = T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path,
            from_tf=".ckpt" in model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
            adapter_config=adapter_config)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # freezing the parameters.
    if training_args.do_train:
        freezing_params(model, training_args, model_args, adapter_args)

    if training_args.print_num_parameters:
        logger.info(model)
        for name, param in model.named_parameters():
            if param.requires_grad:
                logger.info("Parameter name %s", name)
        total_trainable_params = sum(p.numel() for p in model.parameters()
                                     if p.requires_grad)
        logger.info("Total trainable parameters %s", total_trainable_params)
    # Gets the training/test/validation datasets.
    dataset_class = AutoTask
    if training_args.do_train:
        train_datasets = [
            dataset_class.get(task, seed=data_args.data_seed).get_dataset(
                split="train",
                n_obs=data_args.n_train,
                add_prefix=False if training_args.train_adapters else True)
            for task in data_args.tasks
        ]
        dataset_sizes = [
            len(train_dataset) for train_dataset in train_datasets
        ]
        train_dataset = datasets.concatenate_datasets(train_datasets)
    training_args.remove_unused_columns = False
    eval_datasets = ({
        task: dataset_class.get(task, seed=data_args.data_seed).get_dataset(
            split="validation",
            n_obs=data_args.n_val,
            add_prefix=False if training_args.train_adapters else True,
            split_validation_test=training_args.split_validation_test)
        for task in data_args.eval_tasks
    } if training_args.do_eval or
                     training_args.evaluation_strategy != EvaluationStrategy.NO
                     else None)
    test_dataset = ({
        task: dataset_class.get(task, seed=data_args.data_seed).get_dataset(
            split="test",
            n_obs=data_args.n_test,
            add_prefix=False if training_args.train_adapters else True,
            split_validation_test=training_args.split_validation_test)
        for task in data_args.eval_tasks
    } if training_args.do_test else None)
    # Defines the metrics for evaluation.
    compute_metrics_fn = (build_compute_metrics_fn(data_args.eval_tasks,
                                                   tokenizer)
                          if training_args.predict_with_generate else None)
    # Defines the trainer.
    trainer = T5Trainer(
        model=model,
        config=config,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_datasets,
        data_collator=TaskCollator(tokenizer,
                                   data_args,
                                   tpu_num_cores=training_args.tpu_num_cores),
        compute_metrics=None,
        multi_task_compute_metrics=compute_metrics_fn,
        data_args=data_args,
        dataset_sizes=dataset_sizes if training_args.do_train else None,
        callbacks=[T5CheckpointCallback()],
        adapter_config=adapter_config)
    if trainer.is_world_process_zero():
        arguments = get_training_args(
            [model_args, data_args, training_args, adapter_args])
        handle_metrics("arguments", arguments, training_args.output_dir,
                       training_args.gcs_bucket)

    # Trains the model.
    if training_args.do_train:
        trainer.train(
            model_path=get_last_checkpoint_path(training_args.output_dir) \
                if (os.path.isdir(training_args.output_dir) and not training_args.optimize_from_scratch) else None,
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))
            tokenizer.save_pretrained(training_args.output_dir)
            if training_args.save_task_embeddings:
                for task, task_embedding in model.task_embedding_controller.task_to_embeddings.items():
                    create_dir(training_args.save_task_embeddings_dir)
                    np.save(
                        os.path.join(training_args.save_task_embeddings_dir,
                                     '{}.npy'.format(task)),
                        task_embedding.data.detach().cpu().numpy())

    # Evaluation
    all_metrics = {}
    if training_args.do_eval or training_args.do_test:
        if trainer.is_world_process_zero():
            # By default we load the model from the last checkpoint path.
            # When saving the model with the best metrics, make sure to set
            # save_total_limit = 1 so that the best model is loaded here.
            # If no checkpoint exists, this returns the path to the output_dir.
            last_checkpoint_path = get_last_checkpoint_path(
                training_args.output_dir)
            config = T5Config.from_pretrained(last_checkpoint_path,
                                              cache_dir=model_args.cache_dir)
            model = T5ForConditionalGeneration.from_pretrained(
                last_checkpoint_path,
                from_tf=".ckpt" in training_args.output_dir,
                config=config,
                cache_dir=model_args.cache_dir,
                adapter_config=adapter_config)
            # NOTE: if the trainer is not re-defined here, a bug in the Hugging Face code
            # prevents the best checkpoint from being used.
            trainer = T5Trainer(model=model,
                                config=config,
                                args=training_args,
                                train_dataset=train_dataset
                                if training_args.do_train else None,
                                eval_dataset=eval_datasets,
                                data_collator=TaskCollator(
                                    tokenizer,
                                    data_args,
                                    tpu_num_cores=training_args.tpu_num_cores),
                                compute_metrics=None,
                                multi_task_compute_metrics=compute_metrics_fn,
                                data_args=data_args,
                                dataset_sizes=dataset_sizes
                                if training_args.do_train else None,
                                callbacks=[T5CheckpointCallback()],
                                adapter_config=adapter_config)

        if training_args.train_adapters:
            if adapter_args.adapter_config_name == "adapter" and data_args.adapters is not None:
                for name, sub_module in model.named_modules():
                    task_to_adapter = {
                        eval_task: adapter
                        for eval_task, adapter in zip(data_args.eval_tasks,
                                                      data_args.adapters)
                    }
                    if isinstance(sub_module, AdapterController):
                        sub_module.set_task_to_adapter_map(task_to_adapter)
            if adapter_args.adapter_config_name in ["meta-adapter"]:
                # If this is parametric, then the evaluation task should be part of tasks
                # and the embeddings needs to be trained.
                if not adapter_args.parametric_task_embedding:
                    model.task_embedding_controller.set_task_embeddings(
                        eval_datasets.keys(),
                        parametric=adapter_args.parametric_task_embedding)

    if training_args.do_eval:
        metrics = trainer.evaluate(metric_key_prefix="val")
        if trainer.is_world_process_zero():
            handle_metrics("val", metrics, training_args.output_dir,
                           training_args.gcs_bucket)
            all_metrics.update(metrics)

    if training_args.do_test:
        metrics = trainer.evaluate(test_dataset, metric_key_prefix="test")
        if trainer.is_world_process_zero():
            handle_metrics("test", metrics, training_args.output_dir,
                           training_args.gcs_bucket)
            all_metrics.update(metrics)

    return all_metrics
Example #10
    # os.environ["WANDB_DISABLED"] = "false" if args.is_tensorboard else "true"
    os.environ["TRANSFORMERS_CACHE"] = "../huggingface_cache/"
    # if cache does not exist, create one
    if not os.path.exists(os.environ["TRANSFORMERS_CACHE"]):
        os.makedirs(os.environ["TRANSFORMERS_CACHE"])

    training_args = TrainingArguments("tmp_trainer")
    training_args.no_cuda = args.no_cuda
    training_args.per_device_eval_batch_size = args.per_device_eval_batch_size
    training_args.per_gpu_eval_batch_size = args.per_device_eval_batch_size
    training_args_dict = training_args.to_dict()
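    # Presumably: to_dict() exposes the private `_n_gpu` field, which the TrainingArguments
    # constructor does not accept, so it is renamed before the dict is re-parsed below.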
    _n_gpu = training_args_dict["_n_gpu"]
    del training_args_dict["_n_gpu"]
    training_args_dict["n_gpu"] = _n_gpu
    HfParser = HfArgumentParser((TrainingArguments))
    training_args = HfParser.parse_dict(training_args_dict)[0]

    TASK_CONFIG = {"classification": ("text", None)}

    # Load pretrained model and tokenizer
    NUM_LABELS = 3
    MAX_SEQ_LEN = 128
    config = AutoConfig.from_pretrained(args.model_type,
                                        num_labels=3,
                                        finetuning_task=args.task_name,
                                        cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.model_type,
                                              use_fast=False,
                                              cache_dir=args.cache_dir)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_path,
Example #11
def main():
    config = HyperParametersConfig()

    # set_os_environ()
    # if config.do_train and torch.cuda.device_count() > 1:
    #     # Distributed initialization
    #     torch.distributed.init_process_group(backend="nccl", rank=0, world_size=1,
    #                                          init_method='tcp://localhost:7002')

    # set_seed(config.seed)  # already handled inside the Trainer

    parser = HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    # config_dict = HyperParametersConfig().__dict__
    # print(config_dict)
    model_args, data_args, training_args = parser.parse_dict(config.__dict__)

    logger.info("Load pre-training model.")
    tokenizer = BertTokenizer.from_pretrained(model_args.model_name_or_path)
    model = CustomGPTGeneration.from_pretrained(model_args.model_name_or_path)

    # Get datasets
    logger.info("Loading dataset.")
    data = torch.load(data_args.dataset_path)
    train_dataset = ChineseMedicalDataset(
        data=data["train"],
        tokenizer=tokenizer,
        max_sequence_len=data_args.max_sequence_len,
        max_condition_len=data_args.max_condition_len,
        max_target_len=data_args.max_target_len,
        is_right_pad=data_args.is_right_pad,
        is_condition_first=data_args.is_condition_first,
        is_unilm_mask=data_args.is_unilm_mask
    ) if training_args.do_train else None
    valid_dataset = ChineseMedicalDataset(
        data=data["valid"],
        tokenizer=tokenizer,
        max_sequence_len=data_args.max_sequence_len,
        max_condition_len=data_args.max_condition_len,
        max_target_len=data_args.max_target_len,
        is_right_pad=data_args.is_right_pad,
        is_condition_first=data_args.is_condition_first,
        is_unilm_mask=data_args.is_unilm_mask
    ) if training_args.do_eval else None

    logger.info("Initialize Trainer.")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
    )

    logger.info("Training start.")
    if training_args.do_train:
        logger.info("local rank value: {}".format(training_args.local_rank))
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

        # Evaluation
        results = {}
        if training_args.do_eval and training_args.local_rank in [-1, 0]:
            logger.info("*** Evaluate ***")

            eval_output = trainer.evaluate()

            output_eval_file = os.path.join(training_args.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Evaluate results *****")
                for key in sorted(eval_output.keys()):
                    logger.info("{} = {}".format(key, str(eval_output[key])))
                    writer.write("{} = {}\n".format(key,
                                                    str(eval_output[key])))

            results.update(eval_output)

        return results
Example #12
def train_model(dict_args):
    # parse args dict
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_dict(dict_args)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info("Data Arguments %s", data_args)
    logger.info("Model Arguments %s", model_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # load datasets
    assert os.path.exists(data_args.train_file) and os.path.exists(
        data_args.validation_file) and os.path.exists(data_args.test_file)
    datasets = load_dataset("csv",
                            data_files={
                                "train": data_args.train_file,
                                "validation": data_args.validation_file,
                                "test": data_args.test_file
                            },
                            delimiter="\t",
                            cache_dir=model_args.cache_dir)
    logger.info("Datasets %s", datasets)
    logger.info("Column names %s", datasets["train"].column_names)
    logger.info("Sample example %s", datasets["train"][0])

    # get label information
    text_column_name = "text"
    label_column_name = "labels"
    bbox_column_name = "bbox"
    num_labels, label_to_id, id_to_label = get_label_info(
        datasets["train"][label_column_name], data_args.task_name)
    logger.info("num_labels %s", num_labels)
    logger.info("label_to_id %s", label_to_id)
    logger.info("id_to_label %s", id_to_label)

    # Load config, tokenizer and pre-trained model
    # For Distributed training: The .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.
    if data_args.task_name == "regression":
        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir
            # for longformer : May adapt the attention_window=512 (default) in config
        )
    else:
        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            id2label=id_to_label,
            label2id=label_to_id,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir
            # for longformer : May adapt the attention_window=512 (default) in config
        )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        add_prefix_space=True  # for roberta tokenizer
    )

    if data_args.task_name == "ner":
        model = AutoModelForTokenClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir)

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Pre-process the datasets (tokenize words and align labels/bboxes if needed)
    padding = "max_length" if data_args.pad_to_max_length else False
    use_bbox = data_args.use_bbox
    tokenized_datasets = datasets.map(
        lambda x: preprocess_dataset(x, tokenizer, label_to_id, data_args.
                                     label_all_tokens, padding, use_bbox,
                                     data_args.task_name),
        remove_columns=[label_column_name],
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )
    logger.info("Tokenized datasets %s", tokenized_datasets)
    logger.info("Column names %s", tokenized_datasets["train"].column_names)
    logger.info("Sample example %s", tokenized_datasets["train"][0])

    # Data collator. Used to pad the inputs of a single batch to the max size of this batch
    # Not needed if padding has already been done (if pad_to_max_length is true): default_data_collator
    # This does not work with bboxes. Hence pad_to_max_length is always True when using bboxes
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    else:
        if data_args.task_name == "ner":
            data_collator = DataCollatorForTokenClassification(tokenizer)
        else:
            data_collator = None  # will default to DataCollatorWithPadding
    logger.info("Data Collator used %s", data_collator)

    # Initialize our Trainer
    if data_args.task_name != "multilabel-classif":
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"]
            if training_args.do_train else None,
            eval_dataset=tokenized_datasets["validation"]
            if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=lambda x: compute_metrics(x, id_to_label, data_args
                                                      .task_name),
        )
    else:
        trainer = MultilabelClassificationTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"]
            if training_args.do_train else None,
            eval_dataset=tokenized_datasets["validation"]
            if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=lambda x: compute_metrics(x, id_to_label, data_args
                                                      .task_name),
        )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.
            isdir(model_args.model_name_or_path) else None)
        # we save the final model (last or best) to the sagemaker output folder
        trainer.save_model(output_dir=data_args.sagemaker_output_path
                           )  # It saves the tokenizer too for easy upload
        output_train_file = os.path.join(data_args.sagemaker_output_path,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")
            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(data_args.sagemaker_output_path,
                             "trainer_state.json"))
            # We also save the model_args and data_args for future use (training_args are already saved)
            torch.save(
                asdict(model_args),
                os.path.join(data_args.sagemaker_output_path,
                             "model_args.bin"))
            torch.save(
                asdict(data_args),
                os.path.join(data_args.sagemaker_output_path, "data_args.bin"))

    # Evaluation (This will evaluate the final/best model on the dev set and write the results)
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate best/final model on dev set ***")
        results = trainer.evaluate()
        output_eval_file = os.path.join(data_args.sagemaker_output_path,
                                        "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Evaluate and Predict on test set
    if training_args.do_predict:
        logger.info("*** Predict on test set ***")
        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(
            test_dataset, metric_key_prefix="test")
        if data_args.task_name == "classif":
            true_predictions = [
                id_to_label[p] for p in np.argmax(predictions, axis=1)
            ]
        elif data_args.task_name == "multilabel-classif":
            predictions = 1 / (1 + np.exp(-predictions))  # sigmoid
            predictions = (predictions > 0.5)  # threshold
            true_predictions = [[id_to_label[i] for i in np.where(p == 1)[0]]
                                for p in predictions]
        elif data_args.task_name == "regression":
            true_predictions = np.squeeze(predictions)
        elif data_args.task_name == "ner":
            predictions = np.argmax(predictions, axis=2)
            true_predictions = [[
                id_to_label[p] for (p, l) in zip(prediction, label)
                if l != -100
            ] for prediction, label in zip(predictions, labels)]
        output_test_results_file = os.path.join(
            data_args.sagemaker_output_path, "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")
        output_test_predictions_file = os.path.join(
            data_args.sagemaker_output_path, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    if data_args.task_name == "ner":
                        writer.write(" ".join(prediction) + "\n")
                    else:
                        writer.write(str(prediction) + "\n")
    return results
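Because `parse_dict` splits a single flat dict across all three dataclasses by field name, `train_model` can be invoked with one combined dictionary. A hypothetical call (keys drawn from the attributes used above; further required fields, e.g. `sagemaker_output_path`, would need to be included as well):

train_model({
    "model_name_or_path": "bert-base-cased",      # ModelArguments
    "train_file": "train.tsv",                    # DataTrainingArguments
    "validation_file": "dev.tsv",
    "test_file": "test.tsv",
    "task_name": "ner",
    "output_dir": "out",                          # TrainingArguments
    "do_train": True,
    "do_eval": True,
})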
Example #13
def setup(argc=None, **kwargs):
    if argc is None:
        argc = sys.argv[1:]
    parser = HfArgumentParser((
        ModelArguments, DataTrainingArguments,
        DirArguments, TrainingArguments, WindowArguments
    ))
    if (
        isinstance(argc, list) and
        len(argc) == 1 and
        argc[0].endswith('.json')
    ):
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_json_file(argc[0])
        )
    elif isinstance(argc, dict):
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_dict(argc)
        )
    else:
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_args_into_dataclasses()
        )

    if (
        os.path.exists(training_args.output_dir)
        and [f for f in os.listdir(training_args.output_dir) if f != '.gitignore']
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    all_args = {
        'model_args': model_args,
        'data_args': data_args,
        'dir_args': dir_args,
        'training_args': training_args,
        'window_args': window_args,
    }
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config_kwargs = kwargs.pop('config_kwargs', {})
    tokenizer_kwargs = kwargs.pop('tokenizer_kwargs', {})
    model_kwargs = kwargs.pop('model_kwargs', {})

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        **config_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        **tokenizer_kwargs,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        **model_kwargs,
    )

    return all_args, processor, config, tokenizer, model
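`setup` accepts a plain dict, a single-element list pointing at a JSON file, or raw CLI arguments, and forwards the *_kwargs dicts to the config/tokenizer/model constructors. A hypothetical programmatic call (keys are split across the five dataclasses; names and values are illustrative):

all_args, processor, config, tokenizer, model = setup(
    {"task_name": "swag", "model_name_or_path": "bert-base-uncased", "output_dir": "out"},
    config_kwargs={"hidden_dropout_prob": 0.2},
)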
Example #14
def calculate_sparsity_param(sparsity_desired,
                             parameters_desired,
                             experiment,
                             test_sparsity=False):
    """
    :param sparsity_desired: desired sparsity of model
    :param parameters_desired: desired number of on-params;
                               can't be used with sparsity_desired
    :param experiment: name of experiment config with a sparse architecture
    :param test_sparsity: whether to test the calculated sparsity param, this test loads
                          the model and calculates the resulting sparsity.
    """

    # Ensure at least one of sparsity_desired or parameters_desired is specified.
    assert sparsity_desired is not None or parameters_desired is not None

    print(bold("Initializing model... ") + "(this may take a minute)")
    print(f"   experiment: {experiment}")

    # Load and parse model args from config.
    exp_config = CONFIGS[experiment]
    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]
    model_args = replace(model_args, cache_dir=None)  # enable to run locally
    print(bold("\n\nModel parameters:\n") + pdict(model_args.__dict__))
    print()

    # Initialize model.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    model = AutoModelForMaskedLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    print(bold("Calculating target sparsity..."))

    # Get sparse modules and calculate total number of sparsifiable params.
    sparse_modules = filter_modules(model.bert,
                                    include_modules=[SparseWeightsBase])
    sparsifiable_params = 0
    for _, m in sparse_modules.items():
        sparsifiable_params += m.zero_mask.numel()

    # Calculate the total number of params and the needed sparsity.
    total_params, _ = count_nonzero_params(model.bert)

    if parameters_desired is None:
        parameters_desired = total_params * (1 - sparsity_desired)
    elif sparsity_desired is None:
        sparsity_desired = 1 - parameters_desired / total_params

    dense_params = total_params - sparsifiable_params
    target_sparsity = 1 - (parameters_desired -
                           dense_params) / sparsifiable_params

    print(f"   sparsity_desired: {sparsity_desired}")
    print(f"   parameters_desired: {parameters_desired}")
    print(f"   sparsifiable_params: {sparsifiable_params}")
    print(f"   total_params: {total_params}")
    print(f"   target_sparsity: {target_sparsity} (set your sparsity to this)")
    print()

    if not test_sparsity:
        return

    print(bold("Testing target sparsity..."))

    # Edit config to use the new sparsity param (sparsity=target_sparsity).
    exp_config["config_kwargs"]["sparsity"] = target_sparsity
    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]
    model_args = replace(model_args, cache_dir=None)  # remove to run locally

    # Initialize model; this time with the new sparsity param.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    model = AutoModelForMaskedLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    # Set all on-weights to one to make sure none are randomly off.
    sparse_modules = filter_modules(model.bert,
                                    include_modules=[SparseWeightsBase])
    for _, m in sparse_modules.items():
        m.weight.data[:] = 1
    model.apply(rezero_weights)  # set off weights to zero.

    resulting_sparsity = calc_model_sparsity(model.bert)
    _, nz_params = count_nonzero_params(model.bert)
    print(
        f"    Resulting sparsity of model.bert using sparsity={target_sparsity}\n"
        f"       actual_sparsity={resulting_sparsity}\n"
        f"       num_nonzero_params={nz_params}\n")
    print(f"    Note this may not be exactly as desired as there are "
          "discrete levels of allowable sparsity")
    print()
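To make the sparsity arithmetic above concrete, here is a small worked example with made-up layer sizes (the real numbers come from the loaded model):

# Suppose model.bert has 110M parameters, 80M of which sit in SparseWeightsBase
# layers; the remaining 30M are always dense. To end up with 33M on-params in
# total, the sparse layers may keep only 3M of their 80M weights.
total_params        = 110_000_000
sparsifiable_params =  80_000_000
dense_params        = total_params - sparsifiable_params          # 30M
parameters_desired  =  33_000_000
target_sparsity = 1 - (parameters_desired - dense_params) / sparsifiable_params
print(target_sparsity)                                            # 0.9625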
def generate_training_args(args, inoculation_step):
    training_args = TrainingArguments("tmp_trainer")
    training_args.no_cuda = args.no_cuda
    training_args.seed = args.seed
    training_args.do_train = args.do_train
    training_args.do_eval = args.do_eval
    training_args.output_dir = os.path.join(args.output_dir, str(inoculation_step)+"-sample")
    training_args.evaluation_strategy = args.evaluation_strategy # evaluation is done after each epoch
    training_args.metric_for_best_model = args.metric_for_best_model
    training_args.greater_is_better = args.greater_is_better
    training_args.logging_dir = args.logging_dir
    training_args.task_name = args.task_name
    training_args.learning_rate = args.learning_rate
    training_args.per_device_train_batch_size = args.per_device_train_batch_size
    training_args.per_device_eval_batch_size = args.per_device_eval_batch_size
    training_args.num_train_epochs = args.num_train_epochs # this is the maximum num_train_epochs, we set this to be 100.
    training_args.eval_steps = args.eval_steps
    training_args.logging_steps = args.logging_steps
    training_args.load_best_model_at_end = args.load_best_model_at_end
    if args.save_total_limit != -1:
        # only set if it is specified
        training_args.save_total_limit = args.save_total_limit
    import datetime
    date_time = "{}-{}".format(datetime.datetime.now().month, datetime.datetime.now().day)
    run_name = "{0}_{1}_{2}_{3}_mlen_{4}_lr_{5}_seed_{6}_metrics_{7}".format(
        args.run_name,
        args.task_name,
        args.model_type,
        date_time,
        args.max_seq_length,
        args.learning_rate,
        args.seed,
        args.metric_for_best_model
    )
    training_args.run_name = run_name
    training_args_dict = training_args.to_dict()
    # for PR
    _n_gpu = training_args_dict["_n_gpu"]
    del training_args_dict["_n_gpu"]
    training_args_dict["n_gpu"] = _n_gpu
    HfParser = HfArgumentParser((TrainingArguments))
    training_args = HfParser.parse_dict(training_args_dict)[0]

    if args.model_path == "":
        args.model_path = args.model_type
        if args.model_type == "":
            assert False, "you have to provide either model_path or model_type"
    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    return training_args
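
# generate_training_args above round-trips a TrainingArguments object through
# to_dict() and HfArgumentParser.parse_dict; the "_n_gpu"/"n_gpu" rename is
# apparently needed because to_dict() emits that field under its private name.
# A minimal sketch of the same parse_dict pattern follows; the values are
# illustrative and not taken from the original.
from transformers import HfArgumentParser, TrainingArguments

overrides = {
    "output_dir": "tmp_trainer",
    "learning_rate": 2e-5,
    "seed": 42,
    "per_device_train_batch_size": 16,
}
parser = HfArgumentParser(TrainingArguments)
training_args = parser.parse_dict(overrides)[0]
assert training_args.learning_rate == 2e-5
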
def main(args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Set project name
    os.environ["WANDB_PROJECT"] = "multilingual_zeroshot"

    num_labels = 3
    labels = ['entailment', 'neutral', 'contradiction']

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = MBartConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        dropout=model_args.dropout,
        attention_dropout=model_args.attention_dropout,
        finetuning_task="mnli",
        cache_dir=model_args.cache_dir,
    )
    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = MBartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    columns = ['input_ids', 'attention_mask', 'labels']
    map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length,
                             tokenizer)

    train_dataset = nlp.load_dataset("multi_nli", split="train")
    train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512)
    train_dataset.set_format(type='torch', columns=columns)

    eval_dataset = (nlp.load_dataset("multi_nli", split="validation_matched")
                    if training_args.do_eval else None)
    if eval_dataset is not None:
        eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512)
        eval_dataset.set_format(type='torch', columns=columns)

    def compute_metrics_fn(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics("classification", preds, p.label_ids)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
        data_collator=DataCollator(tokenizer),
    )

    # disable wandb console logs
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        mis_matched_dataset = nlp.load_dataset("multi_nli",
                                               split="validation_mismatched")
        mis_matched_dataset = mis_matched_dataset.map(map_fn,
                                                      batched=True,
                                                      batch_size=512)
        mis_matched_dataset.set_format(type='torch', columns=columns)
        eval_datasets = [("validation_matched", eval_dataset),
                         ("validation_mismatched", mis_matched_dataset)]

        for eval_name, eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            # Write one results file per evaluation split so the matched
            # results are not overwritten by the mismatched ones.
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"eval_results_{eval_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)
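
# Since main() accepts an args_dict, the script can also be driven
# programmatically through the parse_dict branch instead of the command line.
# A hedged sketch of such an invocation follows; the model name, paths and
# hyperparameters are illustrative, not values from the original.
if __name__ == "__main__":
    main(args_dict={
        # split across ModelArguments, DataTrainingArguments, TrainingArguments
        "model_name_or_path": "facebook/mbart-large-cc25",
        "lang": "en_XX",
        "max_seq_length": 128,
        "output_dir": "./mbart_mnli",
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 8,
        "num_train_epochs": 3,
        "learning_rate": 3e-5,
    })
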
Example #17
0
    def __init__(self, model_type, model_name_or_path, output_dir, cache_dir,
                 data_dir, train_file_path, predict_file_path, aug_file_path,
                 do_aug, do_alum, alpha, eps, eta, sigma, do_train,
                 do_adv_eval, do_eval, per_device_train_batch_size,
                 per_device_eval_batch_size, gradient_accumulation_steps,
                 eval_all_checkpoints, num_train_epochs, max_steps, save_steps,
                 seed, fp16):

        args = {
            "model_type": model_type,
            "model_name_or_path": model_name_or_path,
            "output_dir": output_dir,
            "cache_dir": cache_dir,
            "data_dir": data_dir,
            "train_file_path": train_file_path,
            "predict_file_path": predict_file_path,
            "aug_file_path": aug_file_path,
            "do_aug": do_aug,
            "do_alum": do_alum,
            "alpha": alpha,
            "eps": eps,
            "eta": eta,
            "sigma": sigma,
            "do_train": do_train,
            "do_adv_eval": do_adv_eval,
            "do_eval": do_eval,
            "per_device_train_batch_size": per_device_train_batch_size,
            "per_device_eval_batch_size": per_device_eval_batch_size,
            "gradient_accumulation_steps": gradient_accumulation_steps,
            "eval_all_checkpoints": eval_all_checkpoints,
            "num_train_epochs": num_train_epochs,
            "max_steps": max_steps,
            "save_steps": save_steps,
            "seed": seed,
            "fp16": fp16,
        }
        parser = HfArgumentParser(
            dataclass_types=[ModelArguments, TrainingArguments])
        self.model_args, self.training_args = parser.parse_dict(args)

        # Load model and tokenizer
        config, self.model_cls, tokenizer_cls = MODEL_CLASSES[
            self.model_args.model_type]
        self.tokenizer = tokenizer_cls.from_pretrained(
            self.model_args.tokenizer_name_or_path
            if self.model_args.tokenizer_name_or_path else
            self.model_args.model_name_or_path,
            cache_dir=self.model_args.cache_dir,
        )
        model = self.model_cls.from_pretrained(
            self.model_args.model_name_or_path,
            cache_dir=self.model_args.cache_dir,
        )

        # Load training dataset
        if self.training_args.do_train:
            train_dataset = load_and_cache_examples(self.model_args,
                                                    self.tokenizer)
        else:
            train_dataset = None

        # Initialize the Trainer
        self.trainer = Trainer(
            model_args=self.model_args,
            data_collator=None,
            model=model,
            tokenizer=self.tokenizer,
            args=self.training_args,
            train_dataset=train_dataset,
            prediction_loss_only=True,
        )
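
# For context, a hedged sketch of how this constructor might be called. The
# enclosing class is not named in the excerpt, so AlumQATrainer below is only
# a placeholder, and every path and hyperparameter is illustrative.
wrapper = AlumQATrainer(  # hypothetical class name
    model_type="bert",
    model_name_or_path="bert-base-uncased",
    output_dir="./outputs",
    cache_dir="./cache",
    data_dir="./data",
    train_file_path="./data/train.json",
    predict_file_path="./data/dev.json",
    aug_file_path="./data/aug.json",
    do_aug=False,
    do_alum=False,
    alpha=1.0,
    eps=1e-5,
    eta=1e-3,
    sigma=1e-5,
    do_train=True,
    do_adv_eval=False,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    eval_all_checkpoints=False,
    num_train_epochs=2,
    max_steps=-1,
    save_steps=500,
    seed=42,
    fp16=False,
)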