Code Example #1
def test_huggingface_model_file(tmp_dir, model, args, data, tokenizer):
    dvclive.init("logs")
    model_path = tmp_dir / "model_hf"

    trainer = Trainer(
        model,
        args,
        train_dataset=data[0],
        eval_dataset=data[1],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.add_callback(DvcLiveCallback(model_file=model_path))
    trainer.train()

    assert model_path.is_dir()
    assert (model_path / "pytorch_model.bin").exists()
    assert (model_path / "config.json").exists()
Code Example #2
def test_huggingface_integration(tmp_dir, model, args, data, tokenizer):
    trainer = Trainer(
        model,
        args,
        train_dataset=data[0],
        eval_dataset=data[1],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.add_callback(DvcLiveCallback())
    trainer.train()

    assert os.path.exists("dvclive")

    logs, _ = read_logs(tmp_dir / "dvclive" / Scalar.subfolder)

    assert len(logs) == 10
    assert "eval_matthews_correlation" in logs
    assert "eval_loss" in logs
    assert len(logs["epoch"]) == 3
    assert len(logs["eval_loss"]) == 2
Code Example #3
def test_huggingface_model_file(tmp_dir, model, args, data, tokenizer, mocker):
    model_path = tmp_dir / "model_hf"
    model_save = mocker.spy(model, "save_pretrained")
    tokenizer_save = mocker.spy(tokenizer, "save_pretrained")
    trainer = Trainer(
        model,
        args,
        train_dataset=data[0],
        eval_dataset=data[1],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.add_callback(DvcLiveCallback(model_file=model_path))
    trainer.train()

    assert model_path.is_dir()

    assert (model_path / "pytorch_model.bin").exists()
    assert (model_path / "config.json").exists()
    assert model_save.call_count == 2

    assert (model_path / "tokenizer.json").exists()
    assert tokenizer_save.call_count == 2
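The spies above come from pytest-mock: mocker.spy wraps a real method so the call still runs while call counts are recorded. A minimal, self-contained illustration of that behavior (the Greeter class is just a placeholder):

def test_spy_counts_calls(mocker):
    class Greeter:
        def greet(self, name):
            return f"hello {name}"

    g = Greeter()
    spy = mocker.spy(g, "greet")

    assert g.greet("world") == "hello world"  # the original method still executes
    assert spy.call_count == 1                # the spy records every call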
Code Example #4
def train(data_args, last_checkpoint, model, model_args, tokenized_datasets,
          tokenizer, training_args):
    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
    tb_logger = pl_loggers.TensorBoardLogger('experiments/dilbert/logs/')
    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=tokenized_datasets["train"]
                      if training_args.do_train else None,
                      eval_dataset=tokenized_datasets["validation"]
                      if training_args.do_eval else None,
                      tokenizer=tokenizer,
                      data_collator=data_collator)
    writer = SummaryWriter('experiments/dilbert/logs')
    trainer.add_callback(TensorBoardCallback(tb_writer=writer))
    trainer.add_callback(GPUMemoryPrinterCallback())
    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(
                model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))
    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in sorted(results.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")
    return results
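GPUMemoryPrinterCallback is referenced above but not defined in this snippet. One plausible shape for it (an assumption, not the project's actual code) is a TrainerCallback that prints CUDA memory usage at the end of each epoch:

import torch
from transformers import TrainerCallback

class GPUMemoryPrinterCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        if torch.cuda.is_available():
            allocated_mib = torch.cuda.memory_allocated() / 1024 ** 2
            peak_mib = torch.cuda.max_memory_allocated() / 1024 ** 2
            print(f"[epoch {state.epoch}] GPU memory: "
                  f"{allocated_mib:.0f} MiB allocated, {peak_mib:.0f} MiB peak")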
Code Example #5
    def train(self, inoculation_train_df, eval_df, model_path, training_args, max_length=128,
              inoculation_patience_count=5, pd_format=True,
              scramble_proportion=0.0, eval_with_scramble=False):

        if pd_format:
            datasets = {}
            datasets["train"] = Dataset.from_pandas(inoculation_train_df)
            datasets["validation"] = Dataset.from_pandas(eval_df)
        else:
            datasets = {}
            datasets["train"] = inoculation_train_df
            datasets["validation"] = eval_df
        logger.info(f"***** Train Sample Count (Verify): %s *****"%(len(datasets["train"])))
        logger.info(f"***** Valid Sample Count (Verify): %s *****"%(len(datasets["validation"])))
    
        label_list = datasets["validation"].unique("label")
        label_list.sort()  # Let's sort it for determinism

        sentence1_key, sentence2_key = self.task_config
        
        # we will scramble out input sentence here
        # TODO: we scramble both train and eval sets
        if self.task_name == "sst3" or self.task_name == "cola":
            def scramble_inputs(proportion, example):
                original_text = example[sentence1_key]
                original_sentence = basic_tokenizer.tokenize(original_text)
                max_length = len(original_sentence)
                scramble_length = int(max_length*proportion)
                scramble_start = random.randint(0, len(original_sentence)-scramble_length)
                scramble_end = scramble_start + scramble_length
                scramble_sentence = original_sentence[scramble_start:scramble_end]
                random.shuffle(scramble_sentence)
                scramble_text = original_sentence[:scramble_start] + scramble_sentence + original_sentence[scramble_end:]

                out_string = " ".join(scramble_text).replace(" ##", "").strip()
                example[sentence1_key] = out_string
                return example
        elif self.task_name == "snli" or             self.task_name == "mrpc" or             self.task_name == "qnli":
            def scramble_inputs(proportion, example):
                original_premise = example[sentence1_key]
                original_hypothesis = example[sentence2_key]
                if original_hypothesis is None:
                    original_hypothesis = ""
                try:
                    original_premise_tokens = basic_tokenizer.tokenize(original_premise)
                    original_hypothesis_tokens = basic_tokenizer.tokenize(original_hypothesis)
                except:
                    print("Please debug these sequence...")
                    print(original_premise)
                    print(original_hypothesis)

                max_length = len(original_premise_tokens)
                scramble_length = int(max_length*proportion)
                scramble_start = random.randint(0, max_length-scramble_length)
                scramble_end = scramble_start + scramble_length
                scramble_sentence = original_premise_tokens[scramble_start:scramble_end]
                random.shuffle(scramble_sentence)
                scramble_text_premise = original_premise_tokens[:scramble_start] + scramble_sentence + original_premise_tokens[scramble_end:]

                max_length = len(original_hypothesis_tokens)
                scramble_length = int(max_length*proportion)
                scramble_start = random.randint(0, max_length-scramble_length)
                scramble_end = scramble_start + scramble_length
                scramble_sentence = original_hypothesis_tokens[scramble_start:scramble_end]
                random.shuffle(scramble_sentence)
                scramble_text_hypothesis = original_hypothesis_tokens[:scramble_start] + scramble_sentence + original_hypothesis_tokens[scramble_end:]

                out_string_premise = " ".join(scramble_text_premise).replace(" ##", "").strip()
                out_string_hypothesis = " ".join(scramble_text_hypothesis).replace(" ##", "").strip()
                example[sentence1_key] = out_string_premise
                example[sentence2_key] = out_string_hypothesis
                return example
        
        if scramble_proportion > 0.0:
            logger.info(f"You are scrambling the inputs to test syntactic feature importance!")
            datasets["train"] = datasets["train"].map(partial(scramble_inputs, scramble_proportion))
            if eval_with_scramble:
                logger.info(f"You are scrambling the evaluation data as well!")
                datasets["validation"] = datasets["validation"].map(partial(scramble_inputs, scramble_proportion))
        
        padding = "max_length"
        sentence1_key, sentence2_key = self.task_config
        label_to_id = None
        def preprocess_function(examples):
            # Tokenize the texts
            args = (
                (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
            )
            result = self.tokenizer(*args, padding=padding, max_length=max_length, truncation=True)
            # Map labels to IDs (not necessary for GLUE tasks)
            if label_to_id is not None and "label" in examples:
                result["label"] = [label_to_id[l] for l in examples["label"]]
            return result
        datasets["train"] = datasets["train"].map(preprocess_function, batched=True)
        datasets["validation"] = datasets["validation"].map(preprocess_function, batched=True)
        
        train_dataset = datasets["train"]
        eval_dataset = datasets["validation"]
        
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
            
        metric = load_metric("glue", "sst2") # any glue task will do the job, just for eval loss
        
        def asenti_compute_metrics(p: EvalPrediction):
            preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
            preds = np.argmax(preds, axis=1)
            result_to_print = classification_report(p.label_ids, preds, digits=5, output_dict=True)
            print(classification_report(p.label_ids, preds, digits=5))
            mcc_scores = matthews_corrcoef(p.label_ids, preds)
            logger.info(f"MCC scores: {mcc_scores}.")
            result_to_return = metric.compute(predictions=preds, references=p.label_ids)
            result_to_return["Macro-F1"] = result_to_print["macro avg"]["f1-score"]
            result_to_return["MCC"] = mcc_scores
            return result_to_return

        # Initialize our Trainer. We are only interested in evaluations.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=asenti_compute_metrics,
            tokenizer=self.tokenizer,
            # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
            data_collator=default_data_collator
        )
        # Early stop
        if inoculation_patience_count != -1:
            trainer.add_callback(EarlyStoppingCallback(inoculation_patience_count))
        
        # Training
        if training_args.do_train:
            logger.info("*** Training our model ***")
            trainer.train(
                # we don't need this now.
                # model_path=model_path
            )
            trainer.save_model()  # Saves the tokenizer too for easy upload
        
        # Evaluation
        eval_results = {}
        if training_args.do_eval:
            logger.info("*** Evaluate ***")
            tasks = [self.task_name]
            eval_datasets = [eval_dataset]
            for eval_dataset, task in zip(eval_datasets, tasks):
                eval_result = trainer.evaluate(eval_dataset=eval_dataset)
                output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt")
                if trainer.is_world_process_zero():
                    with open(output_eval_file, "w") as writer:
                        logger.info(f"***** Eval results {task} *****")
                        for key, value in eval_result.items():
                            logger.info(f"  {key} = {value}")
                            writer.write(f"{key} = {value}\n")
                eval_results.update(eval_result)
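Note that EarlyStoppingCallback, as used above, only takes effect when the TrainingArguments evaluate periodically, track a best-model metric, and reload the best model at the end of training. A minimal sketch of such a configuration (the concrete values are assumptions):

from transformers import TrainingArguments, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="out",                  # placeholder path
    evaluation_strategy="epoch",       # evaluate every epoch so patience can count down
    save_strategy="epoch",
    load_best_model_at_end=True,       # required for early stopping
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)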
Code Example #6
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics_pos
        if args.task_name == "en_ewt" else compute_metrics_ner,
    )

    # Early stop
    if args.inoculation_patience_count != -1:
        trainer.add_callback(
            EarlyStoppingCallback(args.inoculation_patience_count))

    # Training
    if training_args.do_train:
        checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics["train_samples"] = len(train_dataset)

        # trainer.log_metrics("train", metrics)
        # trainer.save_metrics("train", metrics)
        # trainer.save_state()

    # Evaluation
Code Example #7
File: run_clm.py  Project: savvihub/transformers
def main():
    if is_wandb_available():
        import wandb

    parser = HfArgumentParser(
        (ModelArguments, DatasetArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        datasets = load_from_disk(data_args.dataset_path)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForCausalLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=None,
        remove_columns=column_names,
        load_from_cache_file=True,
    )

    block_size = tokenizer.model_max_length
    if block_size > 1024:
        logger.warn(
            f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
            "Picking 1024 instead. You can change that default value by passing --block_size xxx."
        )
        block_size = 1024

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {
            k: sum(examples[k], [])
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k:
            [t[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=None,
        load_from_cache_file=True,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"] if training_args.do_train else None,
        eval_dataset=lm_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
    )

    # Add Callbacks
    savvi_callback = SavviCallback()
    trainer.add_callback(savvi_callback)

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path if
                      (model_args.model_name_or_path is not None
                       and os.path.isdir(model_args.model_name_or_path)) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_clm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    savvihub.log(step=0, row={'perplexity': results["perplexity"]})

    return results
Code Example #8
def evaluate_model(cache_dict, layer_id, finally_pruned_layers, config,
                   model_args, data_args, training_args, train_dataset,
                   eval_dataset, compute_metrics, tokenizer, data_collator,
                   datasets):

    # Set seed before initializing model.
    set_seed(training_args.seed)

    if (layer_id in cache_dict):
        print("Layer %d from cache: %.4f" % (layer_id, cache_dict[layer_id]))
        return cache_dict[layer_id]

    print(f"Calculate layer {str(layer_id)}")
    model = create_model(config, model_args)

    model.prune_layers(finally_pruned_layers)
    if isinstance(layer_id, int):
        model.prune_layers([layer_id])

    # for param in model.base_model.parameters():
    #     param.requires_grad = False

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.add_callback(DisableCheckpointCallbackHandler())

    # Training
    train_result = trainer.train(resume_from_checkpoint=None)
    metrics = train_result.metrics

    # Evaluation
    eval_results = {}
    logger.info("*** Evaluate ***")

    # Loop to handle MNLI double evaluation (matched, mis-matched)
    tasks = [data_args.task_name]
    eval_datasets = [eval_dataset]
    if data_args.task_name == "mnli":
        tasks.append("mnli-mm")
        eval_mismatch = datasets["validation_mismatched"]
        eval_datasets.append(eval_mismatch)

    for eval_dataset, task in zip(eval_datasets, tasks):
        eval_result = trainer.evaluate(eval_dataset=eval_dataset)
        eval_results.update(eval_result)

    # res = eval_results.get("eval_loss", None)
    res = None
    res = res or eval_results.get("eval_f1", None)
    res = res or eval_results.get("eval_accuracy", None)
    res = res or eval_results.get("eval_spearmanr", None)
    res = res or eval_results.get("eval_matthews_correlation", None)

    if res is None:
        raise Exception("No performance metric found!")

    res = round(res, 3)

    cache_dict[layer_id] = res
    return cache_dict
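DisableCheckpointCallbackHandler is not shown in the snippet. A plausible stand-in (an assumption, not the project's code) clears the should_save flag after the default flow callback has set it, so no checkpoints are written during these throwaway pruning runs:

from transformers import TrainerCallback

class DisableCheckpointCallbackHandler(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        control.should_save = False   # veto step-based checkpointing
        return control

    def on_epoch_end(self, args, state, control, **kwargs):
        control.should_save = False   # veto epoch-based checkpointing
        return control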
Code Example #9
def main():
    args = get_args()
    set_seed(args.seed)

    dataset = load_dataset("codeparrot/codecomplex", split="train")
    train_test = dataset.train_test_split(test_size=0.2)
    test_validation = train_test["test"].train_test_split(test_size=0.5)
    train_test_validation = DatasetDict({
        "train": train_test["train"],
        "test": test_validation["train"],
        "valid": test_validation["test"],
    })

    print("Loading tokenizer and model")
    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt,
                                                               num_labels=7)
    model.config.pad_token_id = model.config.eos_token_id

    if args.freeze:
        for param in model.roberta.parameters():
            param.requires_grad = False

    labels = ClassLabel(num_classes=7,
                        names=list(
                            set(train_test_validation["train"]["complexity"])))

    def tokenize(example):
        inputs = tokenizer(example["src"], truncation=True, max_length=1024)
        label = labels.str2int(example["complexity"])
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "label": label,
        }

    tokenized_datasets = train_test_validation.map(
        tokenize,
        batched=True,
        remove_columns=train_test_validation["train"].column_names,
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.num_epochs,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        weight_decay=0.01,
        metric_for_best_model="accuracy",
        run_name="complexity-java",
        report_to="wandb",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("Training...")
    trainer.add_callback(CustomCallback(trainer))
    trainer.train()
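CustomCallback is not defined in this snippet either. A common pattern that fits the call site (passing the trainer itself) is a callback that runs an extra evaluation pass over the training split after each epoch; this sketch is an assumption, not the script's actual implementation:

from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    def __init__(self, trainer):
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        # Report metrics on the training split with a "train_" prefix so they
        # do not collide with the regular eval_* metrics.
        self._trainer.evaluate(eval_dataset=self._trainer.train_dataset,
                               metric_key_prefix="train")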
Code Example #10
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    #training_args.output_dir = f"{training_args.output_dir}/{data_args.task_name}/{model_args.model_name_or_path}/{model_args.prune_method}/{str(model_args.prune_n_layers)}/{str(training_args.seed)}"
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
    # label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset("glue", data_args.task_name)
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {
            "train": data_args.train_file,
            "validation": data_args.validation_file
        }

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError(
                    "Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            datasets = load_dataset("csv", data_files=data_files)
        else:
            # Loading a dataset from local json files
            datasets = load_dataset("json", data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = create_model(config, model_args)

    #
    # Prune model before training starts
    #
    if (model_args.prune_method == "prune-greedy"):
        current_path = pathlib.Path(__file__).parent.absolute()
        layer_file_path = os.path.join(
            current_path, "layer_files/",
            f"{model.name_or_path}_{data_args.task_name}_greedy.txt")
        with open(layer_file_path, 'r') as f:
            layers_to_prune = f.readlines()
        layers_to_prune = layers_to_prune[:model_args.prune_n_layers]
        layers_to_prune = [int(l.replace("\n", "")) for l in layers_to_prune]
        print(f"Pruned {str(layers_to_prune)}")
        model.prune_layers(layers_to_prune)

    elif (model_args.prune_method == "top-layers"):
        print(
            f"# Prune {model_args.prune_n_layers} layers with {model_args.prune_method}"
        )
        first_layer_to_prune = config.num_hidden_layers - model_args.prune_n_layers
        model.prune_layers(
            [i for i in range(first_layer_to_prune, config.num_hidden_layers)])

        # # Measure number of parameters
        # It really depends how pruning is implemented - if its deleted from the layers
        # module list or if the layer is simply skipped (then torch still measures those values)
        # if hasattr(model.base_model, "encoder"):
        #     base_class = model.base_model.encoder
        # else:
        #     base_class = model.base_model

        # layers = base_class.layer
        # layers = [l for (i, l) in enumerate(layers) if i not in model.get_pruned_layers()]
        # layers = nn.ModuleList(layers)
        # setattr(base_class, "layer", layers)

    # Print number of parameters
    num_params = sum(p.numel() for p in model.parameters())
    print("NUM Paramerers: %d" % num_params)

    # Preprocessing the datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and data_args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: label_name_to_id[label_list[i]]
                for i in range(num_labels)
            }
        else:
            logger.warn(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result."
            )
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warn(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [label_to_id[l] for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    train_dataset = datasets["train"]
    eval_dataset = datasets["validation_matched" if data_args.task_name ==
                            "mnli" else "validation"]
    if data_args.task_name is not None or data_args.test_file is not None:
        test_dataset = datasets["test_matched" if data_args.task_name ==
                                "mnli" else "test"]

    # Get the metric function
    if data_args.task_name is not None:
        metric = load_metric("glue", data_args.task_name)
    # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from
    # compute_metrics

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(
                    result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids)**2).mean().item()}
        else:
            return {
                "accuracy":
                (preds == p.label_ids).astype(np.float32).mean().item()
            }

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # trainer.add_callback(
    #     PruneCallbackHandler(
    #         model_args.prune_method,
    #         model_args.prune_n_layers,
    #         data_args.task_name)
    # )
    trainer.add_callback(DisableCheckpointCallbackHandler())

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics

        #trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key}: {value}")
                    writer.write(f"{key}: {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):

            # eval_result = trainer.evaluate(eval_dataset=eval_dataset)
            start = time.time()
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)
            print(f"TIMING: {(time.time() - start)} ")
            exit()
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"eval_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_eval_file, "w") as writer:
                    logger.info(f"***** Eval results {task} *****")
                    for key, value in sorted(eval_result.items()):
                        logger.info(f"  {key}: {value}")
                        writer.write(f"{key}: {value}\n")

            eval_results.update(eval_result)

    if training_args.do_predict:
        logger.info("*** Test ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(datasets["test_mismatched"])

        for test_dataset, task in zip(test_datasets, tasks):
            # Removing the `label` columns because it contains -1 and Trainer won't like that.
            test_dataset.remove_columns_("label")
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            predictions = np.squeeze(
                predictions) if is_regression else np.argmax(predictions,
                                                             axis=1)

            output_test_file = os.path.join(training_args.output_dir,
                                            f"test_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info(f"***** Test results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")
    return eval_results
Code Example #11
        metric_for_best_model=args.criterion,
        load_best_model_at_end=True)

    collator = get_collator(tokenizer)
    es_callback = EarlyStoppingCallback(early_stopping_patience=5)

    print(f"- Training args: {training_args}")
    trainer = Trainer(model,
                      args=training_args,
                      train_dataset=train_ds,
                      eval_dataset=test_ds,
                      compute_metrics=compute_metrics,
                      optimizers=(optimizer, scheduler),
                      data_collator=collator)

    trainer.add_callback(es_callback)

    trainer.train()

    print(f"- Label encoder mapping:")
    for i, label in enumerate(label_encoder.classes_):
        print(f"\t{i}: {label}")

    if args.save_test_preds:
        print(f"- Saving predictions to {args.save_test_preds}")

        preds = trainer.predict(test_ds)
        y_pred = np.argmax(preds[0], axis=1)

        test_df = pd.read_csv(args.test_file, index_col=0).dropna()
        test_df["prediction"] = label_encoder.inverse_transform(y_pred)