Code example #1
    def uni_train(self,
                  data_dir: str,
                  train_dataset: Dataset,
                  eval_dataset: Optional[Dataset] = None,
                  compute_metrics_fn: Optional[Callable[[EvalPrediction],
                                                        Dict]] = None,
                  seed: Optional[int] = None) -> PredictionOutput:
        r"""
        Unified training module.
        """
        if seed is None:
            seed = random.randint(0, 2020)
        set_seed(seed)

        # Build the training arguments
        training_args = self.init_training_args(self.model_path)

        # Initialize our Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics_fn,
        )
        trainer.train(self.model_path)
        trainer.save_model()
        self.tokenizer.save_pretrained(self.model_path)
        self.trainer = trainer

        # Evaluation
        logger.info("*** Evaluate ***")
        trainer.compute_metrics = compute_metrics_fn
        eval_result = trainer.predict(test_dataset=eval_dataset)
        metrics = eval_result.metrics
        output_eval_file = os.path.join(self.model_path,
                                        f"eval_results_{self.task_name}.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(self.task_name))
            for key, value in metrics.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

        return eval_result
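
A minimal usage sketch for `uni_train` (hypothetical: the enclosing class is not shown here, so the instance name `runner` and the metric function below are assumptions, not part of the original source):

import numpy as np
from transformers import EvalPrediction


def accuracy_metrics(p: EvalPrediction) -> dict:
    # Accuracy over argmax class predictions.
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": float((preds == p.label_ids).mean())}


# `runner` stands for an instance of the (unshown) class that defines uni_train;
# its model, tokenizer, model_path and task_name are assumed to be initialized.
# eval_result = runner.uni_train(
#     data_dir="./data",
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     compute_metrics_fn=accuracy_metrics,
#     seed=42,
# )
# print(eval_result.metrics)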
Code example #2
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (GlueDataset(
        data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="dev",
                                cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)
    test_dataset = (GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="test",
                                cache_dir=model_args.cache_dir)
                    if training_args.do_predict else None)

    def build_compute_metrics_fn(
            task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            eval_datasets.append(
                GlueDataset(mnli_mm_data_args,
                            tokenizer=tokenizer,
                            mode="dev",
                            cache_dir=model_args.cache_dir))

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            test_datasets.append(
                GlueDataset(mnli_mm_data_args,
                            tokenizer=tokenizer,
                            mode="test",
                            cache_dir=model_args.cache_dir))

        for test_dataset in test_datasets:
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(
                        test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))
    return eval_results
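
The `parse_json_file` branch above can be driven by a single flat JSON file whose keys match the fields of `ModelArguments`, `DataTrainingArguments`, and `TrainingArguments`. A hedged sketch (the script name and all values are illustrative):

import json

# Illustrative configuration; keys correspond to dataclass fields used above.
config = {
    "model_name_or_path": "bert-base-uncased",
    "task_name": "mrpc",
    "data_dir": "./glue_data/MRPC",
    "output_dir": "./output/mrpc",
    "do_train": True,
    "do_eval": True,
    "max_seq_length": 128,
    "per_device_train_batch_size": 32,
    "learning_rate": 2e-5,
    "num_train_epochs": 3.0,
    "overwrite_output_dir": True,
}
with open("run_glue_config.json", "w") as f:
    json.dump(config, f, indent=2)

# Then invoke the script with the JSON path as its only argument, e.g.:
#   python run_glue.py run_glue_config.json
# sys.argv then has length 2 and ends with ".json", so main() takes the
# parse_json_file branch.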
Code example #3
def twitter_bert(
        ROOTPATH=ROOTPATH,
        model_name_or_path="bert-base-uncased",
        task_name="TWIT",
        do_train=True,
        do_eval=True,
        data_dir=f'{ROOTPATH}/input',
        max_seq_length=128,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        num_train_epochs=3.0,
        cache_dir=None,
        output_dir=f'{ROOTPATH}/output',
        overwrite_cache=True,
        overwrite_output_dir=True,
        local_rank=-1,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        n_gpu=torch.cuda.device_count(),
        fp16=False,
        num_labels=2,
        evaluate_during_training=False,
        weight_decay=0,
        adam_epsilon=1e-8,
        max_grad_norm=1.0,
        train_dataset=None,
        dev_dataset=None,
        test_dataset=None,
        full_dataset=None,
        labels=None,
        temp_json=f'{ROOTPATH}/temp/run{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.json',
        use_test=False,
        save_steps=1e200,
        random_state=1234):

    set_seed(random_state)
    if full_dataset is not None:
        train_dataset, dev_dataset = train_test_split(
            full_dataset, test_size=0.2, random_state=random_state)

    # Setup logging
    logger = logging.getLogger(__name__)

    logger.info(f"LENGTH OF TRAIN DATASET: {len(train_dataset.index)}")
    # exit(0)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        local_rank,
        device,
        n_gpu,
        bool(local_rank != -1),
        fp16,
    )

    logger.info(
        "Training/evaluation parameters local_rank: %s, device: %s, n_gpu: %s, fp16: %s",
        local_rank, device, n_gpu, fp16)
    logger.info(f"MAX SEQ LEN: {max_seq_length}")

    wordsegment.load()

    ## DEFINE FUNCTIONS
    @dataclass
    class ModelArguments:
        """
        Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
        """

        model_name_or_path: str = field(
            metadata={
                "help":
                "Path to pretrained model or model identifier from huggingface.co/models"
            })
        config_name: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Pretrained config name or path if not the same as model_name"
            })
        tokenizer_name: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Pretrained tokenizer name or path if not the same as model_name"
            })
        cache_dir: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Where do you want to store the pretrained models downloaded from s3"
            })

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        learning_rate=learning_rate,
        overwrite_output_dir=overwrite_output_dir,
        evaluate_during_training=evaluate_during_training,
        weight_decay=weight_decay,
        adam_epsilon=adam_epsilon,
        max_grad_norm=max_grad_norm,
        save_steps=save_steps)

    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=data_dir,
                                      max_seq_length=max_seq_length,
                                      overwrite_cache=overwrite_cache)

    model_args = ModelArguments(model_name_or_path=model_name_or_path, )

    def simple_accuracy(preds, labels):
        return (preds == labels).mean()

    def acc_and_f1(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1 = f1_score(y_true=labels, y_pred=preds)
        return {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
        }

    def pearson_and_spearman(preds, labels):
        pearson_corr = pearsonr(preds, labels)[0]
        spearman_corr = spearmanr(preds, labels)[0]
        return {
            "pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2,
        }

    def compute_metrics(preds, labels):
        assert len(preds) == len(labels)
        return acc_and_f1(preds, labels)

    class TwitterProcessor(DataProcessor):
        def __init__(self):

            super(TwitterProcessor, self).__init__()
            '''
            You need to define four variables here:
            - self.train_dataset -> train dataset
            - self.dev_dataset -> dev dataset
            - self.test_dataset -> test dataset
            - self.labels -> a list of the labels

            Each {train,dev,test}_dataset must have (at least) two columns:
            - "tweet" -> includes the text of the tweet
            - "label" -> includes the label of the tweet
            '''

            self.train_dataset = train_dataset
            self.dev_dataset = dev_dataset
            self.test_dataset = test_dataset
            self.labels = labels

        def get_train_examples(self):
            return self._create_examples(self.train_dataset, "train")

        def get_dev_examples(self):
            return self._create_examples(self.dev_dataset, "dev")

        def get_test_examples(self):
            return self._create_examples(self.test_dataset, "test")

        def get_labels(self):
            """See base class."""
            return self.labels

        def _preprocess_text(self, text):
            # 1
            text = emoji.demojize(text)

            # 2
            words = text.split()
            for word in words:
                if word[0] != '#':
                    continue
                hashtag = word[1:]
                replacement_words = wordsegment.segment(hashtag)
                text = text.replace(word, " ".join(replacement_words))

            # 3
            text = text.replace("URL", "http")

            # 4
            text = re.sub(r'(@[A-Za-z]+)( \1\b){3,}', r'\1 \1 \1', text)
            return text

        def _create_examples(self, data, set_type):

            examples = []

            raw_texts = data.tweet.values.tolist()
            raw_labels = data.label.values.tolist()

            for i in range(0, len(raw_texts)):
                guid = "%s-%s" % (set_type, i)
                raw_text = raw_texts[i]
                raw_label = raw_labels[i]
                label = raw_label

                text = self._preprocess_text(raw_text)
                examples.append(
                    InputExample(guid=guid,
                                 text_a=text,
                                 text_b=None,
                                 label=label))

            return examples

    def convert_examples_to_features(
        examples: List[InputExample],
        tokenizer: PreTrainedTokenizer,
        max_length: Optional[int] = None,
        task=None,
        label_list=None,
        output_mode=None,
    ):
        if max_length is None:
            max_length = tokenizer.max_len

        processor = TwitterProcessor()
        label_list = processor.get_labels()

        label_map = {label: i for i, label in enumerate(label_list)}

        def label_from_example(
                example: InputExample) -> Union[int, float, None]:
            return label_map[example.label]

        labels = [label_from_example(example) for example in examples]

        batch_encoding = tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b) for example in examples],
            max_length=max_length,
            pad_to_max_length=True,
        )

        features = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}

            feature = InputFeatures(**inputs, label=labels[i])
            features.append(feature)

        for i, example in enumerate(examples[:5]):
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("features: %s" % features[i])

        return features

    class TwitterDataset(Dataset):
        """
        This will be superseded by a framework-agnostic approach
        soon.
        """
        def __init__(
            self,
            tokenizer,
            mode="train",
            cache_dir=cache_dir,
            args=data_args,
        ):
            self.args = args
            self.processor = TwitterProcessor()
            self.output_mode = 'classification'

            label_list = self.processor.get_labels()
            self.label_list = label_list

            if mode == "dev":
                examples = self.processor.get_dev_examples()
            elif mode == "test":
                examples = self.processor.get_test_examples()
            else:
                examples = self.processor.get_train_examples()

            self.features = convert_examples_to_features(
                examples,
                tokenizer,
                max_length=max_seq_length,
                label_list=label_list,
                output_mode=self.output_mode,
            )

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]

        def get_labels(self):
            return self.label_list

    def build_compute_metrics_fn(
            task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            preds = np.argmax(p.predictions, axis=1)
            return compute_metrics(preds, p.label_ids)

        return compute_metrics_fn

    # Create model
    config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        cache_dir=cache_dir,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        config=config,
        cache_dir=cache_dir,
    )

    # Get datasets
    train_dataset = (TwitterDataset(tokenizer=tokenizer,
                                    mode="train",
                                    cache_dir=cache_dir))
    eval_dataset = (TwitterDataset(tokenizer=tokenizer,
                                   mode="dev",
                                   cache_dir=cache_dir))

    if use_test:
        test_dataset = (TwitterDataset(tokenizer=tokenizer,
                                       mode="test",
                                       cache_dir=cache_dir))

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(task_name),
    )

    # Train the model
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model(f"{training_args.output_dir}/{task_name}")
        tokenizer.save_pretrained(f"{training_args.output_dir}/{task_name}")

    # Evaluation
    eval_results = []
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        if use_test:
            step_names = ["dev", "test"]
            eval_datasets = [eval_dataset, test_dataset]
        else:
            step_names = ["dev"]
            eval_datasets = [eval_dataset]

        ct = 0
        for eval_dataset in eval_datasets:

            step_name = step_names[ct]

            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            logger.info("***** Eval results {} - {}*****".format(
                eval_dataset.args.task_name, step_name.upper()))
            for key, value in eval_result.items():
                logger.info("  %s = %s", key, value)

            # output_eval_file = os.path.join(
            #     training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}_{step_name}.txt"
            # )

            # if ct == 0:
            #     with open(output_eval_file, "w") as writer:
            #         logger.info("***** Eval results {} - {}*****".format(eval_dataset.args.task_name, step_name.upper()))
            #         for key, value in eval_result.items():
            #             logger.info("  %s = %s", key, value)
            # else:
            #     with open(output_eval_file, "a") as writer:
            #         logger.info("***** Eval results {} - {}*****".format(eval_dataset.args.task_name, step_name.upper()))
            #         for key, value in eval_result.items():
            #             logger.info("  %s = %s", key, value)

            eval_results.append(eval_result)

            write_type = 'a' if os.path.exists(temp_json) else 'w'
            with open(temp_json, write_type) as f:
                f.write(json.dumps(eval_result))
                f.write("\n")

            ct += 1

    return eval_results[-1]
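
A hedged usage sketch for `twitter_bert`: it expects pandas DataFrames with at least `tweet` and `label` columns (per the `TwitterProcessor` docstring); the file name and values below are illustrative:

import pandas as pd

df = pd.read_csv("tweets.csv")  # must contain "tweet" and "label" columns

result = twitter_bert(
    full_dataset=df,                               # split 80/20 into train/dev internally
    labels=sorted(df.label.unique().tolist()),
    num_labels=df.label.nunique(),
    num_train_epochs=1.0,
    random_state=42,
)
print(result)  # eval metrics dict for the last evaluated dataset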
Code example #4
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    output_mode = "regression"

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    # tokenizer: used for tokenization and other data preprocessing
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    train_dataset = RecDataset(data_args,
                               tokenizer=tokenizer,
                               cache_dir=model_args.cache_dir)
    # num_labels = len(train_dataset.get_labels())

    # config holds the model's basic parameter settings
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )

    # Load the model
    # model = AutoModelForSequenceClassification.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=bool(".ckpt" in model_args.model_name_or_path),
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # ) #.cuda()
    model = DualRobertaForDotProduct.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    eval_dataset = (RecDataset(data_args,
                               tokenizer=tokenizer,
                               mode="dev",
                               cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)
    test_dataset = (RecDataset(data_args,
                               tokenizer=tokenizer,
                               mode="test",
                               cache_dir=model_args.cache_dir)
                    if training_args.do_predict else None)

    def mse(preds, labels):
        return ((preds - labels) * (preds - labels)).mean()

    def compute_metrics_fn(p: EvalPrediction):
        preds = p.predictions
        return {"Rec": mse(preds, p.label_ids)}

    # Initialize our Trainer
    # The Trainer's training code is well worth reading: https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py#L134
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]

        for test_dataset in test_datasets:
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions

            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(
                        test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        writer.write("%d\t%3.3f\n" % (index, item))
    return eval_results
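
A small self-contained check of the MSE-based metric used above, with toy arrays (the values are made up for illustration):

import numpy as np
from transformers import EvalPrediction


def mse(preds, labels):
    return ((preds - labels) * (preds - labels)).mean()


preds = np.array([2.5, 0.0, 2.0, 8.0])
labels = np.array([3.0, -0.5, 2.0, 7.0])
p = EvalPrediction(predictions=preds, label_ids=labels)
print({"Rec": mse(p.predictions, p.label_ids)})  # {'Rec': 0.375}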
Code example #5
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # model_type = model_args.model_type
    # log_dir = './results'

    # if model_type == 'base':
    #     model_args.model_name_or_path = 'bert-base-uncased'
    # elif model_type == 'base-pubmed':
    #     model_args.model_name_or_path = 'bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12'
    # elif model_type == 'base-pubmed-mimic':
    #     model_args.model_name_or_path = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12'
    # else:
    #     raise NotImplementedError

    # Setup logging
    logging.basicConfig(
        format=
        '[%(asctime)s - %(levelname)s - %(filename)s: %(lineno)d (%(funcName)s)] %(message)s',
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    dataset_name = data_args.data_dir.split('/')[-1]
    if dataset_name in ['GAD', 'EUADR']:
        final_split_results = []

        original_data_dir = copy.deepcopy(x=data_args.data_dir)

        data_splits = list(map(str, range(1, 11)))
        for split in data_splits:
            data_args.data_dir = os.path.join(original_data_dir, split)

            # Get datasets
            train_dataset = (GlueDataset(
                data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                             if training_args.do_train else None)
            eval_dataset = (GlueDataset(data_args,
                                        tokenizer=tokenizer,
                                        mode="dev",
                                        cache_dir=model_args.cache_dir)
                            if training_args.do_eval else None)
            test_dataset = (GlueDataset(data_args,
                                        tokenizer=tokenizer,
                                        mode="test",
                                        cache_dir=model_args.cache_dir)
                            if training_args.do_predict else None)

            # Load pretrained model
            # Distributed training:
            # The .from_pretrained methods guarantee that only one local process can concurrently
            # download model & vocab.

            # Currently, this code does not support distributed training.
            training_args.warmup_steps = int(
                model_args.warmup_proportion *
                (len(train_dataset) /
                 training_args.per_device_train_batch_size) *
                training_args.num_train_epochs)
            training_args.weight_decay = 0.01
            logger.info("Training/evaluation parameters %s", training_args)

            config = AutoConfig.from_pretrained(
                model_args.config_name
                if model_args.config_name else model_args.model_name_or_path,
                num_labels=num_labels,
                finetuning_task=data_args.task_name,
                cache_dir=model_args.cache_dir,
            )
            try:
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_args.model_name_or_path,
                    from_tf=False,
                    config=config,
                    cache_dir=model_args.cache_dir,
                )
            except Exception:
                model = AutoModelForSequenceClassification.from_pretrained(
                    os.path.join(model_args.model_name_or_path,
                                 "model.ckpt.index"),
                    from_tf=True,
                    config=config,
                    cache_dir=model_args.cache_dir,
                )

            def build_compute_metrics_fn(
                    task_name: str) -> Callable[[EvalPrediction], Dict]:
                def compute_metrics_fn(p: EvalPrediction):
                    if output_mode == "classification":
                        preds = np.argmax(p.predictions, axis=1)
                    elif output_mode == "regression":
                        preds = np.squeeze(p.predictions)
                    return glue_compute_metrics(task_name, preds, p.label_ids)

                return compute_metrics_fn

            # Initialize our Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                compute_metrics=build_compute_metrics_fn(data_args.task_name),
            )

            # Training
            if training_args.do_train:
                training_start_time = time.time()

                trainer.train(
                    model_path=model_args.model_name_or_path if os.path.
                    isdir(model_args.model_name_or_path) else None)

                training_end_time = time.time()
                training_total_time = training_end_time - training_start_time

                trainer.save_model()
                # For convenience, we also re-save the tokenizer to the same directory,
                # so that you can share your model easily on huggingface.co/models =)
                if trainer.is_world_master():
                    tokenizer.save_pretrained(training_args.output_dir)

            # Evaluation
            eval_results = {}
            if training_args.do_eval:
                logger.info("*** Evaluate ***")

                # Loop to handle MNLI double evaluation (matched, mis-matched)
                eval_datasets = [eval_dataset]
                if data_args.task_name == "mnli":
                    mnli_mm_data_args = dataclasses.replace(
                        data_args, task_name="mnli-mm")
                    eval_datasets.append(
                        GlueDataset(mnli_mm_data_args,
                                    tokenizer=tokenizer,
                                    mode="dev",
                                    cache_dir=model_args.cache_dir))

                for eval_dataset in eval_datasets:
                    trainer.compute_metrics = build_compute_metrics_fn(
                        eval_dataset.args.task_name)
                    eval_result = trainer.evaluate(eval_dataset=eval_dataset)

                    output_eval_file = os.path.join(
                        training_args.output_dir,
                        f"eval_results_{eval_dataset.args.task_name}.txt")
                    if trainer.is_world_master():
                        with open(output_eval_file, "w") as writer:
                            logger.info("***** Eval results {} *****".format(
                                eval_dataset.args.task_name))
                            for key, value in eval_result.items():
                                logger.info("  %s = %s", key, value)
                                writer.write("%s = %s\n" % (key, value))

                    eval_results.update(eval_result)

            if training_args.do_predict:
                logging.info("*** Test ***")
                test_datasets = [test_dataset]
                if data_args.task_name == "mnli":
                    mnli_mm_data_args = dataclasses.replace(
                        data_args, task_name="mnli-mm")
                    test_datasets.append(
                        GlueDataset(mnli_mm_data_args,
                                    tokenizer=tokenizer,
                                    mode="test",
                                    cache_dir=model_args.cache_dir))

                for test_dataset in test_datasets:
                    predictions = trainer.predict(
                        test_dataset=test_dataset).predictions
                    labels = np.array([
                        test_dataset.__getitem__(idx).label
                        for idx in range(len(test_dataset))
                    ])

                    assert len(predictions) == len(
                        labels
                    ), f"len(predictions) = {len(predictions)} =/= len(labels) = {len(labels)}"

                    if output_mode == "classification":
                        predictions = np.argmax(predictions, axis=1)

                    output_test_file = os.path.join(
                        training_args.output_dir, f"test_results.txt"
                        #f"test_results_{test_dataset.args.task_name}.txt"
                    )

                    test_results = glue_compute_metrics(task_name='ddi',
                                                        preds=predictions,
                                                        labels=labels)

                    if trainer.is_world_master():
                        with open(output_test_file, "w") as writer:
                            logger.info("***** Test results {} *****".format(
                                test_dataset.args.task_name))
                            logger.info(
                                f"Accuracy: {test_results['acc']}\tMacro F1: {test_results['f1']}"
                            )
                            writer.write("index\tprediction\n")
                            for index, item in enumerate(predictions):
                                if output_mode == "regression":
                                    writer.write("%d\t%3.3f\n" % (index, item))
                                else:
                                    item = test_dataset.get_labels()[item]
                                    writer.write("%d\t%s\n" % (index, item))

                        training_time_formatted = time.strftime(
                            '%H:%M:%S', time.gmtime(training_total_time))
                        logger.info(
                            f"Total training time: {training_time_formatted}")

            final_results = copy.deepcopy(x=test_results)
            final_results['training_time'] = training_time_formatted

            logger.info(
                f"F1: {final_results['f1']} | Acc: {final_results['acc']} | Time Elapsed: {final_results['training_time']}"
            )

            final_split_results.append(final_results)
    else:
        # Get datasets
        train_dataset = (GlueDataset(
            data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                         if training_args.do_train else None)
        eval_dataset = (GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="dev",
                                    cache_dir=model_args.cache_dir)
                        if training_args.do_eval else None)
        test_dataset = (GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="test",
                                    cache_dir=model_args.cache_dir)
                        if training_args.do_predict else None)

        # Load pretrained model
        # Distributed training:
        # The .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.

        # Currently, this code does not support distributed training.
        training_args.warmup_steps = int(
            model_args.warmup_proportion *
            (len(train_dataset) / training_args.per_device_train_batch_size) *
            training_args.num_train_epochs)
        training_args.weight_decay = 0.01
        logger.info("Training/evaluation parameters %s", training_args)

        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_args.model_name_or_path,
                from_tf=False,
                config=config,
                cache_dir=model_args.cache_dir,
            )
        except Exception:
            model = AutoModelForSequenceClassification.from_pretrained(
                os.path.join(model_args.model_name_or_path,
                             "model.ckpt.index"),
                from_tf=True,
                config=config,
                cache_dir=model_args.cache_dir,
            )

        def build_compute_metrics_fn(
                task_name: str) -> Callable[[EvalPrediction], Dict]:
            def compute_metrics_fn(p: EvalPrediction):
                if output_mode == "classification":
                    preds = np.argmax(p.predictions, axis=1)
                elif output_mode == "regression":
                    preds = np.squeeze(p.predictions)
                return glue_compute_metrics(task_name, preds, p.label_ids)

            return compute_metrics_fn

        # Initialize our Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=build_compute_metrics_fn(data_args.task_name),
        )

        # Training
        if training_args.do_train:
            training_start_time = time.time()

            trainer.train(model_path=model_args.model_name_or_path if os.path.
                          isdir(model_args.model_name_or_path) else None)

            training_end_time = time.time()
            training_total_time = training_end_time - training_start_time

            trainer.save_model()
            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            if trainer.is_world_master():
                tokenizer.save_pretrained(training_args.output_dir)

        # Evaluation
        eval_results = {}
        if training_args.do_eval:
            logger.info("*** Evaluate ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            eval_datasets = [eval_dataset]
            if data_args.task_name == "mnli":
                mnli_mm_data_args = dataclasses.replace(data_args,
                                                        task_name="mnli-mm")
                eval_datasets.append(
                    GlueDataset(mnli_mm_data_args,
                                tokenizer=tokenizer,
                                mode="dev",
                                cache_dir=model_args.cache_dir))

            for eval_dataset in eval_datasets:
                trainer.compute_metrics = build_compute_metrics_fn(
                    eval_dataset.args.task_name)
                eval_result = trainer.evaluate(eval_dataset=eval_dataset)

                output_eval_file = os.path.join(
                    training_args.output_dir,
                    f"eval_results_{eval_dataset.args.task_name}.txt")
                if trainer.is_world_master():
                    with open(output_eval_file, "w") as writer:
                        logger.info("***** Eval results {} *****".format(
                            eval_dataset.args.task_name))
                        for key, value in eval_result.items():
                            logger.info("  %s = %s", key, value)
                            writer.write("%s = %s\n" % (key, value))

                eval_results.update(eval_result)

        if training_args.do_predict:
            logging.info("*** Test ***")
            test_datasets = [test_dataset]
            if data_args.task_name == "mnli":
                mnli_mm_data_args = dataclasses.replace(data_args,
                                                        task_name="mnli-mm")
                test_datasets.append(
                    GlueDataset(mnli_mm_data_args,
                                tokenizer=tokenizer,
                                mode="test",
                                cache_dir=model_args.cache_dir))

            for test_dataset in test_datasets:
                predictions = trainer.predict(
                    test_dataset=test_dataset).predictions
                labels = np.array([
                    test_dataset.__getitem__(idx).label
                    for idx in range(len(test_dataset))
                ])

                assert len(predictions) == len(
                    labels
                ), f"len(predictions) = {len(predictions)} =/= len(labels) = {len(labels)}"

                if output_mode == "classification":
                    predictions = np.argmax(predictions, axis=1)

                output_test_file = os.path.join(
                    training_args.output_dir, f"test_results.txt"
                    #f"test_results_{test_dataset.args.task_name}.txt"
                )

                test_results = glue_compute_metrics(task_name='ddi',
                                                    preds=predictions,
                                                    labels=labels)

                if trainer.is_world_master():
                    with open(output_test_file, "w") as writer:
                        logger.info("***** Test results {} *****".format(
                            test_dataset.args.task_name))
                        logger.info(
                            f"Accuracy: {test_results['acc']}\tMacro F1: {test_results['f1']}"
                        )
                        writer.write("index\tprediction\n")
                        for index, item in enumerate(predictions):
                            if output_mode == "regression":
                                writer.write("%d\t%3.3f\n" % (index, item))
                            else:
                                item = test_dataset.get_labels()[item]
                                writer.write("%d\t%s\n" % (index, item))

                    training_time_formatted = time.strftime(
                        '%H:%M:%S', time.gmtime(training_total_time))
                    logger.info(
                        f"Total training time: {training_time_formatted}")

        final_results = copy.deepcopy(x=test_results)
        final_results['training_time'] = training_time_formatted

        logger.info(
            f"F1: {final_results['f1']} | Acc: {final_results['acc']} | Time Elapsed: {final_results['training_time']}"
        )

    if dataset_name in ['GAD', 'EUADR']:
        average_f1_scores = np.mean([x['f1'] for x in final_split_results])
        average_acc = np.mean([x['acc'] for x in final_split_results])

        logger.info(
            f"Average F1 Scores: {average_f1_scores} | Average Accuracy: {average_acc}"
        )

        return final_split_results
    else:
        return final_results
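
The GAD/EUADR branch above assumes ten cross-validation splits stored as numbered subdirectories `1` through `10` under `--data_dir`; the check below is a sketch under that assumption (the exact file names inside each split depend on the GlueDataset processor):

import os

data_dir = "./data/GAD"  # illustrative path; its basename selects the 10-fold branch
expected_splits = [str(i) for i in range(1, 11)]
missing = [s for s in expected_splits
           if not os.path.isdir(os.path.join(data_dir, s))]
if missing:
    raise FileNotFoundError(f"Missing cross-validation split folders: {missing}")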
Code example #6

trainer.train(
    model_path=model_name_or_path if os.path.isdir(model_name_or_path) else None
)

# Evaluation
eval_results = {}

logger.info("*** Evaluate ***")

# Loop to handle MNLI double evaluation (matched, mis-matched)
eval_datasets = [eval_dataset]

for eval_dataset in eval_datasets:
    trainer.compute_metrics = build_compute_metrics_fn(eval_dataset.args.task_name)
    eval_result = trainer.evaluate(eval_dataset=eval_dataset)

    output_eval_file = os.path.join(
        output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
    )
    if trainer.is_world_master():
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
            for key, value in eval_result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

    eval_results.update(eval_result)

Code example #7
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.

    parser = HfArgumentParser(dataclass_types=(ModelArguments,
                                               DataTrainingArguments,
                                               TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f'Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome.'
        )

    logger.warning(
        'Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s',
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info('Training/evaluation parameters %s', training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load the task-specific number of labels (== 1 for regression) and output mode
    try:
        num_labels = seq_clf_tasks_num_labels[data_args.task_name]
        logger.info('number of labels: %s', num_labels)
        output_mode = seq_clf_output_modes[data_args.task_name]
        logger.info('task output mode: %s', output_mode)
    except KeyError:
        raise ValueError('Task not found: %s' % (data_args.task_name))

    # Load pretrained model and tokenizer
    if model_args.config_name:
        logger.info('config_name provided as: %s', model_args.config_name)
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)

    elif model_args.model_name_or_path:
        logger.info('model_name_or_path provided as: %s',
                    model_args.model_name_or_path)

        if model_args.continue_from_checkpoint:
            logger.info(
                'checking for the newest checkpoint directory %s/checkpoint-<Trainer.global_step>',
                model_args.model_name_or_path)
            sorted_checkpoints = _sorted_checkpoints(
                model_args.model_name_or_path)
            logger.info('checkpoints found: %s', sorted_checkpoints)
            if len(sorted_checkpoints) == 0:
                raise ValueError(
                    'Used --continue_from_checkpoint but no checkpoint was found in --model_name_or_path.'
                )
            else:
                model_args.model_name_or_path = sorted_checkpoints[-1]
        config = AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        # use_fast=True,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool('.ckpt' in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (SeqClfDataset(
        args=data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (SeqClfDataset(args=data_args,
                                  tokenizer=tokenizer,
                                  mode='dev',
                                  cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)
    test_dataset = (SeqClfDataset(args=data_args,
                                  tokenizer=tokenizer,
                                  mode='test',
                                  cache_dir=model_args.cache_dir)
                    if training_args.do_predict else None)

    # Metrics computation for a task
    def build_compute_metrics_fn(
            task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction) -> Dict:
            """computes metrics

            Args:
                p (EvalPrediction): NamedTuple with predictions and label ids

            Returns:
                Dict: a dict with metrics
            """
            if output_mode == 'classification':
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == 'regression':
                preds = np.squeeze(
                    p.predictions
                )  # see x = np.array([[[0], [1], [2]]]) x.shape np.squeeze(x).shape
            # logger.info('DEBUGGING testing: ')
            # logger.info('preds: ', '\n', preds)
            # logger.info('p.label_ids: ', '\n', p.label_ids)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info('*** Evaluate ***')

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == 'mnli':
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name='mnli-mm')
            eval_datasets.append(
                GlueDataset(mnli_mm_data_args,
                            tokenizer=tokenizer,
                            mode='dev',
                            cache_dir=model_args.cache_dir))

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir,
                f'eval_results_{eval_dataset.args.task_name}.txt')

            if trainer.is_world_master():
                with open(output_eval_file, 'w') as writer:
                    logger.info('***** Eval results {} *****'.format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info('  %s = %s', key, value)
                        writer.write('%s = %s\n' % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info('*** Test ***')
        test_datasets = [test_dataset]
        if data_args.task_name == 'mnli':
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name='mnli-mm')
            test_datasets.append(
                GlueDataset(mnli_mm_data_args,
                            tokenizer=tokenizer,
                            mode='test',
                            cache_dir=model_args.cache_dir))

        for test_dataset in test_datasets:
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            if output_mode == 'classification':
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir,
                f'test_results_{test_dataset.args.task_name}.txt')
            if trainer.is_world_master():
                with open(output_test_file, 'w') as writer:
                    logger.info('***** Test results {} *****'.format(
                        test_dataset.args.task_name))
                    writer.write('index\tprediction\n')
                    for index, item in enumerate(predictions):
                        if output_mode == 'regression':
                            writer.write('%d\t%3.3f\n' % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write('%d\t%s\n' % (index, item))
    return eval_results
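
The _sorted_checkpoints helper called above is not part of this listing; a minimal sketch, assuming the Trainer saves checkpoints as checkpoint-<global_step> subdirectories of model_name_or_path, might look like this (the exact name and signature are assumptions):

import re
from pathlib import Path


def _sorted_checkpoints(model_path: str, checkpoint_prefix: str = "checkpoint"):
    # Collect checkpoint-<step> directories and order them by step, so that
    # the last entry is the most recent checkpoint (as used above).
    steps_and_paths = []
    for path in Path(model_path).glob(f"{checkpoint_prefix}-*"):
        match = re.match(rf"{checkpoint_prefix}-(\d+)$", path.name)
        if match and path.is_dir():
            steps_and_paths.append((int(match.group(1)), str(path)))
    return [p for _, p in sorted(steps_and_paths)]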
コード例 #8
0
def main(args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Set project name
    os.environ["WANDB_PROJECT"] = "multilingual_zeroshot"

    num_labels = 3
    labels = ['entailment', 'neutral', 'contradiction']

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = MBartConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        dropout=model_args.dropout,
        attention_dropout=model_args.attention_dropout,
        finetuning_task="mnli",
        cache_dir=model_args.cache_dir,
    )
    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = MBartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    columns = ['input_ids', 'attention_mask', 'labels']
    map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length,
                             tokenizer)

    train_dataset = nlp.load_dataset("multi_nli", split="train")
    train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512)
    train_dataset.set_format(type='torch', columns=columns)

    eval_dataset = None
    if training_args.do_eval:
        eval_dataset = nlp.load_dataset("multi_nli", split="validation_matched")
        eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512)
        eval_dataset.set_format(type='torch', columns=columns)

    def compute_metrics_fn(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics("classification", preds, p.label_ids)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
        data_collator=DataCollator(tokenizer),
    )

    # disable wandb console logs
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        mis_matched_dataset = nlp.load_dataset("multi_nli",
                                               split="validation_mismatched")
        mis_matched_dataset = mis_matched_dataset.map(map_fn,
                                                      batched=True,
                                                      batch_size=512)
        mis_matched_dataset.set_format(type='torch', columns=columns)
        eval_datasets = [eval_dataset, mis_matched_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir,
                                            "eval_results.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)
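
The get_mnli_map_fn helper used above is not shown in this listing; a minimal sketch, assuming the multi_nli examples expose premise, hypothesis and label fields and that padding to max_seq_length is acceptable, could be:

def get_mnli_map_fn(lang, max_seq_length, tokenizer):
    # Returns a batched map function producing the 'input_ids',
    # 'attention_mask' and 'labels' columns selected above. How `lang` is
    # used (e.g. to pick the mBART source-language code) is an assumption
    # left out of this sketch.
    def map_fn(batch):
        encodings = tokenizer(
            batch["premise"],
            batch["hypothesis"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )
        encodings["labels"] = batch["label"]
        return encodings

    return map_fn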
コード例 #9
0
def train(model_args, data_args, training_args):
    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logs
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if 'roberta' in model_args.model_type:
        tokenizer = RobertaTokenizer.from_pretrained(
            model_args.model_name_or_path)
        config = RobertaConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = RobertaForSequenceClassification.from_pretrained(
            model_args.model_name_or_path, config=config)
    elif 'electra' in model_args.model_type:
        tokenizer = ElectraTokenizer.from_pretrained(
            model_args.model_name_or_path)
        config = ElectraConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = ElectraForSequenceClassification.from_pretrained(
            model_args.model_name_or_path, config=config)
    else:
        # default -> bert
        tokenizer = BertTokenizer.from_pretrained(
            model_args.model_name_or_path)
        config = BertConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = BertForSequenceClassification.from_pretrained(
            model_args.model_name_or_path, config=config)
        # NOTE: add_special_tokens() requires a dict of special tokens,
        # e.g. {'additional_special_tokens': [...]}; the original call here
        # passed no argument, which raises a TypeError, so it is left disabled.
        # tokenizer.add_special_tokens({'additional_special_tokens': [...]})

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_df = pd.read_csv(data_args.train_data_file, sep='\t')
    if data_args.add_train_data_file1 is not None:
        tmp = pd.read_csv(data_args.add_train_data_file1, sep='\t')
        train_df = pd.concat([train_df, tmp])
    if data_args.add_train_data_file2 is not None:
        tmp = pd.read_csv(data_args.add_train_data_file2, sep='\t')
        train_df = pd.concat([train_df, tmp])
    train_df = train_df.fillna('no_q')
    train_dataset = get_dataset(
        data_args, tokenizer, train_df,
        model_args.model_type) if training_args.do_train else None

    dev_df = pd.read_csv(data_args.eval_data_file, sep='\t')
    dev_df = dev_df.fillna('no_q')
    eval_dataset = get_dataset(
        data_args, tokenizer, dev_df,
        model_args.model_type) if training_args.do_eval else None
    data_collator = MyDataCollator()

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(),
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        trainer.compute_metrics = build_compute_metrics_fn()
        result = trainer.evaluate(eval_dataset=eval_dataset)

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
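
A possible entry point for this train function, mirroring the argument-parsing pattern used in the other examples (the dataclass names are assumptions carried over from them):

if __name__ == "__main__":
    # Parse the three argument dataclasses from the command line and hand
    # them to train().
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    train(model_args, data_args, training_args)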
コード例 #10
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        # Otherwise, parse the arguments from the command line
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Refuse to overwrite a non-empty output directory unless --overwrite_output_dir is set
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    # Log the current setup
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    # Log all parameters
    logger.info("Training/evaluation parameters %s", training_args)

    # Set the random seed
    set_seed(training_args.seed)
    # num_labels is the number of classes; output_mode is the task type, e.g. 'classification'
    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    # Pass the custom arguments finetuning_task and num_labels to the config
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    # a ".ckpt" path means loading a TensorFlow checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
    )
    eval_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    test_dataset = (
        GlueDataset(data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
        if training_args.do_predict
        else None
    )

    def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        logger.info("*** 开始训练 ***")
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** 开始评估 ***")

        # For MNLI, loop to handle the double evaluation (matched, mismatched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            eval_datasets.append(
                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev", cache_dir=model_args.cache_dir)
            )

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** 开始预测 ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            test_datasets.append(
                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test", cache_dir=model_args.cache_dir)
            )

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))
    return eval_results
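
Because the parser accepts a single .json path, this script can also be driven by an argument file; a minimal sketch of generating one (the field names are assumptions taken from the standard GLUE argument dataclasses, and all values are placeholders):

import json

# Placeholder values; adjust model, task, paths and hyperparameters as needed.
args = {
    "model_name_or_path": "bert-base-uncased",
    "task_name": "sst-2",
    "data_dir": "./glue_data/SST-2",
    "max_seq_length": 128,
    "output_dir": "./output/sst-2",
    "overwrite_output_dir": True,
    "do_train": True,
    "do_eval": True,
    "seed": 42,
}
with open("train_args.json", "w") as f:
    json.dump(args, f, indent=2)

# Then, assuming this script is saved as run_glue.py:
#   python run_glue.py train_args.json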
コード例 #11
0
ファイル: run.py プロジェクト: giganticode/bohr
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, TrainingArguments))
    model_args, training_args = parser.parse_args_into_dataclasses()
    print(model_args)
    print(training_args)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    num_labels = len(LABEL_NAMES)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # config = RobertaConfig.from_pretrained(
    #     model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    #     num_labels=num_labels,
    #     # hidden_dropout_prob=0.00,
    #     cache_dir=model_args.cache_dir,
    # )
    # tokenizer = RobertaTokenizerFast.from_pretrained(
    #     model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    #     cache_dir=model_args.cache_dir,
    #     do_lower_case=False
    # )
    # model = HeadlessRobertaForSequenceClassification.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=bool(".ckpt" in model_args.model_name_or_path),
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )

    def df_to_dataset(df):
        print("Loading dataset...")
        df = df[df.bug != -1]
        df = df[~df.message.isnull()]

        text_values = df.message.values
        label_ids = df.bug.values

        text_values_list = text_values.tolist()
        for elm in text_values_list:
            if not isinstance(elm, str):
                print(elm)

        encoding = tokenizer(
            text_values_list,
            add_special_tokens=True,
            return_attention_mask=True,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt",
        )

        input_ids = encoding["input_ids"]
        label_ids_dtype = torch.float32 if num_labels == 1 else torch.int64
        label_ids_t = torch.tensor(label_ids, dtype=label_ids_dtype)

        print(tokenizer.decode(input_ids[0, :].tolist()))

        print("DF shape: ", df.shape)
        print(input_ids.shape)
        print(label_ids_t.shape)

        dataset = SimpleDataset(input_ids, encoding["attention_mask"],
                                label_ids_t)

        print("Done")
        return dataset

    if model_args.eval_test:
        print("**** TEST EVAL *****")
        test_df = pd.read_csv(model_args.data_file)
        eval_dataset = df_to_dataset(test_df)
        train_dataset = None
    else:
        print("**** TRAINING ******")
        train_valid_df = pd.read_csv(model_args.data_file)
        train_df, valid_df = train_test_split(train_valid_df,
                                              test_size=0.1,
                                              shuffle=False)
        train_dataset = df_to_dataset(train_df)
        eval_dataset = df_to_dataset(valid_df)

    output_mode = model_args.output_mode

    def compute_metrics_fn(p: EvalPrediction):
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        else:
            # only classification is supported in this script
            raise ValueError("unsupported output_mode: %s" % output_mode)
        print(preds)
        print(p.label_ids)

        print(
            classification_report(p.label_ids,
                                  preds,
                                  target_names=LABEL_NAMES,
                                  digits=3))

        acc = accuracy_score(p.label_ids, preds)
        f1 = f1_score(p.label_ids, preds, average="macro")
        return {
            "acc": acc,
            "f1": f1,
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir,
                                            "eval_results.txt")
            if trainer.is_world_process_zero():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        eval_datasets = [eval_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            predictions = trainer.predict(
                test_dataset=eval_dataset).predictions

            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(training_args.output_dir,
                                            "assigned_labels.csv")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results *****")
                    writer.write("index,prediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d,%3.3f\n" % (index, item))
                        else:
                            # item = LABEL_NAMES[item]
                            writer.write("%d,%s\n" % (index, item))
    return eval_results
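
The SimpleDataset class used in df_to_dataset above is not part of this listing; a minimal Trainer-compatible sketch, assuming each item is returned as a dict of tensors keyed by input_ids, attention_mask and labels, could be:

import torch
from torch.utils.data import Dataset


class SimpleDataset(Dataset):
    # Wraps pre-tokenized tensors so the Trainer can index them as
    # feature dicts.
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return self.labels.size(0)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }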