Example #1
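# Import sketch: the excerpt below assumes roughly these imports (transformers v3.x
# GLUE fine-tuning API). ModelArguments and DataTrainingArguments are dataclasses
# defined elsewhere in the original script (fields such as model_name_or_path,
# warmup_proportion, config_name, tokenizer_name, cache_dir, data_dir, task_name),
# and glue_compute_metrics is likely the project's patched variant that adds
# biomedical tasks such as 'ddi'.
import copy
import dataclasses
import logging
import os
import sys
import time
from typing import Callable, Dict

import numpy as np
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    GlueDataset,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_output_modes,
    glue_tasks_num_labels,
    set_seed,
)

logger = logging.getLogger(__name__)
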
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
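    # Typical invocations (the script name is hypothetical):
    #   python run_re.py path/to/args.json
    #   python run_re.py --model_name_or_path bert-base-uncased --task_name <task> \
    #       --data_dir <dir> --output_dir <dir> --do_train --do_eval --do_predict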

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # model_type = model_args.model_type
    # log_dir = './results'

    # if model_type == 'base':
    #     model_args.model_name_or_path = 'bert-base-uncased'
    # elif model_type == 'base-pubmed':
    #     model_args.model_name_or_path = 'bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12'
    # elif model_type == 'base-pubmed-mimic':
    #     model_args.model_name_or_path = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12'
    # else:
    #     raise NotImplementedError

    # Setup logging
    logging.basicConfig(
        format=
        '[%(asctime)s - %(levelname)s - %(filename)s: %(lineno)d (%(funcName)s)] %(message)s',
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    dataset_name = data_args.data_dir.split('/')[-1]
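    # GAD and EUADR ship as 10 cross-validation splits (subdirectories '1'..'10'),
    # so the full train/eval/predict pipeline below runs once per split and the
    # per-split metrics are averaged at the end. Note that this basename extraction
    # assumes data_dir has no trailing slash.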
    if dataset_name in ['GAD', 'EUADR']:
        final_split_results = []

        original_data_dir = copy.deepcopy(x=data_args.data_dir)

        data_splits = list(map(str, range(1, 11)))
        for split in data_splits:
            data_args.data_dir = os.path.join(original_data_dir, split)

            # Get datasets
            train_dataset = (GlueDataset(
                data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                             if training_args.do_train else None)
            eval_dataset = (GlueDataset(data_args,
                                        tokenizer=tokenizer,
                                        mode="dev",
                                        cache_dir=model_args.cache_dir)
                            if training_args.do_eval else None)
            test_dataset = (GlueDataset(data_args,
                                        tokenizer=tokenizer,
                                        mode="test",
                                        cache_dir=model_args.cache_dir)
                            if training_args.do_predict else None)

            # Load pretrained model
            # Distributed training:
            # The .from_pretrained methods guarantee that only one local process can concurrently
            # download model & vocab.

            # Currently, this code does not support distributed training.
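            # Convert the warmup proportion into an absolute step count:
            # warmup_steps = warmup_proportion * steps_per_epoch * num_train_epochs,
            # with steps_per_epoch approximated as len(train_dataset) /
            # per_device_train_batch_size (gradient accumulation and multi-GPU
            # batching are not taken into account here).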
            training_args.warmup_steps = int(
                model_args.warmup_proportion *
                (len(train_dataset) /
                 training_args.per_device_train_batch_size) *
                training_args.num_train_epochs)
            training_args.weight_decay = 0.01
            logger.info("Training/evaluation parameters %s", training_args)

            config = AutoConfig.from_pretrained(
                model_args.config_name
                if model_args.config_name else model_args.model_name_or_path,
                num_labels=num_labels,
                finetuning_task=data_args.task_name,
                cache_dir=model_args.cache_dir,
            )
            try:
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_args.model_name_or_path,
                    from_tf=False,
                    config=config,
                    cache_dir=model_args.cache_dir,
                )
            except Exception:
                # Fall back to loading a TensorFlow checkpoint if the PyTorch load fails.
                model = AutoModelForSequenceClassification.from_pretrained(
                    os.path.join(model_args.model_name_or_path,
                                 "model.ckpt.index"),
                    from_tf=True,
                    config=config,
                    cache_dir=model_args.cache_dir,
                )

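            # Metric hook for the Trainer: argmax over logits for classification,
            # squeeze for regression, then delegate to glue_compute_metrics.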
            def build_compute_metrics_fn(
                    task_name: str) -> Callable[[EvalPrediction], Dict]:
                def compute_metrics_fn(p: EvalPrediction):
                    if output_mode == "classification":
                        preds = np.argmax(p.predictions, axis=1)
                    elif output_mode == "regression":
                        preds = np.squeeze(p.predictions)
                    return glue_compute_metrics(task_name, preds, p.label_ids)

                return compute_metrics_fn

            # Initialize our Trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                compute_metrics=build_compute_metrics_fn(data_args.task_name),
            )

            # Training
            if training_args.do_train:
                training_start_time = time.time()

                trainer.train(
                    model_path=model_args.model_name_or_path if os.path.
                    isdir(model_args.model_name_or_path) else None)

                training_end_time = time.time()
                training_total_time = training_end_time - training_start_time

                trainer.save_model()
                # For convenience, we also re-save the tokenizer to the same directory,
                # so that you can share your model easily on huggingface.co/models =)
                if trainer.is_world_master():
                    tokenizer.save_pretrained(training_args.output_dir)

            # Evaluation
            eval_results = {}
            if training_args.do_eval:
                logger.info("*** Evaluate ***")

                # Loop to handle MNLI double evaluation (matched, mis-matched)
                eval_datasets = [eval_dataset]
                if data_args.task_name == "mnli":
                    mnli_mm_data_args = dataclasses.replace(
                        data_args, task_name="mnli-mm")
                    eval_datasets.append(
                        GlueDataset(mnli_mm_data_args,
                                    tokenizer=tokenizer,
                                    mode="dev",
                                    cache_dir=model_args.cache_dir))

                for eval_dataset in eval_datasets:
                    trainer.compute_metrics = build_compute_metrics_fn(
                        eval_dataset.args.task_name)
                    eval_result = trainer.evaluate(eval_dataset=eval_dataset)

                    output_eval_file = os.path.join(
                        training_args.output_dir,
                        f"eval_results_{eval_dataset.args.task_name}.txt")
                    if trainer.is_world_master():
                        with open(output_eval_file, "w") as writer:
                            logger.info("***** Eval results {} *****".format(
                                eval_dataset.args.task_name))
                            for key, value in eval_result.items():
                                logger.info("  %s = %s", key, value)
                                writer.write("%s = %s\n" % (key, value))

                    eval_results.update(eval_result)

            if training_args.do_predict:
                logging.info("*** Test ***")
                test_datasets = [test_dataset]
                if data_args.task_name == "mnli":
                    mnli_mm_data_args = dataclasses.replace(
                        data_args, task_name="mnli-mm")
                    test_datasets.append(
                        GlueDataset(mnli_mm_data_args,
                                    tokenizer=tokenizer,
                                    mode="test",
                                    cache_dir=model_args.cache_dir))

                for test_dataset in test_datasets:
                    predictions = trainer.predict(
                        test_dataset=test_dataset).predictions
                    labels = np.array([
                        test_dataset[idx].label
                        for idx in range(len(test_dataset))
                    ])

                    assert len(predictions) == len(
                        labels
                    ), f"len(predictions) = {len(predictions)} =/= len(labels) = {len(labels)}"

                    if output_mode == "classification":
                        predictions = np.argmax(predictions, axis=1)

                    output_test_file = os.path.join(
                        training_args.output_dir, "test_results.txt"
                        #f"test_results_{test_dataset.args.task_name}.txt"
                    )

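                    # NOTE: this assumes the project's patched glue_compute_metrics
                    # defines a 'ddi' task returning accuracy and macro F1; that
                    # metric is reused for every test set regardless of task_name.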
                    test_results = glue_compute_metrics(task_name='ddi',
                                                        preds=predictions,
                                                        labels=labels)

                    if trainer.is_world_master():
                        with open(output_test_file, "w") as writer:
                            logger.info("***** Test results {} *****".format(
                                test_dataset.args.task_name))
                            logger.info(
                                f"Accuracy: {test_results['acc']}\tMacro F1: {test_results['f1']}"
                            )
                            writer.write("index\tprediction\n")
                            for index, item in enumerate(predictions):
                                if output_mode == "regression":
                                    writer.write("%d\t%3.3f\n" % (index, item))
                                else:
                                    item = test_dataset.get_labels()[item]
                                    writer.write("%d\t%s\n" % (index, item))

                        training_time_formatted = time.strftime(
                            '%H:%M:%S', time.gmtime(training_total_time))
                        logger.info(
                            f"Total training time: {training_time_formatted}")

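            # NOTE: the summary below assumes --do_train and --do_predict were both
            # set (and that this is the world-master process); otherwise test_results
            # and training_time_formatted would be undefined here.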
            final_results = copy.deepcopy(x=test_results)
            final_results['training_time'] = training_time_formatted

            logger.info(
                f"F1: {final_results['f1']} | Acc: {final_results['acc']} | Time Elapsed: {final_results['training_time']}"
            )

            final_split_results.append(final_results)
    else:
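        # Datasets with a single split: run the same train/eval/predict pipeline
        # once on data_args.data_dir directly.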
        # Get datasets
        train_dataset = (GlueDataset(
            data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                         if training_args.do_train else None)
        eval_dataset = (GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="dev",
                                    cache_dir=model_args.cache_dir)
                        if training_args.do_eval else None)
        test_dataset = (GlueDataset(data_args,
                                    tokenizer=tokenizer,
                                    mode="test",
                                    cache_dir=model_args.cache_dir)
                        if training_args.do_predict else None)

        # Load pretrained model
        # Distributed training:
        # The .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.

        # Currently, this code does not support distributed training.
        training_args.warmup_steps = int(
            model_args.warmup_proportion *
            (len(train_dataset) / training_args.per_device_train_batch_size) *
            training_args.num_train_epochs)
        training_args.weight_decay = 0.01
        logger.info("Training/evaluation parameters %s", training_args)

        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_args.model_name_or_path,
                from_tf=False,
                config=config,
                cache_dir=model_args.cache_dir,
            )
        except Exception:
            # Fall back to loading a TensorFlow checkpoint if the PyTorch load fails.
            model = AutoModelForSequenceClassification.from_pretrained(
                os.path.join(model_args.model_name_or_path,
                             "model.ckpt.index"),
                from_tf=True,
                config=config,
                cache_dir=model_args.cache_dir,
            )

        def build_compute_metrics_fn(
                task_name: str) -> Callable[[EvalPrediction], Dict]:
            def compute_metrics_fn(p: EvalPrediction):
                if output_mode == "classification":
                    preds = np.argmax(p.predictions, axis=1)
                elif output_mode == "regression":
                    preds = np.squeeze(p.predictions)
                return glue_compute_metrics(task_name, preds, p.label_ids)

            return compute_metrics_fn

        # Initialize our Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=build_compute_metrics_fn(data_args.task_name),
        )

        # Training
        if training_args.do_train:
            training_start_time = time.time()

            trainer.train(model_path=model_args.model_name_or_path if os.path.
                          isdir(model_args.model_name_or_path) else None)

            training_end_time = time.time()
            training_total_time = training_end_time - training_start_time

            trainer.save_model()
            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            if trainer.is_world_master():
                tokenizer.save_pretrained(training_args.output_dir)

        # Evaluation
        eval_results = {}
        if training_args.do_eval:
            logger.info("*** Evaluate ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            eval_datasets = [eval_dataset]
            if data_args.task_name == "mnli":
                mnli_mm_data_args = dataclasses.replace(data_args,
                                                        task_name="mnli-mm")
                eval_datasets.append(
                    GlueDataset(mnli_mm_data_args,
                                tokenizer=tokenizer,
                                mode="dev",
                                cache_dir=model_args.cache_dir))

            for eval_dataset in eval_datasets:
                trainer.compute_metrics = build_compute_metrics_fn(
                    eval_dataset.args.task_name)
                eval_result = trainer.evaluate(eval_dataset=eval_dataset)

                output_eval_file = os.path.join(
                    training_args.output_dir,
                    f"eval_results_{eval_dataset.args.task_name}.txt")
                if trainer.is_world_master():
                    with open(output_eval_file, "w") as writer:
                        logger.info("***** Eval results {} *****".format(
                            eval_dataset.args.task_name))
                        for key, value in eval_result.items():
                            logger.info("  %s = %s", key, value)
                            writer.write("%s = %s\n" % (key, value))

                eval_results.update(eval_result)

        if training_args.do_predict:
            logging.info("*** Test ***")
            test_datasets = [test_dataset]
            if data_args.task_name == "mnli":
                mnli_mm_data_args = dataclasses.replace(data_args,
                                                        task_name="mnli-mm")
                test_datasets.append(
                    GlueDataset(mnli_mm_data_args,
                                tokenizer=tokenizer,
                                mode="test",
                                cache_dir=model_args.cache_dir))

            for test_dataset in test_datasets:
                predictions = trainer.predict(
                    test_dataset=test_dataset).predictions
                labels = np.array([
                    test_dataset[idx].label
                    for idx in range(len(test_dataset))
                ])

                assert len(predictions) == len(
                    labels
                ), f"len(predictions) = {len(predictions)} =/= len(labels) = {len(labels)}"

                if output_mode == "classification":
                    predictions = np.argmax(predictions, axis=1)

                output_test_file = os.path.join(
                    training_args.output_dir, "test_results.txt"
                    #f"test_results_{test_dataset.args.task_name}.txt"
                )

                test_results = glue_compute_metrics(task_name='ddi',
                                                    preds=predictions,
                                                    labels=labels)

                if trainer.is_world_master():
                    with open(output_test_file, "w") as writer:
                        logger.info("***** Test results {} *****".format(
                            test_dataset.args.task_name))
                        logger.info(
                            f"Accuracy: {test_results['acc']}\tMacro F1: {test_results['f1']}"
                        )
                        writer.write("index\tprediction\n")
                        for index, item in enumerate(predictions):
                            if output_mode == "regression":
                                writer.write("%d\t%3.3f\n" % (index, item))
                            else:
                                item = test_dataset.get_labels()[item]
                                writer.write("%d\t%s\n" % (index, item))

                    training_time_formatted = time.strftime(
                        '%H:%M:%S', time.gmtime(training_total_time))
                    logger.info(
                        f"Total training time: {training_time_formatted}")

        final_results = copy.deepcopy(x=test_results)
        final_results['training_time'] = training_time_formatted

        logger.info(
            f"F1: {final_results['f1']} | Acc: {final_results['acc']} | Time Elapsed: {final_results['training_time']}"
        )

    if dataset_name in ['GAD', 'EUADR']:
        average_f1_scores = np.mean([x['f1'] for x in final_split_results])
        average_acc = np.mean([x['acc'] for x in final_split_results])

        logger.info(
            f"Average F1 Scores: {average_f1_scores} | Average Accuracy: {average_acc}"
        )

        return final_split_results
    else:
        return final_results