Code Example #1
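
This excerpt is the main() of a Hugging Face Transformers token-classification (NER) fine-tuning script. A minimal sketch of the imports the excerpt relies on is given below; the utils_ner module name, the ModelArguments / DataTrainingArguments dataclasses, and the assumption that the metric functions are the entity-level ones from seqeval are inferred from the identifiers in the code, not shown in the excerpt itself.

import logging
import os
import sys
from importlib import import_module
from typing import Dict, List, Tuple

import numpy as np
import transformers
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch import nn
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers.trainer_utils import is_main_process

# Assumption: ModelArguments / DataTrainingArguments are dataclasses defined in the
# same script, and Split / TokenClassificationDataset / TokenClassificationTask come
# from a local utilities module (called utils_ner here).
from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask

logger = logging.getLogger(__name__)
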
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    module = import_module("tasks")
    try:
        token_classification_task_clazz = getattr(module, model_args.task_type)
        token_classification_task: TokenClassificationTask = token_classification_task_clazz()
    except AttributeError:
        raise ValueError(
            f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
            f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    # Set the verbosity of the Transformers logger to info (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare the CoNLL-2003 task
    labels = token_classification_task.get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (TokenClassificationDataset(
        token_classification_task=token_classification_task,
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    ) if training_args.do_train else None)
    eval_dataset = (TokenClassificationDataset(
        token_classification_task=token_classification_task,
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_eval else None)

    def align_predictions(
            predictions: np.ndarray,
            label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions,
                                                       p.label_ids)
        return {
            "accuracy_score": accuracy_score(out_label_list, preds_list),
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

    # Data collator
    data_collator = DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.txt"),
                          "r") as f:
                    token_classification_task.write_predictions_to_file(
                        writer, f, preds_list)

    return results
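
For completeness, a hedged sketch of the script entry point and of how such a script is typically invoked follows. The script name run_ner.py, the task name NER, and all flag values are illustrative; the exact fields of ModelArguments / DataTrainingArguments are not shown in the excerpt.

if __name__ == "__main__":
    main()

# Typical invocations (illustrative values only):
#   python run_ner.py --model_name_or_path bert-base-cased --data_dir ./data \
#       --labels ./data/labels.txt --task_type NER --output_dir ./out \
#       --do_train --do_eval
#   python run_ner.py args.json   # single JSON file handled by parse_json_file()
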
Code Example #2
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    module = import_module("tasks")
    try:
        token_classification_task_clazz = getattr(module, model_args.task_type)
        token_classification_task: TokenClassificationTask = token_classification_task_clazz()
    except AttributeError:
        raise ValueError(
            f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
            f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
        )

    # Setup logging
    # IS_IN_DOCKER_CONTAINER is assumed to be a module-level constant defined elsewhere.
    if IS_IN_DOCKER_CONTAINER or training_args.training_on_cloud:
        log_level = logging.WARNING
    else:
        log_level = logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(lineno)d -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=log_level,
    )
    logger.warning(f"log level: {log_level}")
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters:")
    for key, value in vars(training_args).items():
        logger.info("  %s = %s", key, value)

    # Set seed
    set_seed(training_args.seed)

    # Prepare the CoNLL-2003 task
    labels = token_classification_task.get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
        gradient_checkpointing=training_args.gradient_checkpointing)
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
        additional_special_tokens=["[unused1]"])
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        training_args=training_args)

    # Get datasets
    train_dataset = (TokenClassificationDataset(
        token_classification_task=token_classification_task,
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
        training_args=training_args,
        remove_label=Remove_Label) if training_args.do_train else None)
    eval_dataset = (TokenClassificationDataset(
        token_classification_task=token_classification_task,
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
        training_args=training_args,
        remove_label=Remove_Label)
                    if training_args.do_eval or training_args.do_predict_dev
                    or training_args.evaluate_during_training else None)

    def align_predictions(predictions: np.ndarray,
                          label_ids: np.ndarray,
                          dataset=None) -> Tuple[List[List[str]], List[List[str]]]:

        if training_args.use_crf:
            mask = []
            for feature in dataset.features:
                mask.append(feature.attention_mask)
            if mask:
                if isinstance(mask, list):
                    mask = np.array(mask)
                mask = torch.from_numpy(mask).cuda() == 1
                predictions = torch.from_numpy(predictions).cuda()
                preds = model.crf.decode(predictions, mask)
            else:
                preds = model.crf.decode(predictions)

            out_label_list = []
            preds_list = []

            for pred_one, label_one in zip(preds, label_ids.tolist()):
                out_label = []
                pred = []

                for p, l in zip(pred_one, label_one):
                    out_label.append(label_map[l])
                    pred.append(label_map[p])

                out_label = out_label[1:-1]
                pred = pred[1:-1]
                out_label_list.append(out_label)
                preds_list.append(pred)
        else:
            preds = np.argmax(predictions, axis=2)
            batch_size, seq_len = preds.shape
            out_label_list = [[] for _ in range(batch_size)]
            preds_list = [[] for _ in range(batch_size)]

            for i in range(batch_size):
                for j in range(seq_len):
                    if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                        out_label_list[i].append(label_map[label_ids[i][j]])
                        preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    def precision_score(y_true, y_pred, average='micro'):
        true_entities = set(y_true)
        pred_entities = set(y_pred)

        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)

        score = nb_correct / nb_pred if nb_pred > 0 else 0

        return score

    def recall_score(y_true, y_pred, average='micro', suffix=False):
        true_entities = set(y_true)
        pred_entities = set(y_pred)

        nb_correct = len(true_entities & pred_entities)
        nb_true = len(true_entities)

        score = nb_correct / nb_true if nb_true > 0 else 0

        return score

    def f_score(y_true, y_pred, average='micro', suffix=False):
        true_entities = set(y_true)
        pred_entities = set(y_pred)

        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        score = 2 * p * r / (p + r) if p + r > 0 else 0

        return score

    def compute_metrics(p: EvalPrediction, mode, dataset=None) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions,
                                                       p.label_ids, dataset)
        words_list, preds_list = post_align_predictions(
            dataset, preds_list, tokenizer)

        if mode == "dev":
            reader_file = os.path.join(data_args.data_dir, f"{mode}.txt")
        elif mode == "test":
            reader_file = os.path.join(data_args.test_data_dir, f"{mode}.txt")
        else:
            raise ValueError(f"mode is error: {mode}")

        if mode == "dev":
            data_dir = TRAIN_DIR
        elif mode == "test":
            data_dir = TEST_DIR
        else:
            raise ValueError(f"mode is error: {mode}")

        with open(reader_file, "r", encoding="utf8") as reader:
            pre_tuple, real_tuple = conver_entity_list_to_tuple(
                reader, preds_list, words_list, data_dir)
        # print(real_tuple[0], pre_tuple[0])
        result = {
            # "accuracy_score": accuracy_score(out_label_list, preds_list),
            "all_precision": precision_score(real_tuple, pre_tuple),
            "all_recall": recall_score(real_tuple, pre_tuple),
            "all_f_score": f_score(real_tuple, pre_tuple),
        }
        for label in LABEL_LIST:
            sub_pre_tuple = [t for t in pre_tuple[:] if t[-1] == label]
            sub_real_tuple = [t for t in real_tuple[:] if t[-1] == label]

            result.update({
                f"{label}_precision":
                precision_score(sub_real_tuple, sub_pre_tuple),
                f"{label}_recall":
                recall_score(sub_real_tuple, sub_pre_tuple),
                f"{label}_f_score":
                f_score(sub_real_tuple, sub_pre_tuple),
            })
        metrics_report = f"\n{'Tag':20s}\t{'Precision':9s}\t{'Recall':9s}\t{'F-Score':9s}\t\n"
        for label in LABEL_LIST:
            pkey = f"{label}_precision"
            rkey = f"{label}_recall"
            fkey = f"{label}_f_score"
            p, r, f = result[pkey], result[rkey], result[fkey]
            metrics_report += f"{label:20s}\t{p:9.7f}\t{r:9.7f}\t{f:9.7f}\t\n"
        metrics_report += "<BLANKLINE>\n"
        pkey = f"all_precision"
        rkey = f"all_recall"
        fkey = f"all_f_score"
        p, r, f = result[pkey], result[rkey], result[fkey]
        label = "ALL"
        metrics_report += f"{label:20s}\t{p:9.7f}\t{r:9.7f}\t{f:9.7f}\t\n"
        logger.info("--------metricd report---------")
        logger.info(metrics_report)

        return result

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    select_model(training_args.output_dir, k_fold=10, select=True, rm=False)

    if training_args.do_eval or training_args.do_predict or training_args.do_predict_dev:
        model = AutoModelForTokenClassification.from_pretrained(
            training_args.output_dir,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            training_args=training_args)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )
        logger.warning(f"load best mode from {training_args.output_dir}")

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    # logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.test_data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
            training_args=training_args,
            remove_label=Remove_Label)

        # logger.warning(list(zip(tokens_list[3], test_dataset.features[3].input_ids)))
        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids, test_dataset)

        # Data alignment
        words_list, preds_list = post_align_predictions(
            test_dataset, preds_list, tokenizer)

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.test_data_dir, "test.txt"),
                          "r") as f:
                    token_classification_task.write_predictions_to_file(
                        writer, f, preds_list, words_list)

    # Predict dev
    if training_args.do_predict_dev:

        # logger.warning(list(zip(tokens_list[3], test_dataset.features[3].input_ids)))
        predictions, label_ids, metrics = trainer.predict(
            eval_dataset, description="Evaluation")
        preds_list, _ = align_predictions(predictions, label_ids, eval_dataset)

        # Data alignment
        words_list, preds_list = post_align_predictions(
            eval_dataset, preds_list, tokenizer)

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "dev_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "dev_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "dev.txt"),
                          "r") as f:
                    token_classification_task.write_predictions_to_file(
                        writer, f, preds_list, words_list)

    return results
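
Unlike Code Example #1, Code Example #2 defines its own set-based precision / recall / F-score over entity tuples (precision_score, recall_score, f_score above). The snippet below is a minimal, self-contained illustration of that intersection-over-sets logic; the tuple layout and values are made up, and in the real code the tuples come from conver_entity_list_to_tuple with the label in the last position.

# Toy entity tuples (here: document id, start, end, label); values are illustrative.
real_tuple = [("doc1", 0, 3, "PER"), ("doc1", 7, 9, "LOC"), ("doc2", 2, 5, "ORG")]
pre_tuple = [("doc1", 0, 3, "PER"), ("doc1", 10, 12, "ORG")]

true_entities, pred_entities = set(real_tuple), set(pre_tuple)
nb_correct = len(true_entities & pred_entities)       # 1 exact match
precision = nb_correct / len(pred_entities)           # 1 / 2 = 0.5
recall = nb_correct / len(true_entities)              # 1 / 3 ~= 0.333
f = 2 * precision * recall / (precision + recall)     # 0.4
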