# These examples assume the surrounding project's helpers are importable
# (json_load, MODEL_CLASSES, load_model, predict, _output_bio,
# TransformerNerDataProcessor, transformer_convert_data_to_features,
# format_converter, NEXT_TOKEN, ...); only the standard-library imports
# the snippets rely on are spelled out here.
import os
import traceback
from pathlib import Path


def main(args):
    label2idx = json_load(os.path.join(args.pretrained_model,
                                       "label2idx.json"))
    num_labels = len(label2idx)
    idx2label = {v: k for k, v in label2idx.items()}
    args.label2idx = label2idx
    args.idx2label = idx2label
    # get config, model and tokenizer
    model_config, _, model_tokenizer = MODEL_CLASSES[args.model_type]
    tokenizer = model_tokenizer.from_pretrained(
        args.pretrained_model, do_lower_case=args.do_lower_case)
    args.tokenizer = tokenizer
    config = model_config.from_pretrained(args.pretrained_model,
                                          do_lower_case=args.do_lower_case)
    args.config = config
    args.use_crf = config.use_crf
    model = load_model(args, args.pretrained_model)
    model.to(args.device)

    ner_data_processor = TransformerNerDataProcessor()
    ner_data_processor.set_logger(args.logger)
    ner_data_processor.set_data_dir(args.preprocessed_text_dir)
    if args.data_has_offset_information:
        ner_data_processor.offset_info_available()

    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    for each_file in Path(args.preprocessed_text_dir).glob("*.txt"):
        try:
            test_example = ner_data_processor.get_test_examples(
                file_name=each_file.name)
            test_features = transformer_convert_data_to_features(
                args=args,
                input_examples=test_example,
                label2idx=label2idx,
                tokenizer=tokenizer,
                max_seq_len=args.max_seq_length)
            predictions = predict(args, model, test_features)
            ofn = each_file.stem.split(".")[0] + ".bio.txt"
            args.predict_output_file = os.path.join(args.output_dir, ofn)
            _output_bio(args, test_example, predictions)
        except Exception:
            args.logger.error(
                f"Encountered an error when processing predictions for file: {each_file.name}"
            )
            args.logger.error(traceback.format_exc())

    if args.do_format:
        base_path = Path(args.output_dir)
        output_formatted_dir = base_path.parent / f"{base_path.stem}_formatted_output"
        output_formatted_dir.mkdir(parents=True, exist_ok=True)
        format_converter(text_dir=args.raw_text_dir,
                         input_bio_dir=args.output_dir,
                         output_dir=output_formatted_dir,
                         formatter=args.do_format,
                         do_copy_text=args.do_copy)
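
All three snippets resolve their classes through MODEL_CLASSES, a registry mapping each model_type string to a (config class, model class, tokenizer class) triple. A minimal sketch of its shape, using stock HuggingFace token-classification heads as stand-ins for the project's own NER model classes (which add extras such as CRF support):

from transformers import (
    BertConfig, BertForTokenClassification, BertTokenizer,
    RobertaConfig, RobertaForTokenClassification, RobertaTokenizer,
)

# Sketch only: the real project registers its own model subclasses here,
# not the stock HuggingFace heads used as placeholders below.
MODEL_CLASSES = {
    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
}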
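
A hypothetical invocation of main(), listing the attributes it actually reads off args; the values below are placeholders, and in the real project they come from an argparse CLI:

import logging
from types import SimpleNamespace

args = SimpleNamespace(
    pretrained_model="./ner_model",          # holds label2idx.json, config, weights
    model_type="bert",
    do_lower_case=True,
    preprocessed_text_dir="./preprocessed",  # *.txt files to predict on
    data_has_offset_information=False,
    max_seq_length=512,
    output_dir="./predictions",              # .bio.txt outputs land here
    do_format=0,                             # falsy: skip format_converter
    do_copy=False,
    raw_text_dir="./raw_text",
    device="cpu",
    logger=logging.getLogger("ner"),
)
main(args)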
Example 2
def run_task(args):
    set_seed(args.seed)

    if os.path.exists(args.new_model_dir) and os.listdir(
            args.new_model_dir
    ) and args.do_train and not args.overwrite_model_dir:
        raise ValueError(
            "new model directory {} already exists; use --overwrite_model_dir "
            "to overwrite it, or pick another directory for the new model"
            .format(args.new_model_dir))

    # init data processor
    ner_data_processor = TransformerNerDataProcessor()
    ner_data_processor.set_data_dir(args.data_dir)
    ner_data_processor.set_logger(args.logger)
    if args.data_has_offset_information:
        ner_data_processor.offset_info_available()

    if args.do_train:
        labels, label2idx = ner_data_processor.get_labels(
            default=args.model_type)
    else:
        label2idx = json_load(
            os.path.join(args.new_model_dir, "label2idx.json"))

    num_labels = len(label2idx)
    idx2label = {v: k for k, v in label2idx.items()}
    args.label2idx = label2idx
    args.idx2label = idx2label

    # get config, model and tokenizer
    model_config, model_model, model_tokenizer = MODEL_CLASSES[args.model_type]
    args.logger.info(
        "Training/evaluation parameters: {}".format(vars(args)))

    # training
    if args.do_train:
        tokenizer = model_tokenizer.from_pretrained(
            args.tokenizer_name, do_lower_case=args.do_lower_case)
        tokenizer.add_tokens([NEXT_TOKEN])
        config = model_config.from_pretrained(args.config_name,
                                              num_labels=num_labels)
        tf_ckpts = list(Path(args.pretrained_model).glob("*.ckpt.index"))
        from_tf_flag = bool(tf_ckpts)
        if args.use_crf:
            crf_layer = Transformer_CRF(num_labels=num_labels,
                                        start_label_id=label2idx['CLS'])
            model = model_model.from_pretrained(args.pretrained_model,
                                                from_tf=from_tf_flag,
                                                config=config,
                                                crf=crf_layer)
            model.active_using_crf()
        else:
            model = model_model.from_pretrained(args.pretrained_model,
                                                from_tf=from_tf_flag,
                                                config=config)

        # a control token (NEXT_TOKEN) was added so over-long sentences can be
        # split and re-joined; grow the embedding matrix to cover its new id
        model.resize_token_embeddings(len(tokenizer))
        args.tokenizer = tokenizer
        args.config = model.config
        model.to(args.device)

        train_examples = ner_data_processor.get_train_examples()
        train_features = transformer_convert_data_to_features(
            args,
            input_examples=train_examples,
            label2idx=label2idx,
            tokenizer=tokenizer,
            max_seq_len=args.max_seq_length)

        dev_examples = ner_data_processor.get_dev_examples()
        dev_features = transformer_convert_data_to_features(
            args,
            input_examples=dev_examples,
            label2idx=label2idx,
            tokenizer=tokenizer,
            max_seq_len=args.max_seq_length)

        # set up evaluation metrics
        args.eval_tool = set_up_eval_tool(args)
        # start training
        train(args, model, train_features, dev_features)
        # save config and tokenizer with new model
        args.tokenizer.save_pretrained(args.new_model_dir)
        args.config.save_pretrained(args.new_model_dir)

    # predict on the single test.txt file (to predict many files, use 'run_transformer_batch_prediction')
    if args.do_predict:
        args.config = model_config.from_pretrained(args.new_model_dir,
                                                   num_labels=num_labels)
        args.tokenizer = model_tokenizer.from_pretrained(
            args.new_model_dir, do_lower_case=args.do_lower_case)
        model = load_model(args)
        model.to(args.device)

        test_example = ner_data_processor.get_test_examples()
        test_features = transformer_convert_data_to_features(
            args,
            input_examples=test_example,
            label2idx=label2idx,
            tokenizer=args.tokenizer,
            max_seq_len=args.max_seq_length)

        predictions = predict(args, model, test_features)
        _output_bio(args, test_example, predictions)
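
The use_crf branch above attaches the project's Transformer_CRF on top of the encoder so label transitions are scored jointly and decoded with Viterbi. For intuition, roughly equivalent wiring with the third-party pytorch-crf package (torchcrf.CRF is a stand-in here, not the project's class):

import torch
from torchcrf import CRF  # pip install pytorch-crf

batch, seq_len, num_labels = 2, 16, 9
emissions = torch.randn(batch, seq_len, num_labels)  # per-token label logits
tags = torch.randint(num_labels, (batch, seq_len))   # gold label ids
mask = torch.ones(batch, seq_len, dtype=torch.bool)  # 1 = real token

crf = CRF(num_labels, batch_first=True)
loss = -crf(emissions, tags, mask=mask)        # negative log-likelihood to minimize
best_paths = crf.decode(emissions, mask=mask)  # Viterbi decode: list of tag-id lists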
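
The add_tokens / resize_token_embeddings pair is the standard recipe for introducing a new control token: register the string with the tokenizer, then grow the model's embedding matrix to cover the new id. The pattern in isolation, with a placeholder sentinel (the real NEXT_TOKEN constant is defined in the project):

from transformers import BertForTokenClassification, BertTokenizer

NEXT_TOKEN = "[NEXT]"  # assumption: placeholder for the project's constant

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
n_added = tokenizer.add_tokens([NEXT_TOKEN])  # returns how many tokens were new

model = BertForTokenClassification.from_pretrained("bert-base-uncased",
                                                   num_labels=9)
# Grow the input embedding matrix by n_added rows so the new token id resolves.
model.resize_token_embeddings(len(tokenizer))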
Example 3
def run_task(args):
    set_seed(args.seed)

    if os.path.exists(args.new_model_dir) and os.listdir(
            args.new_model_dir
    ) and args.do_train and not args.overwrite_model_dir:
        raise ValueError("""new model directory: {} exists. 
            Use --overwrite_model_dir to overwrite the previous model. 
            Or create another directory for the new model""".format(
            args.new_model_dir))

    # init data processor
    ner_data_processor = TransformerNerDataProcessor()
    ner_data_processor.set_data_dir(args.data_dir)
    ner_data_processor.set_logger(args.logger)
    if args.data_has_offset_information:
        ner_data_processor.offset_info_available()

    if args.do_train:
        labels, label2idx = ner_data_processor.get_labels(
            default=args.model_type)
    else:
        label2idx = json_load(
            os.path.join(args.new_model_dir, "label2idx.json"))

    num_labels = len(label2idx)
    idx2label = {v: k for k, v in label2idx.items()}
    args.num_labels = num_labels
    args.label2idx = label2idx
    args.idx2label = idx2label

    # get config, model and tokenizer
    model_config, model_model, model_tokenizer = MODEL_CLASSES[args.model_type]
    args.logger.info(
        "Training/evaluation parameters: {}".format(vars(args)))

    # training
    if args.do_train:
        if args.model_type in {"roberta", "bart", "longformer", "deberta"}:
            # BPE-based tokenizers (RoBERTa, Longformer, BART, DeBERTa) need
            # add_prefix_space=True to tokenize pre-split words correctly
            tokenizer = model_tokenizer.from_pretrained(
                args.tokenizer_name,
                do_lower_case=args.do_lower_case,
                add_prefix_space=True)
        else:
            tokenizer = model_tokenizer.from_pretrained(
                args.tokenizer_name, do_lower_case=args.do_lower_case)
        tokenizer.add_tokens([NEXT_TOKEN])
        config = model_config.from_pretrained(args.config_name,
                                              num_labels=num_labels)
        config.use_crf = args.use_crf
        config.label2idx = args.label2idx
        args.logger.info("New Model Config:\n{}".format(config))

        if args.pretrained_model == "microsoft/deberta-xlarge-v2":
            raise NotImplementedError(
                "the deberta-xlarge-v2 tokenizer differs from the other "
                "deberta tokenizers, so deberta-xlarge-v2 is not supported; "
                "try microsoft/deberta-base, microsoft/deberta-large, or "
                "microsoft/deberta-xlarge instead")

        model = model_model.from_pretrained(args.pretrained_model,
                                            config=config)

        # a control token (NEXT_TOKEN) was added so over-long sentences can be
        # split and re-joined; grow the embedding matrix to cover its new id
        model.resize_token_embeddings(len(tokenizer))
        config.vocab_size = len(tokenizer)
        args.tokenizer = tokenizer
        args.config = model.config
        model.to(args.device)

        train_examples = ner_data_processor.get_train_examples()
        train_features = transformer_convert_data_to_features(
            args,
            input_examples=train_examples,
            label2idx=label2idx,
            tokenizer=tokenizer,
            max_seq_len=args.max_seq_length)

        dev_examples = ner_data_processor.get_dev_examples()
        dev_features = transformer_convert_data_to_features(
            args,
            input_examples=dev_examples,
            label2idx=label2idx,
            tokenizer=tokenizer,
            max_seq_len=args.max_seq_length)

        # set up evaluation metrics
        args.eval_tool = set_up_eval_tool(args)
        # start training
        train(args, model, train_features, dev_features)
        # save config and tokenizer with new model
        args.tokenizer.save_pretrained(args.new_model_dir)
        args.config.save_pretrained(args.new_model_dir)

    # predict on the single test.txt file (to predict many files, use 'run_transformer_batch_prediction')
    if args.do_predict:
        args.config = model_config.from_pretrained(args.new_model_dir,
                                                   num_labels=num_labels)
        args.use_crf = args.config.use_crf
        if args.model_type in {"roberta", "bart", "longformer", "deberta"}:
            # BPE-based tokenizers need add_prefix_space=True at prediction
            # time as well; load the tokenizer saved alongside the fine-tuned
            # model and keep it on args so the feature converter can see it
            args.tokenizer = model_tokenizer.from_pretrained(
                args.new_model_dir,
                do_lower_case=args.do_lower_case,
                add_prefix_space=True)
        else:
            args.tokenizer = model_tokenizer.from_pretrained(
                args.new_model_dir, do_lower_case=args.do_lower_case)
        model = load_model(args)
        model.to(args.device)

        test_example = ner_data_processor.get_test_examples()
        test_features = transformer_convert_data_to_features(
            args,
            input_examples=test_example,
            label2idx=label2idx,
            tokenizer=args.tokenizer,
            max_seq_len=args.max_seq_length)

        predictions = predict(args, model, test_features)
        _output_bio(args, test_example, predictions)
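
Why the add_prefix_space=True branches exist: GPT-2-style BPE tokenizers encode a word differently depending on whether a space precedes it, and NER feeds pre-split words one at a time, so the prefix space must be requested explicitly to reproduce in-sentence tokenization. A quick illustration (exact token strings may vary by vocabulary version):

from transformers import RobertaTokenizer

plain = RobertaTokenizer.from_pretrained("roberta-base")
prefixed = RobertaTokenizer.from_pretrained("roberta-base",
                                            add_prefix_space=True)

# The prefixed form maps to the word-initial ("Ġ"-marked) vocabulary entry,
# matching how the word is tokenized inside a running sentence.
print(plain.tokenize("patient"))     # e.g. ['patient']
print(prefixed.tokenize("patient"))  # e.g. ['Ġpatient']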