Example #1
def main(args):
    # Create <bio_dir parent>/<bio_dir stem>_formatted_output next to the input
    # BIO directory if it does not exist yet.
    base_path = Path(args.bio_dir)
    output_formatted_dir = base_path.parent / f"{base_path.stem}_formatted_output"
    output_formatted_dir.mkdir(parents=True, exist_ok=True)
    # Convert the BIO-tagged predictions into the requested output format,
    # optionally copying the raw text files alongside the converted output.
    format_converter(text_dir=args.raw_text_dir,
                     input_bio_dir=args.bio_dir,
                     output_dir=output_formatted_dir,
                     formatter=args.do_format,
                     do_copy_text=args.do_copy)
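
This main takes an argparse-style args namespace rather than parsing flags itself. As a rough illustration only, it could be driven by a wrapper like the sketch below; the flag names are inferred from the attributes the function reads (bio_dir, raw_text_dir, do_format, do_copy) and are assumptions, not the project's documented CLI.

import argparse

# Hypothetical wrapper: the flag names mirror the attributes main() reads and
# are assumptions for illustration, not the project's actual interface.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert BIO prediction files to a formatted output")
    parser.add_argument("--bio_dir", required=True,
                        help="directory containing the .bio.txt prediction files")
    parser.add_argument("--raw_text_dir", required=True,
                        help="directory containing the original raw text files")
    parser.add_argument("--do_format", type=int, default=1,
                        help="formatter selector passed through to format_converter")
    parser.add_argument("--do_copy", action="store_true",
                        help="copy the raw text files next to the converted output")
    main(parser.parse_args())
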
Example #2

def main(args):
    # Load the label-to-index mapping saved alongside the pretrained model and
    # build the reverse mapping used to decode predictions back into labels.
    label2idx = json_load(os.path.join(args.pretrained_model,
                                       "label2idx.json"))
    num_labels = len(label2idx)
    idx2label = {v: k for k, v in label2idx.items()}
    args.label2idx = label2idx
    args.idx2label = idx2label
    # get config, model and tokenizer
    model_config, _, model_tokenizer = MODEL_CLASSES[args.model_type]
    tokenizer = model_tokenizer.from_pretrained(
        args.pretrained_model, do_lower_case=args.do_lower_case)
    args.tokenizer = tokenizer
    config = model_config.from_pretrained(args.pretrained_model,
                                          do_lower_case=args.do_lower_case)
    args.config = config
    # whether the checkpoint was trained with a CRF layer on top of the encoder
    args.use_crf = config.use_crf
    model = load_model(args, args.pretrained_model)
    model.to(args.device)

    # Set up the NER data processor that reads the preprocessed text files.
    ner_data_processor = TransformerNerDataProcessor()
    ner_data_processor.set_logger(args.logger)
    ner_data_processor.set_data_dir(args.preprocessed_text_dir)
    if args.data_has_offset_information:
        ner_data_processor.offset_info_available()

    # fids = [each.stem.split(".")[0] for each in Path(args.preprocessed_text_dir).glob("*.txt")]
    # Predict file by file so a failure on one document does not abort the batch.
    for each_file in Path(args.preprocessed_text_dir).glob("*.txt"):
        try:
            test_example = ner_data_processor.get_test_examples(
                file_name=each_file.name)
            test_features = transformer_convert_data_to_features(
                args=args,
                input_examples=test_example,
                label2idx=label2idx,
                tokenizer=tokenizer,
                max_seq_len=args.max_seq_length)
            predictions = predict(args, model, test_features)
            # Write this document's predictions to <output_dir>/<doc_id>.bio.txt.
            Path(args.output_dir).mkdir(parents=True, exist_ok=True)
            ofn = each_file.stem.split(".")[0] + ".bio.txt"
            args.predict_output_file = os.path.join(args.output_dir, ofn)
            _output_bio(args, test_example, predictions)
        except Exception:
            args.logger.error(
                f"Encountered an error when processing predictions for file: {each_file.name}"
            )
            args.logger.error(traceback.format_exc())

    if args.do_format:
        # Optionally convert the BIO output files into the requested final format,
        # writing to a sibling "<output_dir stem>_formatted_output" directory.
        base_path = Path(args.output_dir)
        output_formatted_dir = base_path.parent / f"{base_path.stem}_formatted_output"
        output_formatted_dir.mkdir(parents=True, exist_ok=True)
        format_converter(text_dir=args.raw_text_dir,
                         input_bio_dir=args.output_dir,
                         output_dir=output_formatted_dir,
                         formatter=args.do_format,
                         do_copy_text=args.do_copy)
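
This main also assumes the caller has already populated fields such as args.device, args.logger, args.model_type, args.output_dir, and args.max_seq_length before it runs. A minimal sketch of the device and logger setup, assuming PyTorch and Python's standard logging (the project may use its own logger class instead), could look like:

import logging
import torch

# Hypothetical helper that fills in the runtime fields main() expects on `args`.
# The real entry script wires these up itself; this is only an illustrative sketch.
def prepare_runtime(args):
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.basicConfig(level=logging.INFO)
    args.logger = logging.getLogger("transformer_ner_batch_prediction")
    return args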