Example #1
from transformers import (
    BertForSequenceClassification,
    BertForTokenClassification,
    ElectraForSequenceClassification,
    ElectraForTokenClassification,
)


def get_text_reader(reader_name, task, num_labels):
    # The AILAW corpus is a Korean dataset, so the reader is restricted to
    # Korean-capable models such as multilingual BERT, KoBERT, and KoELECTRA.

    if reader_name == "bert":
        if task == "classification":
            model_name = "bert-base-multilingual-cased"
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "bert-base-multilingual-cased"
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    elif reader_name == "kobert":
        if task == "classification":
            model_name = "monologg/kobert"
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "monologg/kobert"
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    elif reader_name == "koelectra":
        if task == "classification":
            model_name = "monologg/koelectra-base-discriminator"
            text_reader = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "monologg/koelectra-base-discriminator"
            text_reader = ElectraForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    else:
        raise KeyError(reader_name)

    return text_reader
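
A minimal usage sketch (the reader names and label counts below are placeholders for illustration, not values taken from the original project):

# Hypothetical calls: a KoELECTRA NER head and a KoBERT sentence classifier.
ner_reader = get_text_reader("koelectra", "ner", num_labels=7)
cls_reader = get_text_reader("kobert", "classification", num_labels=2)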
Example #2
import numpy as np
from transformers import AutoConfig, ElectraForTokenClassification, ElectraTokenizer

# get_model, read_input, iterate and upcase_first_letter are helper functions
# defined elsewhere in the same project.


def punctuate_electra(input_text,
                      download_dir,
                      model_type="ELECTRA",
                      format="inline"):
    """Punctuate the input text with the ELECTRA model. Capitalize sentence beginnings."""
    get_model(model_type, download_dir)

    model_path = f"{download_dir}/{model_type}"
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = ElectraTokenizer.from_pretrained(model_path)
    tokenizer.add_tokens(["<NUM>"])
    pytorch_model = ElectraForTokenClassification.from_pretrained(model_path)
    pytorch_model.resize_token_embeddings(len(tokenizer))

    punctuation_dict = {
        "COMMA": ",",
        "PERIOD": ".",
        "QUESTIONMARK": "?",
        "EXCLAMATIONMARK": "!",
        "COLON": ":",
        "SEMICOLON": ";",
        "DASH": "-",
    }
    eos_punct = [".", "?", "!"]

    labels = config.id2label

    # Read the input and strip non-printable characters
    input_list = read_input(input_text, format).split()

    # Split long lines so they do not exceed the training sequence length
    n = 60
    text_to_punctuate = []
    if len(input_list) > n:
        line_part = [
            " ".join(input_list[x:x + n])
            for x in range(0, len(input_list), n)
        ]
        text_to_punctuate.extend(line_part)
    elif len(input_list) == 0:
        pass
    else:
        text_to_punctuate.append(" ".join(input_list))

    punctuated_text = []
    for t in text_to_punctuate:
        input_ids = tokenizer(t, return_tensors="pt")["input_ids"]
        tokens = tokenizer.tokenize(t)
        predictions = pytorch_model(input_ids)
        pred_ids = np.argmax(
            predictions[0].detach().numpy(),
            axis=2)[0]  # Take the first matrix, since only have batch size 1
        predictions = [labels[pred_ids[i]] for i in range(1, len(pred_ids))]
        line_punctuated = iterate(tokens, predictions, eos_punct,
                                  punctuation_dict)
        punctuated_text.append(line_punctuated)

    return upcase_first_letter(" ".join(punctuated_text))
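
A minimal usage sketch (the download directory and the input string are assumptions for illustration; get_model() controls what is actually downloaded into that directory):

# Hypothetical call on a short unpunctuated transcript.
raw = "hello how are you I am fine thanks"
print(punctuate_electra(raw, download_dir="models", format="inline"))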
Example #3
def create_and_check_electra_for_token_classification(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    fake_token_labels,
):
    config.num_labels = self.num_labels
    model = ElectraForTokenClassification(config=config)
    model.to(torch_device)
    model.eval()
    # Older transformers releases returned a plain (loss, logits) tuple when labels were passed.
    loss, logits = model(
        input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
    )
    result = {
        "loss": loss,
        "logits": logits,
    }
    self.parent.assertListEqual(
        list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
    )
    self.check_loss_output(result)
Example #4
def create_and_check_electra_for_token_classification(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    fake_token_labels,
):
    config.num_labels = self.num_labels
    model = ElectraForTokenClassification(config=config)
    model.to(torch_device)
    model.eval()
    # Newer transformers releases return a model output object with named fields.
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
Example #5
from transformers import ElectraForTokenClassification, TokenClassificationPipeline
from tokenization_kocharelectra import KoCharElectraTokenizer
from pprint import pprint

tokenizer = KoCharElectraTokenizer.from_pretrained(
    "monologg/kocharelectra-base-kmounlp-ner")
model = ElectraForTokenClassification.from_pretrained(
    "monologg/kocharelectra-base-kmounlp-ner")

ner = TokenClassificationPipeline(model=model,
                                  tokenizer=tokenizer,
                                  ignore_labels=["O"],
                                  grouped_entities=True,
                                  device=-1)

pprint(
    ner("문재인 대통령은 28일 서울 코엑스에서 열린 ‘데뷰 (Deview) 2019’ 행사에 참석해 젊은 개발자들을 격려하면서 우리 정부의 인공지능 기본구상을 내놓았다. 출처 : 미디어오늘 (http://www.mediatoday.co.kr)"
        ))
Example #6
import argparse
import os

from transformers import ElectraForTokenClassification, TokenClassificationPipeline
from tokenization_kocharelectra import KoCharElectraTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--filename",  # argument name inferred from args.filename usage below
                    type=str,
                    required=True,
                    default=None,
                    help="Filename of input corpus")
parser.add_argument("--model_name_or_path",
                    type=str,
                    default="monologg/kocharelectra-base-modu-ner-all")
parser.add_argument("--input_dir", default="data", type=str)
parser.add_argument("--output_dir", default="result", type=str)
parser.add_argument("--device",
                    default=-1,
                    type=int,
                    help="Device Num (-1 for cpu)")
args = parser.parse_args()

model = ElectraForTokenClassification.from_pretrained(args.model_name_or_path)
tokenizer = KoCharElectraTokenizer.from_pretrained(args.model_name_or_path)

ner = TokenClassificationPipeline(model=model,
                                  tokenizer=tokenizer,
                                  ignore_labels=["O"],
                                  grouped_entities=True,
                                  device=args.device)

if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)

instance_lst = []

with open(os.path.join(args.input_dir, args.filename), "r",
          encoding="utf-8") as f:
    # Assumed continuation: read one input sentence per line.
    for line in f:
        instance_lst.append(line.strip())
Example #7
from transformers import ElectraForTokenClassification, TokenClassificationPipeline
from tokenization_kocharelectra import KoCharElectraTokenizer
from pprint import pprint

tokenizer = KoCharElectraTokenizer.from_pretrained(
    "monologg/kocharelectra-base-modu-ner-all")
model = ElectraForTokenClassification.from_pretrained(
    "monologg/kocharelectra-base-modu-ner-all")

ner = TokenClassificationPipeline(model=model,
                                  tokenizer=tokenizer,
                                  ignore_labels=["O"],
                                  grouped_entities=True,
                                  device=-1)

pprint(
    ner("문재인 대통령은 28일 서울 코엑스에서 열린 ‘데뷰 (Deview) 2019’ 행사에 참석해 젊은 개발자들을 격려하면서 우리 정부의 인공지능 기본구상을 내놓았다. 출처 : 미디어오늘 (http://www.mediatoday.co.kr)"
        ))
Example #8
def main(cli_args):
    args = AttrDict(cli_args)
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.task)

    set_seed(args)

    output_mode = "classification"
    if "nsmc" in args.train_file:
        processor = NSMCProcessor(args)
    elif "kornli" in args.train_file:
        processor = KorNLIProcessor(args)
    elif "paws" in args.train_file:
        processor = PawsProcessor(args)
    elif "korsts" in args.train_file:
        processor = KorSTSProcessor(args)
        output_mode = "regression"
    elif "question-pair" in args.train_file:
        processor = QuestionPairProcessor(args)
    elif "hate-speech" in args.train_file:
        processor = HateSpeechProcessor(args)
    elif "naver-ner" in args.train_file:
        processor = NaverNerProcessor(args)
    else:
        processor = IntentProcessor(args)
    args["output_mode"] = output_mode
    labels = processor.get_labels()

    config = ElectraConfig.from_pretrained(
        args.model_name_or_path,
        num_labels=len(labels),
        id2label={str(i): label for i, label in enumerate(labels)},
        label2id={label: i for i, label in enumerate(labels)},
    )
    if args.mecab:
        tokenizer = KoNLPyBertTokenizer(
            konlpy_wordpiece=KoNLPyWordPieceTokenizer(Mecab(), use_tag=False),
            vocab_file=os.path.join(args.model_name_or_path, "vocab.txt"),
            do_lower_case=args.do_lower_case,
        )
    else:
        tokenizer = ElectraTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case
        )

    if "naver-ner" in args.train_file:
        model = ElectraForTokenClassification.from_pretrained(
            args.model_name_or_path, config=config
        )
    else:
        model = ElectraForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config
        )

    # Re-initialize the top encoder layer if requested
    if args.do_reinit:
        init_layer(model.electra.encoder.layer, top_n_layer=1)
    
    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    model.to(args.device)

    # Load dataset
    if "naver-ner" in args.train_file:
        train_dataset = (
            ner_load_and_cache_examples(args, tokenizer, mode="train")
            if args.train_file
            else None
        )
        dev_dataset = (
            ner_load_and_cache_examples(args, tokenizer, mode="dev")
            if args.dev_file
            else None
        )
        test_dataset = (
            ner_load_and_cache_examples(args, tokenizer, mode="test")
            if args.test_file
            else None
        )
    else:
        train_dataset = (
            seq_cls_load_and_cache_examples(args, tokenizer, mode="train")
            if args.train_file
            else None
        )
        dev_dataset = (
            seq_cls_load_and_cache_examples(args, tokenizer, mode="dev")
            if args.dev_file
            else None
        )
        test_dataset = (
            seq_cls_load_and_cache_examples(args, tokenizer, mode="test")
            if args.test_file
            else None
        )

    if dev_dataset is None:
        # If there is no dev dataset, evaluate on the test set during training
        args.evaluate_test_during_training = True

    if args.do_train:
        global_step, tr_loss = train(
            args, model, labels, train_dataset, dev_dataset, test_dataset
        )
        logger.info(" global_step = {}, average loss = {}".format(global_step, tr_loss))

    if args.do_eval and not args.do_nni:
        results = {}
        checkpoints = list(
            os.path.dirname(c)
            for c in sorted(
                glob.glob(
                    args.output_dir + "/**/" + "pytorch_model.bin", recursive=True
                )
            )
        )
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN
            )  # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN
            )  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1]
            if "naver-ner" in args.train_file:
                model = ElectraForTokenClassification.from_pretrained(checkpoint)
            else:
                model = ElectraForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(
                args,
                model,
                test_dataset,
                mode="test",
                labels=labels,
                global_step=global_step,
            )
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))
Example #9
from transformers import ElectraTokenizer, ElectraForTokenClassification
from ner_pipeline import NerPipeline
from pprint import pprint

tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-small-finetuned-naver-ner")
model = ElectraForTokenClassification.from_pretrained(
    "monologg/koelectra-small-finetuned-naver-ner")

ner = NerPipeline(model=model,
                  tokenizer=tokenizer,
                  ignore_labels=[],
                  ignore_special_tokens=True)

texts = [
    "문재인 대통령은 28일 서울 코엑스에서 열린 ‘데뷰 (Deview) 2019’ 행사에 참석해 젊은 개발자들을 격려하면서 우리 정부의 인공지능 기본구상을 내놓았다. 출처 : 미디어오늘 (http://www.mediatoday.co.kr)",
    "2017년 장점마을 문제가 본격적으로 이슈가 될 무렵 임 의원은 장점마을 민관협의회 위원들과 여러 차례 마을과 금강농산을 찾아갔다.",
    "2009년 7월 FC서울을 떠나 잉글랜드 프리미어리그 볼턴 원더러스로 이적한 이청용은 크리스탈 팰리스와 독일 분데스리가2 VfL 보훔을 거쳐 지난 3월 K리그로 컴백했다. 행선지는 서울이 아닌 울산이었다"
]

pprint(ner(texts))
Example #10
from transformers import ElectraTokenizer, ElectraForTokenClassification
from ner_pipeline import NerPipeline
from pprint import pprint

tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-base-v3-naver-ner")
model = ElectraForTokenClassification.from_pretrained(
    "monologg/koelectra-base-v3-naver-ner")

ner = NerPipeline(model=model,
                  tokenizer=tokenizer,
                  ignore_labels=["O"],
                  ignore_special_tokens=True,
                  device=-1)

texts = [
    "문재인 대통령은 28일 서울 코엑스에서 열린 ‘데뷰 (Deview) 2019’ 행사에 참석해 젊은 개발자들을 격려하면서 우리 정부의 인공지능 기본구상을 내놓았다. 출처 : 미디어오늘 (http://www.mediatoday.co.kr)",
    "2017년 장점마을 문제가 본격적으로 이슈가 될 무렵 임 의원은 장점마을 민관협의회 위원들과 여러 차례 마을과 금강농산을 찾아갔다.",
    "2009년 7월 FC서울을 떠나 잉글랜드 프리미어리그 볼턴 원더러스로 이적한 이청용은 크리스탈 팰리스와 독일 분데스리가2 VfL 보훔을 거쳐 지난 3월 K리그로 컴백했다. 행선지는 서울이 아닌 울산이었다",
]

pprint(ner(texts))

pprint(
    ner("2009년 7월 FC서울을 떠나 잉글랜드 프리미어리그 볼턴 원더러스로 이적한 이청용은 크리스탈 팰리스와 독일 분데스리가2 VfL 보훔을 거쳐 지난 3월 K리그로 컴백했다. 행선지는 서울이 아닌 울산이었다"
        ))