def main(parser):

    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    tokenizer = Tokenizer(vocab=vocab,
                          split_fn=mecab_token_pos_flat_fn,
                          pad_fn=keras_pad_fn,
                          maxlen=model_config.maxlen)
    model_config.vocab_size = len(vocab.token2idx)

    # Model
    model = Transformer(config=model_config, vocab=vocab)
    checkpoint_manager = CheckpointManager(model_dir)  # experiments/base_model
    checkpoint = checkpoint_manager.load_checkpoint('best.tar')
    model.load_state_dict(checkpoint['model_state_dict'])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    while True:
        input_text = input("문장을 입력하세요: ")
        enc_input = torch.tensor(
            tokenizer.list_of_string_to_arr_of_pad_token_ids([input_text]))
        dec_input = torch.tensor([[vocab.token2idx[vocab.START_TOKEN]]])

        for i in range(model_config.maxlen):
            y_pred = model(enc_input.to(device), dec_input.to(device))
            y_pred_ids = y_pred.max(dim=-1)[1]
            if y_pred_ids[0, -1].item() == vocab.token2idx[vocab.END_TOKEN]:
                decoding_from_result(enc_input=enc_input,
                                     y_pred=y_pred,
                                     tokenizer=tokenizer)
                break

            # decoding_from_result(enc_input, y_pred, tokenizer)
            dec_input = torch.cat(
                [dec_input.cpu(), y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).cpu()],
                dim=-1)

            if i == model_config.maxlen - 1:
                decoding_from_result(enc_input=enc_input,
                                     y_pred=y_pred,
                                     tokenizer=tokenizer)
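
The loop above is plain greedy decoding: run the model on the encoder input plus every token decoded so far, take the argmax at the last position, append it, and stop at END_TOKEN or at maxlen. A minimal standalone sketch of the same pattern (model, bos_id, and eos_id are placeholder names, not from the repo):

import torch

@torch.no_grad()
def greedy_decode(model, enc_input, bos_id, eos_id, maxlen):
    # enc_input: (1, src_len) LongTensor; model(enc, dec) returns (1, tgt_len, vocab_size) logits
    dec_input = torch.tensor([[bos_id]])
    for _ in range(maxlen):
        logits = model(enc_input, dec_input)
        next_id = logits[0, -1].argmax().item()  # greedy: most probable next token
        if next_id == eos_id:
            break
        dec_input = torch.cat([dec_input, torch.tensor([[next_id]])], dim=-1)
    return dec_input[0, 1:]  # decoded ids without the leading BOS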
Example #2
def main():
    cur_path = os.path.dirname(sys.argv[0])
    if cur_path:
        os.chdir(cur_path)

    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')

    # load vocab & tokenizer
    tok_path = "./ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load
    model_dict = model.state_dict()
    checkpoint = torch.load("./model.bin", map_location=torch.device('cpu'))
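    # checkpoints saved from nn.DataParallel prefix every parameter key with "module.";
    # strip the prefix so the keys match this single-module model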
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name), file=sys.stderr)
            continue
        convert_keys[new_key_name] = v

    # strict=False tolerates any keys that are still missing or unexpected
    model.load_state_dict(convert_keys, strict=False)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    #model.to(device)
    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)
    
    try:
        while True:
            input_text = input()
        
            list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
            x_input = torch.tensor(list_of_input_ids).long()
            list_of_pred_ids = model(x_input)

            list_of_ner_word = decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
            if list_of_ner_word:
                print(",".join(list_of_ner_word))
            else:
                print("/")
    except EOFError:
        print("EOF", file=sys.stderr)
Example #3
    def __init__(self, config: Config, vocab: Vocabulary, state_dict=None):
        self.seq2seq = TransformerNet(config=config, vocab=vocab)
        self.config = config
        self.vocab = vocab
        self.tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn, pad_fn=keras_pad_fn, maxlen=config.maxlen)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        if state_dict is not None:
            self.seq2seq.load_state_dict(state_dict)
        self.seq2seq.to(self.device)
        self.learning_rate = config.learning_rate
Example #4
def post():
    value = request.form['input']
    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')
    # load vocab & tokenizer
    tok_path = "ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab,
                          split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn,
                          maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRFViz(config=model_config,
                         num_classes=len(ner_to_index),
                         vocab=vocab)

    # load
    model_dict = model.state_dict()
    checkpoint = torch.load(
        "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
        map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    decoder_from_res = DecoderFromNamedEntitySequence(
        tokenizer=tokenizer, index_to_ner=index_to_ner)
    input_text = value
    list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids(
        [input_text])
    x_input = torch.tensor(list_of_input_ids).long()
    list_of_pred_ids, _ = model(x_input)
    list_of_ner_word, decoding_ner_sentence = decoder_from_res(
        list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
    return {'word': list_of_ner_word, 'decoding': decoding_ner_sentence}
Example #5
    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir
Example #6
    def get_tokenizer(cls):
        if cls.tokenizer is None:
            tok_path = "./tokenizer_78b3253a26.model"
            ptr_tokenizer = SentencepieceTokenizer(tok_path)

            cls.tokenizer = Tokenizer(vocab=cls.vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)  # NB: model_config is assumed to be available from the enclosing scope
        return cls.tokenizer
Example #7
def load_generator(args):
    # load the pretrained generator
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    checkpoint_manager = CheckpointManager(model_dir) # experiments/base_model
    checkpoint = checkpoint_manager.load_checkpoint('best.tar')

    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    model_config.vocab_size = len(vocab.token2idx)

    tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    # loss_fn = nn.CrossEntropyLoss(ignore_index=vocab.PAD_ID)
    return Generator(model_config, vocab, checkpoint['model_state_dict']), tokenizer, vocab.PAD_ID, checkpoint_manager
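# A hedged usage sketch (attribute names assumed from the parser wiring above):
#   generator, tokenizer, pad_id, ckpt_mgr = load_generator(args)
#   reply = generator.gen_output("...")  # Generator.gen_output is shown in Example #12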
Example #8
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)

    # data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer() # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx

    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token2idx=token_to_idx)  # Vocabulary takes token2idx (cf. Examples #1 and #7)

    print("len(token_to_idx): ", len(token_to_idx))
    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)

    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer, maxlen=model_config.maxlen, model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir)
    tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn, transform_target_fn=ner_formatter.transform_target_fn)
    tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=False)

    # Model
    model = KobertCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index))
    model.train()

    # optim
    train_examples_len = len(tr_clf_ds)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
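    # keep biases and LayerNorm parameters out of weight decay, the usual BERT fine-tuning convention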
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
    t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate, eps=model_config.adam_epsilon)
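    # WarmupLinearSchedule ramps the lr linearly for warmup_steps, then decays it linearly to 0 over t_total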
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps, t_total=t_total)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    # save
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(tr_clf_ds))
    logger.info("  Num Epochs = %d", model_config.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", model_config.batch_size)
    # logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
    #                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # Added here for reproducibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for _epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(tr_clf_dl, desc="Iteration") # , disable=args.local_rank not in [-1, 0]
        epoch = _epoch
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm)
            tr_loss += loss.item()

            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags)
                    print("sequence_of_tags: ", sequence_of_tags)
                    print("y_real: ", y_real)
                    print("loss: ", loss)
                    print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real))

                    # token-level accuracy over non-PAD positions only
                    mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                # if step % 50 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(epoch + 1, global_step,
                                                                                             tr_summary['loss'],
                                                                                             tr_summary['acc']))

                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    # Log metrics
                    if model_config.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        pass
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps, global_step)
                    logger.info("Average loss: %s at global step: %s",
                                str((tr_loss - logging_loss) / model_config.logging_steps), str(global_step))
                    logging_loss = tr_loss

                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

                    state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
                    summary = {'train': tr_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')

                    is_best = tr_acc >= best_train_acc  # selection by train acc (ideally this should use val_acc, not train_acc)
                    # Save
                    if is_best:
                        best_train_acc = tr_acc
                        checkpoint_manager.save_checkpoint(state,
                                                           'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1,
                                                                                                         global_step,
                                                                                                         tr_acc))
                    else:
                        torch.save(state, os.path.join(output_dir,
                                                       'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1,
                                                                                                      global_step,
                                                                                                      tr_acc)))

    tb_writer.close()
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

    return global_step, tr_loss / global_step, best_steps
Example #9
class NamedEntityRecognitionFormatter():
    """ NER formatter class """
    def __init__(self,
                 vocab=None,
                 tokenizer=None,
                 maxlen=30,
                 model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab,
                                       split_fn=self.ptr_tokenizer,
                                       pad_fn=keras_pad_fn,
                                       maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir

    def transform_source_fn(self, text):
        # text = "첫 회를 시작으로 13일까지 4일간 총 4회에 걸쳐 매 회 2편씩 총 8편이 공개될 예정이다."
        # label_text = "첫 회를 시작으로 <13일:DAT>까지 <4일간:DUR> 총 <4회:NOH>에 걸쳐 매 회 <2편:NOH>씩 총 <8편:NOH>이 공개될 예정이다."
        # text = "트래버 모리슨 학장은 로스쿨 학생과 교직원이 바라라 전 검사의 사법정의에 대한 깊이 있는 지식과 경험으로부터 많은 것을 배울 수 있을 것이라고 말했다."
        # label_text = "<트래버 모리슨:PER> 학장은 로스쿨 학생과 교직원이 <바라라:PER> 전 검사의 사법정의에 대한 깊이 있는 지식과 경험으로부터 많은 것을 배울 수 있을 것이라고 말했다."
        tokens = self.tokenizer.split(text)
        token_ids_with_cls_sep = self.tokenizer.list_of_string_to_arr_of_cls_sep_pad_token_ids(
            [text])

        # record each token's start offset so entity labels can be aligned to the token sequence
        prefix_sum_of_token_start_index = []
        offset = 0  # cumulative character offset (avoids shadowing the builtin sum)
        for i, token in enumerate(tokens):
            if i == 0:
                prefix_sum_of_token_start_index.append(0)
                offset += len(token) - 1
            else:
                prefix_sum_of_token_start_index.append(offset)
                offset += len(token)
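        # e.g. for tokens ['▁첫', '▁회를', '▁시작으로'] this yields [0, 1, 4]: each entry is the
        # character offset of the token's leading '▁' (the space) in the original text, and
        # transform_target_fn later shifts it by +1 to land on the first real character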
        return token_ids_with_cls_sep, tokens, prefix_sum_of_token_start_index

    def transform_target_fn(self, label_text, tokens,
                            prefix_sum_of_token_start_index):
        regex_ner = re.compile('<(.+?):[A-Z]{3}>')
        regex_filter_res = regex_ner.finditer(label_text)

        list_of_ner_tag = []
        list_of_ner_text = []
        list_of_tuple_ner_start_end = []

        count_of_match = 0
        for match_item in regex_filter_res:
            ner_tag = match_item[0][-4:-1]  # <4일간:DUR> -> DUR
            ner_text = match_item[1]  # <4일간:DUR> -> 4일간
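            # every earlier match contributed 6 markup characters ('<', ':', the 3-letter tag, '>')
            # that are absent from the plain text, so offsets shift left by 6 per match;
            # the end index additionally drops this match's own 6 markup characters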
            start_index = match_item.start() - 6 * count_of_match
            end_index = match_item.end() - 6 - 6 * count_of_match

            list_of_ner_tag.append(ner_tag)
            list_of_ner_text.append(ner_text)
            list_of_tuple_ner_start_end.append((start_index, end_index))
            count_of_match += 1

        list_of_ner_label = []
        entity_index = 0
        is_entity_still_B = True
        for tup in zip(tokens, prefix_sum_of_token_start_index):
            token, index = tup

            if '▁' in token:  # careful: '▁' (the sentencepiece space marker) is not the ordinary underscore '_'
                index += 1  # if the token carries the leading space marker, shift the index forward by one, e.g. ('▁13', 9) -> ('13', 10)

            if entity_index < len(list_of_tuple_ner_start_end):
                start, end = list_of_tuple_ner_start_end[entity_index]

                if end < index:  # the current position is already past this entity's span, so advance to the next entity
                    is_entity_still_B = True
                    if entity_index + 1 < len(list_of_tuple_ner_start_end):
                        entity_index += 1
                    start, end = list_of_tuple_ner_start_end[entity_index]

                if start <= index < end:  # e.g. <13일:DAT>까지 -> ('▁13', 10, 'B-DAT'), ('일까지', 12, 'I-DAT'); to exclude such spillover tokens, the token length would also have to be checked
                    entity_tag = list_of_ner_tag[entity_index]
                    if is_entity_still_B is True:
                        entity_tag = 'B-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                        is_entity_still_B = False
                    else:
                        entity_tag = 'I-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                else:
                    is_entity_still_B = True
                    entity_tag = 'O'
                    list_of_ner_label.append(entity_tag)

            else:
                entity_tag = 'O'
                list_of_ner_label.append(entity_tag)

            # print((token, index, entity_tag), end=' ')

        with open(self.model_dir / "ner_to_index.json", 'rb') as f:
            self.ner_to_index = json.load(f)
        # ner_str -> ner_ids -> cls + ner_ids + sep -> cls + ner_ids + sep + pad + pad .. + pad
        list_of_ner_ids = [self.ner_to_index['[CLS]']] + [
            self.ner_to_index[ner_tag] for ner_tag in list_of_ner_label
        ] + [self.ner_to_index['[SEP]']]
        list_of_ner_ids = self.tokenizer._pad([list_of_ner_ids],
                                              pad_id=self.vocab.PAD_ID,
                                              maxlen=self.maxlen)[0]

        return list_of_ner_ids, list_of_ner_label
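
For the sample label_text in the comments above, this yields 'B-DAT' for the token covering '13' and 'I-DAT' for the following '일까지' token, with 'O' everywhere outside the tagged spans.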
Example #10
def transformation():  # Do an inference on a single batch of data
    data = None

    # 1) INPUT: convert Korean text input to NER code array
    if flask.request.content_type == 'text/plain':

        '''CHECK file locations'''
        model_config = Config(json_path="config.json")
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)

        with open("vocab.pkl", 'rb') as f:
            vocab = pickle.load(f)

        tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

        with open("ner_to_index.json", 'rb') as f:
            ner_to_index = json.load(f)
            index_to_ner = {v: k for k, v in ner_to_index.items()}

        decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)

        f = flask.request.get_data()
        # ftype = str(type(f))
        string_f = f.decode("utf-8") 
        lines = string_f.splitlines(True)

        with open("result.txt", 'w', encoding='utf-8-sig') as w:
            # w.write('start\n')
            # w.write(ftype)
            # w.write('\nand\n')
            # w.write(string_f)
            # w.write('\nend\n')
            index = 0
            for i in range(len(lines)):
                input_text = ''
                if i % 4 == 1:
                    input_text = lines[i][3:]
                    addInfo = lines[i+1][3:]
                if input_text == '':
                    continue

                index += 1
                # print("\n## " + str(index) + "\n")

                list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
                x_input = torch.tensor(list_of_input_ids).long()

                w.write('## '+str(index)+'\n')
                w.write(addInfo)
                # w.write('\n'+str(list_of_input_ids))

                predictions = run_inference_for_single_data(list_of_input_ids[0], ModelHandler.get_model())  
                
                # 2) OUTPUT: convert NER code to Korean text (FILE)
                emission = torch.tensor(predictions['output'])
                num_classes = len(ner_to_index)
                crf = CRF(num_tags=num_classes, batch_first=True)  # rearrange tag sequences; NB: this CRF is freshly constructed here, so its transition weights are untrained
                list_of_pred_ids = crf.decode(emission)

                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids, unkTokenList=False)
                unkTokenList = makeUNKTokenList(input_text, input_token)
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids, unkTokenList=unkTokenList)
                
                w.write(str(list_of_ner_word) + '\n')
                w.write(str(decoding_ner_sentence[6:-5]) + '\n')

        return flask.Response(response=open("result.txt", 'r', encoding='utf-8-sig'), status=200, mimetype='text/plain')
    else:
        return flask.Response(response='This predictor only supports TEXT data', status=415, mimetype='text/plain')
Example #11
def main(parser):

    args = parser.parse_args()
    model_dir = Path(args.model_dir)
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    # tok_path = get_tokenizer() # ./tokenizer_78b3253a26.model
    tok_path = "./tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    tokenizer = Tokenizer(vocab=vocab,
                          split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn,
                          maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # Model
    # model = KobertSequenceFeatureExtractor(config=model_config, num_classes=len(ner_to_index))
    model = KobertCRF(config=model_config,
                      num_classes=len(ner_to_index),
                      vocab=vocab)
    # model = KobertBiLSTMCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)
    # model = KobertBiGRUCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load
    model_dict = model.state_dict()
    # checkpoint = torch.load("./experiments/base_model/best-epoch-9-step-600-acc-0.845.bin", map_location=torch.device('cpu'))
    checkpoint = torch.load(
        "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
        map_location=torch.device('cpu'))
    # checkpoint = torch.load("./experiments/base_model_with_bilstm_crf/best-epoch-15-step-2750-acc-0.992.bin", map_location=torch.device('cpu'))
    # checkpoint = torch.load("./experiments/base_model_with_bigru_crf/model-epoch-18-step-3250-acc-0.997.bin", map_location=torch.device('cpu'))

    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    decoder_from_res = DecoderFromNamedEntitySequence(
        tokenizer=tokenizer, index_to_ner=index_to_ner)

    while True:
        input_text = input("문장을 입력하세요: ")
        list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids(
            [input_text])
        x_input = torch.tensor(list_of_input_ids).long()

        ## for bert alone
        # y_pred = model(x_input)
        # list_of_pred_ids = y_pred.max(dim=-1)[1].tolist()

        ## for bert crf
        list_of_pred_ids = model(x_input)

        ## for bert bilstm crf & bert bigru crf
        # list_of_pred_ids = model(x_input, using_pack_sequence=False)

        list_of_ner_word, decoding_ner_sentence = decoder_from_res(
            list_of_input_ids=list_of_input_ids,
            list_of_pred_ids=list_of_pred_ids)
        print("list_of_ner_word:", list_of_ner_word)
        print("decoding_ner_sentence:", decoding_ner_sentence)
Example #12
class Generator():
    def __init__(self, config: Config, vocab: Vocabulary, state_dict = None):
        self.seq2seq = TransformerNet(config=config, vocab=vocab)
        self.config = config
        self.vocab = vocab
        self.tokenizer = Tokenizer(vocab=vocab, split_fn=mecab_token_pos_flat_fn, pad_fn=keras_pad_fn, maxlen=config.maxlen)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        if state_dict is not None:
            self.seq2seq.load_state_dict(state_dict)
        self.seq2seq.to(self.device)
        self.learning_rate = config.learning_rate

    def switch_mode(self, mode='eval'):
        if mode=='eval': self.seq2seq.eval()
        else: self.seq2seq.train()
        
    def parameters(self):
        return self.seq2seq.parameters()

    def get_state_dict(self):
        state_dict = self.seq2seq.to(torch.device('cpu')).state_dict()
        self.seq2seq.to(self.device)
        return state_dict

    def eval(self): self.seq2seq.eval()
    def train(self): self.seq2seq.train()

    def gen_output_with_ids(self, input_ids):
        # dec_input = torch.tensor([[self.vocab.token2idx[self.vocab.START_TOKEN]]])
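        # batched variant of the commented line above: one START_TOKEN per row, shape (batch_size, 1)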
        dec_input = torch.full((input_ids.shape[0], 1), self.vocab.token2idx[self.vocab.START_TOKEN]).long()
        # print(dec_input)
        for i in range(self.config.maxlen):
            # print('input_ids', input_ids)
            # print('dec_input', dec_input)
            # print('device', self.device)
            # y_pred = self.seq2seq(input_ids, dec_input)
            y_pred = self.seq2seq(input_ids.to(self.device), dec_input.to(self.device))
            y_pred_ids = y_pred.max(dim=-1)[1]
            # if (y_pred_ids[0,-1] == self.vocab.token2idx[self.vocab.END_TOKEN]).to(torch.device('cpu')).numpy():
            #     # pad the remainder with PAD_ID
            #     fill_pad  = torch.full((input_ids.shape[0],self.config.maxlen-dec_input.shape[1]), self.vocab.PAD_ID).long().to(self.device)
            #     # print(fill_pad.shape,y_pred_ids.shape)
            #     y_pred_ids = torch.cat([y_pred_ids, fill_pad], dim=1)
            #     break
            # fill positions after self.vocab.END_TOKEN with pad_id
            # end_indices = (y_pred_ids==self.vocab.token2idx[self.vocab.END_TOKEN]).nonzero()
            # for val in end_indices:
            #     fill_pad  = torch.full((1,self.config.maxlen-val[1]), self.vocab.PAD_ID).long().to(self.device)
            #     print(y_pred_ids[val[0]])
            #     y_pred_ids[val[0]] = torch.cat([y_pred_ids[val[0],-1], fill_pad], dim=1)
            # print(end_indices)

            # decoding_from_result(enc_input, y_pred, tokenizer)
            # print('y_pred_ids',y_pred_ids.shape)
            # print(dec_input.shape, y_pred_ids.shape,y_pred.shape)
            # dec_input = torch.cat((dec_input.to(torch.device('cpu')), y_pred_ids[:,-1].view(-1,1).to(torch.device('cpu'))), dim=1)
            dec_input = torch.cat((dec_input.to(self.device), y_pred_ids[:,-1].view(-1,1)), dim=1)
            # print(dec_input)

            if i == self.config.maxlen - 1:
                # output_str = decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=self.tokenizer)
                break
        
        # fill positions after self.vocab.END_TOKEN with pad_id
        # end_indices = (y_pred_ids==self.vocab.token2idx[self.vocab.END_TOKEN]).nonzero()
        # 1. find the earliest end token in each row
        # end_tokens = []
        # last_r = -1
        # last_col = -1
        # print(end_indices)
        # for val in end_indices:
        #     if val[0]>last_r:
        #         last_r = val[0]
        #         last_col = 500
        #     if val[1]<last_col:
        #         last_col = val[1]
        #         end_tokens.append([last_r.cpu().tolist(), last_col.cpu().tolist()])
        #         continue
        # for item in end_tokens:
        #     fill_pad  = torch.full((1, self.config.maxlen-item[1]), self.vocab.PAD_ID).long().to(self.device)
        #     print(y_pred_ids[item[0]][0:item[1]])
        #     y_pred_ids[item[0]] = torch.cat([y_pred_ids[item[0]][0:item[1]], fill_pad], dim=1)
        # print('end_tokens',end_tokens)
        return y_pred_ids, y_pred

    def is_end_token(self, token):
        return bool((self.vocab.token2idx[self.vocab.END_TOKEN] == token).cpu().numpy())

    def gen_output(self, input_text):
        enc_input = torch.tensor(self.tokenizer.list_of_string_to_arr_of_pad_token_ids([input_text]))
        dec_input = torch.tensor([[self.vocab.token2idx[self.vocab.START_TOKEN]]])
        output_str = ''
        for i in range(self.config.maxlen):
            y_pred = self.seq2seq(enc_input.to(self.device), dec_input.to(self.device))
            y_pred_ids = y_pred.max(dim=-1)[1]
            if y_pred_ids[0, -1].item() == self.vocab.token2idx[self.vocab.END_TOKEN]:
                output_str = decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=self.tokenizer)
                break

            # decoding_from_result(enc_input, y_pred, tokenizer)
            dec_input = torch.cat([dec_input.cpu(), y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).cpu()], dim=-1)

            if i == self.config.maxlen - 1:
                output_str = decoding_from_result(enc_input=enc_input, y_pred=y_pred, tokenizer=self.tokenizer)
                break
        
        output_str = output_str.replace('\n', '').replace('\r','')
        return output_str

    def sample(self, dataset):
        # build a response dataset from the input dataset
        data_enc_input = []
        data_dec_input = []
        data_dec_output = []
        question = []
        answer = []
        labels = []
        self.seq2seq.eval()
        # preds = []
        for item in tqdm(dataset,desc='sampling'):
            enc_input, dec_input, dec_output = item
            pred_ids, _ = self.gen_output_with_ids(enc_input)
            # print(pred.shape)
            output_str = decoding_to_str(pred_ids, self.tokenizer)
            input_str = decoding_to_str(enc_input, self.tokenizer)
            # print(input_str)
            # print('---------------')
            # print(output_str)
            # discriminator_inputs = []
            # for r in range(len(input_str)):
            #     question += input_str[r]
            #     answer += output_str[r]
            data_enc_input += enc_input
            data_dec_input += dec_input
            data_dec_output += dec_output
            question += input_str
            answer += output_str
            labels += [0 for _ in range(len(output_str))]
            # preds += pred
            # data_D_set += discriminator_inputs
            # batch_data.append([enc_input, dec_input, dec_output, discriminator_inputs])
            data_enc_input += enc_input
            data_dec_input += dec_input
            data_dec_output += dec_output
            question += input_str
            answer_str = decoding_to_str(dec_input, self.tokenizer)
            # print('*********************')
            # print(answer_str)
            answer += answer_str
            labels += [1 for _ in range(len(output_str))]
            # break
        # print(batch_data)
        # print(len(labels), len(answer))
        # print(labels, answer)
        df = pd.DataFrame({'enc_input': data_enc_input, 'dec_input': data_dec_input, 'dec_output': data_dec_output, 'question': question, 'answer': answer, 'label': labels})
        return df
            
Example #13
def transformation():  # Do an inference on a single batch of data
    data = None

    # 1) INPUT: convert Korean text input to NER code array
    if flask.request.content_type == 'text/plain':
        '''CHECK file locations'''
        model_config = Config(json_path="config.json")
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)

        with open("vocab.pkl", 'rb') as f:
            vocab = pickle.load(f)

        tokenizer = Tokenizer(vocab=vocab,
                              split_fn=ptr_tokenizer,
                              pad_fn=keras_pad_fn,
                              maxlen=model_config.maxlen)

        with open("ner_to_index.json", 'rb') as f:
            ner_to_index = json.load(f)
            index_to_ner = {v: k for k, v in ner_to_index.items()}

        decoder_from_res = DecoderFromNamedEntitySequence(
            tokenizer=tokenizer, index_to_ner=index_to_ner)
        '''
            Assuming request.data is a string: name of txt file
            > NER_OY_data.txt as an example
            > currently under /opt/program (product-tags)

            HERE:?
        '''
        f = flask.request.data.decode("utf-8")
        lines = f.splitlines(True)
        index = 0

        with open("NER_OY_result.txt", 'w', encoding='utf-8-sig') as w:
            for i in range(len(lines)):
                input_text = ''
                if i % 4 == 1:
                    input_text = lines[i][3:]
                    addInfo = lines[i + 1][3:]
                if input_text == '':
                    continue

                index += 1
                # print("\n## " + str(index) + "\n")

                list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids(
                    [input_text])
                x_input = torch.tensor(list_of_input_ids).long()
                # print(list_of_input_ids)
                # print(x_input)

                data = {"instances": list_of_input_ids}
                predictions = ScoringService.predict(data)

                # 2) OUTPUT: convert NER code to Korean text (FILE)
                emission = torch.tensor(predictions['predictions'])
                num_classes = len(ner_to_index)
                crf = CRF(num_tags=num_classes, batch_first=True)
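                # NB: as in Example #10, this CRF layer is freshly constructed, so its transition weights are untrained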
                list_of_pred_ids = crf.decode(emission)

                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids,
                    unkTokenList=False)
                unkTokenList = makeUNKTokenList(input_text, input_token)
                input_token, list_of_ner_word, decoding_ner_sentence = decoder_from_res(
                    list_of_input_ids=list_of_input_ids,
                    list_of_pred_ids=list_of_pred_ids,
                    unkTokenList=unkTokenList)

                w.write('## ' + str(index) + '\n')
                w.write(addInfo)
                w.write(str(list_of_ner_word) + '\n')
                w.write(str(decoding_ner_sentence[6:-5]) + '\n')
            '''RETURN a file: NER_OY_result.txt'''
        return flask.Response(response=open("NER_OY_result.txt", 'r', encoding='utf-8-sig'),
                              status=200,
                              mimetype='text/plain')
    else:
        return flask.Response(
            response='This predictor only supports TEXT data',
            status=415,
            mimetype='text/plain')
Example #14
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    with open(data_config.token2idx_vocab, mode='rb') as io:
        token2idx_vocab = json.load(io)
        print("token2idx_vocab: ", token2idx_vocab)
    vocab = Vocabulary(token2idx=token2idx_vocab)
    tokenizer = Tokenizer(vocab=vocab,
                          split_fn=mecab_token_pos_flat_fn,
                          pad_fn=keras_pad_fn,
                          maxlen=model_config.maxlen)
    model_config.vocab_size = len(vocab.token2idx)

    # Model & Model Params
    model = Transformer(config=model_config, vocab=vocab)

    # Train & Val Datasets
    tr_ds = ChatbotDataset(data_config.train,
                           tokenizer.list_of_string_to_arr_of_pad_token_ids)
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
                       num_workers=4,
                       drop_last=False)

    val_ds = ChatbotDataset(data_config.validation,
                            tokenizer.list_of_string_to_arr_of_pad_token_ids)
    val_dl = DataLoader(val_ds,
                        batch_size=model_config.batch_size,
                        shuffle=True,
                        num_workers=4,
                        drop_last=False)

    # loss
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab.PAD_ID)  # nn.NLLLoss()

    # optim
    opt = optim.Adam(
        params=model.parameters(), lr=model_config.learning_rate
    )  # torch.optim.SGD(params=model.parameters(), lr=model_config.learning_rate)
    # scheduler = ReduceLROnPlateau(opt, patience=5)  # Check
    scheduler = GradualWarmupScheduler(opt,
                                       multiplier=8,
                                       total_epoch=model_config.epochs)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # save
    # writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # load
    if (model_dir / 'best.tar').exists():
        print("pretrained model exists")
        checkpoint = checkpoint_manager.load_checkpoint('best.tar')
        model.load_state_dict(checkpoint['model_state_dict'])

    # Train
    for epoch in tqdm(range(model_config.epochs),
                      desc='epoch',
                      total=model_config.epochs):
        scheduler.step(epoch)
        print("epoch : {}, lr: {}".format(epoch, opt.param_groups[0]['lr']))
        tr_loss = 0
        tr_acc = 0
        model.train()

        for step, mb in tqdm(enumerate(tr_dl), desc='steps', total=len(tr_dl)):
            opt.zero_grad()

            enc_input, dec_input, dec_output = map(lambda elm: elm.to(device),
                                                   mb)
            y_pred = model(enc_input, dec_input)
            y_pred_copy = y_pred.detach()
            dec_output_copy = dec_output.detach()

            # reshape for the loss computation
            y_pred = y_pred.reshape(-1, y_pred.size(-1))
            dec_output = dec_output.view(-1).long()

            # boolean mask of the non-padding target positions
            real_value_index = dec_output != 0

            # exclude padding positions from the loss
            mb_loss = loss_fn(
                y_pred[real_value_index],
                dec_output[real_value_index])  # Input: (N, C) Target: (N)
            mb_loss.backward()
            opt.step()

            with torch.no_grad():
                mb_acc = acc(y_pred, dec_output)

            tr_loss += mb_loss.item()
            tr_acc = mb_acc.item()
            tr_loss_avg = tr_loss / (step + 1)
            tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}
            total_step = epoch * len(tr_dl) + step

            # Eval
            if total_step % model_config.summary_step == 0 and total_step != 0:
                print("train: ")
                decoding_from_result(enc_input, y_pred_copy, dec_output_copy,
                                     tokenizer)

                model.eval()
                print("eval: ")
                val_summary = evaluate(model, val_dl, {
                    'loss': loss_fn,
                    'acc': acc
                }, device, tokenizer)
                val_loss = val_summary['loss']

                # writer.add_scalars('loss', {'train': tr_loss_avg,
                #                             'val': val_loss}, epoch * len(tr_dl) + step)

                tqdm.write(
                    'epoch : {}, step : {}, '
                    'tr_loss: {:.3f}, val_loss: {:.3f}, tr_acc: {:.2%}, val_acc: {:.2%}'
                    .format(epoch + 1, total_step, tr_summary['loss'],
                            val_summary['loss'], tr_summary['acc'],
                            val_summary['acc']))

                val_loss = val_summary['loss']
                # is_best = val_loss < best_val_loss  # selection by loss
                is_best = tr_acc > best_train_acc  # selection by train acc (ideally this should use val_acc, not train_acc)

                # Save
                if is_best:
                    print(
                        "[Best model Save] train_acc: {}, train_loss: {}, val_loss: {}"
                        .format(tr_summary['acc'], tr_summary['loss'],
                                val_loss))
                    # cast to CPU tensors before saving so the checkpoint also loads on CPU-only machines
                    state = {
                        'epoch': epoch + 1,
                        'model_state_dict': model.to(torch.device('cpu')).state_dict(),
                        'opt_state_dict': opt.state_dict()
                    }
                    summary = {'train': tr_summary, 'validation': val_summary}

                    summary_manager.update(summary)
                    summary_manager.save('summary.json')
                    checkpoint_manager.save_checkpoint(state, 'best.tar')

                    best_val_loss = val_loss

                model.to(device)
                model.train()
            else:
                if step % 50 == 0:
                    print(
                        'epoch : {}, step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'
                        .format(epoch + 1, total_step, tr_summary['loss'],
                                tr_summary['acc']))
Example #15
ABS_PATH = os.environ.get('BASEDIR')

model_dir = Path(f'{ABS_PATH}/experiments/base_model_with_crf_val')
model_config = Config(json_path=model_dir / 'config.json')

# Vocab & Tokenizer
tok_path = f"{ABS_PATH}/tokenizer_78b3253a26.model"
ptr_tokenizer = SentencepieceTokenizer(tok_path)

# load vocab & tokenizer
with open(model_dir / "vocab.pkl", 'rb') as f:
    vocab = pickle.load(f)

tokenizer = Tokenizer(vocab=vocab,
                      split_fn=ptr_tokenizer,
                      pad_fn=keras_pad_fn,
                      maxlen=None)

# load ner_to_index.json
with open(model_dir / "ner_to_index.json", 'rb') as f:
    ner_to_index = json.load(f)
    index_to_ner = {v: k for k, v in ner_to_index.items()}

# Model
model = KobertCRF(config=model_config,
                  num_classes=len(ner_to_index),
                  vocab=vocab)
# model = KobertBiGRUCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

# load
model_dict = model.state_dict()
Example #16
class NamedEntityRecognitionFormatter():
    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir

    def transform_source_fn(self, text):
        tokens = self.tokenizer.split(text)
        token_ids_with_cls_sep = self.tokenizer.list_of_string_to_arr_of_cls_sep_pad_token_ids([text])

        prefix_sum_of_token_start_index = []
        offset = 0  # cumulative character offset (avoids shadowing the builtin sum)
        for i, token in enumerate(tokens):
            if i == 0:
                prefix_sum_of_token_start_index.append(0)
                offset += len(token) - 1
            else:
                prefix_sum_of_token_start_index.append(offset)
                offset += len(token)
        return token_ids_with_cls_sep, tokens, prefix_sum_of_token_start_index

    def transform_target_fn(self, label_text, tokens, prefix_sum_of_token_start_index):
        regex_ner = re.compile('<(.+?):[A-Z]{3}>')  # if NER tags were 2 characters, change {3} to {2} (e.g. LOC -> LC)
        regex_filter_res = regex_ner.finditer(label_text)

        list_of_ner_tag = []
        list_of_ner_text = []
        list_of_tuple_ner_start_end = []

        count_of_match = 0
        for match_item in regex_filter_res:
            ner_tag = match_item[0][-4:-1]
            ner_text = match_item[1]
            start_index = match_item.start() - 6 * count_of_match  # same markup-offset correction as in Example #9
            end_index = match_item.end() - 6 - 6 * count_of_match

            list_of_ner_tag.append(ner_tag)
            list_of_ner_text.append(ner_text)
            list_of_tuple_ner_start_end.append((start_index, end_index))
            count_of_match += 1

        list_of_ner_label = []
        entity_index = 0
        is_entity_still_B = True
        for tup in zip(tokens, prefix_sum_of_token_start_index):
            token, index = tup

            if '▁' in token:
                index += 1

            if entity_index < len(list_of_tuple_ner_start_end):
                start, end = list_of_tuple_ner_start_end[entity_index]

                if end < index:
                    is_entity_still_B = True
                    entity_index = entity_index + 1 if entity_index + 1 < len(list_of_tuple_ner_start_end) else entity_index
                    start, end = list_of_tuple_ner_start_end[entity_index]

                if start <= index < end:
                    entity_tag = list_of_ner_tag[entity_index]
                    if is_entity_still_B is True:
                        entity_tag = 'B-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                        is_entity_still_B = False
                    else:
                        entity_tag = 'I-' + entity_tag
                        list_of_ner_label.append(entity_tag)
                else:
                    is_entity_still_B = True
                    entity_tag = 'O'
                    list_of_ner_label.append(entity_tag)

            else:
                entity_tag = 'O'
                list_of_ner_label.append(entity_tag)

        with open(self.model_dir / "ner_to_index,json", 'rb') as f:
            self.ner_to_index = json.load(f)

        list_of_ner_ids = [self.ner_to_index['[CLS]']] + [self.ner_to_index[ner_tag] for ner_tag in list_of_ner_label] + [self.ner_to_index['[SEP]']]
        list_of_ner_ids = self.tokenizer._pad([list_of_ner_ids], pad_id=self.vocab.PAD_ID, maxlen=self.maxlen)[0]

        return list_of_ner_ids, list_of_ner_label