Code Example #1
def generate_subword_script(dataset_path,
                            new_path,
                            script_prefix,
                            use_pretrain_kobert_tokenizer=False):
    print('create_subword_script...')

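    # Pick a tokenizer: either the pretrained KoBERT SentencePiece model or a
    # separately trained SentencePiece model loaded from disk.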
    if use_pretrain_kobert_tokenizer:
        tok_path = get_tokenizer()
        sp = SentencepieceTokenizer(tok_path)

    else:
        sp = spm.SentencePieceProcessor()
        vocab_file = "aihub_sentencepiece.model"
        sp.load(vocab_file)

    for folder in os.listdir(dataset_path):
        # folder : {KsponSpeech_01, ..., KsponSpeech_05}
        path = os.path.join(dataset_path, folder)
        for subfolder in os.listdir(path):
            path = os.path.join(dataset_path, folder, subfolder)
            for file in os.listdir(path):
                with open(os.path.join(path, file), "r",
                          encoding='cp949') as f:
                    sentence = f.read()

                if use_pretrain_kobert_tokenizer:
                    encode = sp(sentence)
                else:
                    encode = sp.encode_as_ids(sentence)

                with open(os.path.join(new_path, script_prefix + file[12:]),
                          "w",
                          encoding='cp949') as f:
                    f.write(" ".join(map(str, encode)))
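Example #1 above assumes imports roughly like the following. This is a sketch, not part of the original project; the module paths follow the public kobert, gluonnlp and sentencepiece packages and may differ between versions:

import os
import sentencepiece as spm
from gluonnlp.data import SentencepieceTokenizer
from kobert.utils import get_tokenizer  # fetches the pretrained KoBERT SentencePiece model

# quick sanity check of the pretrained-tokenizer branch
sp = SentencepieceTokenizer(get_tokenizer())
print(sp("토크나이저 동작 확인"))  # prints '▁'-prefixed subword pieces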
Code Example #2
File: model.py  Project: L0Z1K/KoGPT2-chatbot
    def chat(self):
        tok = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
        with torch.no_grad():
            while 1:
                q = input('Q: ').strip()
                if q == 'quit':
                    break
                q_tok = tok(q)

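                # Prompt layout: <usr> + question tokens + </s> + <sys>; the text
                # generated after <sys> (up to the next </s>) is taken as the answer.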
                input_ids = torch.LongTensor([self.vocab['<usr>']] +
                                             self.vocab[q_tok] +
                                             self.vocab['</s>',
                                                        '<sys>']).unsqueeze(
                                                            dim=0)

                gen = self.kogpt2.generate(input_ids,
                                           num_beams=5,
                                           max_length=self.hparams.max_len,
                                           no_repeat_ngram_size=2,
                                           bad_words_ids=[[47437]])
                gen = self.vocab.to_tokens(gen.squeeze().tolist())

                answer = ''.join(g for g in gen)
                answer = answer[answer.find('<sys>') + 5:]
                answer = answer[:answer.find('</s>')]
                answer = answer.replace('▁', ' ')

                print("A: {}".format(answer.strip()))
Code Example #3
	def __init__(self, input_path_or_input_list, output_path):
		# load file to process
		if isinstance(input_path_or_input_list, str): # if a path is given as path string
			self.file = open(input_path_or_input_list,'rt',encoding='utf8')
		else: # if the input is given as a list
			self.file = input_path_or_input_list
		self.output_path = output_path
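		# predicate: is the input an open file object (any io base class) rather than a list?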
		self.is_filetype = lambda x: any([isinstance(x, io.TextIOBase),
											isinstance(x, io.BufferedIOBase),
											isinstance(x, io.RawIOBase),
											isinstance(x, io.IOBase)])

		# tokenizer
		tok_path = get_tokenizer()
		self.tokenizer = SentencepieceTokenizer(tok_path)

		# rule set
		with open(config.post_process_rule_path,'rt',encoding='utf8') as f:
			self.rules = dict(map(lambda x:tuple(x.strip('\n').split('\t')),f))


		#dict to store (x,y,y_pred) triplet
		self.idx_map = ['x','y','y_pred']
		self.inst_dict = {}

		# numbers / hyphen
		self.num_2_txt = {'^\(1\)':['우선,','먼저,','처음으로,'],
						  '^\(2\)':['두 번째로,', '이어서,','다음으로,'],
						  '^\(3\)':['세 번째로,','이어서,','다음으로,'],
						  '^\(4\)':['네 번째로,','이어서,','다음으로,'],
						  '^\(5\)':['다섯 번째로,','이어서,','다음으로,'],
						  '^\(6\)':['여섯 번째로,','이어서,','다음으로,']
                          }
Code Example #4
def main(parser):
    
    user_dic_path = 'userdic.txt'
    user_dic = _load_users_dict(user_dic_path)
    #print(users_dic)

    args = parser.parse_args()
    model_dir = Path(args.model_dir)
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = "./tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    vocab_file = './kobert_model/kobertvocab_f38b8a4d6d.json'
    vocab_of_gluonnlp = nlp.vocab.BERTVocab.from_json(open(vocab_file, 'rt').read())
    token_to_idx = vocab_of_gluonnlp.token_to_idx
    vocab = Vocabulary(token_to_idx=token_to_idx)  
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    
    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # Model
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    model_state_dict = torch.load('{}/KobertCRF-lr5e-05-bs200/model.state'.format(model_dir))
    model.load_state_dict(model_state_dict)
    
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model.to(device)

    decoder_from_res = DecoderFromNamedEntitySequence(user_dic=user_dic, tokenizer=tokenizer, index_to_ner=index_to_ner)

    while(True):
        input_text = input("입력하세요: ")
        list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
        #print(list_of_input_ids)
        x_input = torch.tensor(list_of_input_ids).long()
        
        if torch.cuda.is_available():
            x_input = x_input.cuda() 
            
        ## for bert crf
        list_of_pred_ids = model(x_input)
        #print(list_of_pred_ids)
        list_of_ner_word, decoding_ner_sentence = decoder_from_res(input_text, list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
        
        '''
        for item in list_of_ner_word:
            if item['word'] in user_dic:
                print('f**k')
                item['tag'] = user_dic[item['word']]
        '''        
        print("list_of_ner_word:", list_of_ner_word)
        print("decoding_ner_sentence:", decoding_ner_sentence[6:-5])
Code Example #5
def get_kobert_model_and_tokenizer():
    tok_path = get_tokenizer()
    basic_tokenizer = SentencepieceTokenizer(tok_path)
    bert_base, vocab = get_pytorch_kobert_model()
    kobert_tokenizer = KoBertTokenizer(basic_tokenizer, vocab)

    return bert_base, kobert_tokenizer
Code Example #6
 def chat(self, sent='0'):
     self.tok_path
     tok = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
     sent_tokens = tok(sent)
     with torch.no_grad():
         while 1:
             q = input('user > ').strip()
             if q == 'quit':
                 break
             q_tok = tok(q)
             a = ''
             a_tok = []
             while 1:
                 input_ids = torch.LongTensor([self.vocab[U_TKN]] +
                                              self.vocab[q_tok] +
                                              self.vocab[EOS, SENT] +
                                              self.vocab[sent_tokens] +
                                              self.vocab[EOS, S_TKN] +
                                              self.vocab[a_tok]).unsqueeze(
                                                  dim=0)
                 pred = self(input_ids)
                 gen = self.vocab.to_tokens(
                     torch.argmax(pred,
                                  dim=-1).squeeze().numpy().tolist())[-1]
                 if gen == EOS or len(a) >= 200:  # stop at EOS or when the answer gets too long
                     break
                 a += gen.replace('▁', ' ')
                 a_tok = tok(a)
             print("Simsimi > {}".format(a.strip()))
Code Example #7
File: data.py  Project: tyhtm3/Photory-AI
def sentencePieceTokenizer():
    tok_path = get_tokenizer()
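    # num_best=0 and alpha=0 disable subword-regularization sampling,
    # so the segmentation is deterministic.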
    sentencepieceTokenizer = SentencepieceTokenizer(tok_path,
                                                    num_best=0,
                                                    alpha=0)

    return sentencepieceTokenizer
Code Example #8
def chat(kogptqa, sent='0'):
    tok_path = get_tokenizer()
    _, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)
    with torch.no_grad():
        while 1:
            q = input('user > ').strip()
            if q == 'quit':
                break
            q_tok = tok(q)
            a = ''
            a_tok = []
            while 1:
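                # Greedy decoding: append the most likely next token until EOS.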
                input_ids = torch.LongTensor([
                    vocab[U_TKN]] + vocab[q_tok] +
                    vocab[EOS, SENT] + vocab[sent_tokens] +
                    vocab[EOS, S_TKN] +
                    vocab[a_tok]).unsqueeze(dim=0)
                pred = kogptqa(input_ids)
                gen = vocab.to_tokens(
                    torch.argmax(
                        pred,
                        dim=-1).squeeze().numpy().tolist())[-1]
                if gen == EOS:
                    break
                a += gen.replace('▁', ' ')
                a_tok = tok(a)
            print("Simsimi > {}".format(a.strip()))
Code Example #9
    def __init__(self, test_file, vocab, max_token_cnt=300):
        self.tokenizer = SentencepieceTokenizer(get_tokenizer())
        self.vocab = vocab

        self.max_token_cnt = max_token_cnt

        self.media_map = {
            '경기일보': 0,
            '광양신문': 1,
            '광주매일신문': 2,
            '광주일보': 3,
            '국제신문': 4,
            '기호일보': 5,
            '남도일보': 6,
            '당진시대': 7,
            '대구신문': 8,
            '대구일보': 9,
            '대전일보': 10
        }
        print("medias", self.media_map)

        samples = []
        with jsonlines.open(test_file) as f:
            for line in f.iter():
                media = line['media']
                id = line['id']
                sentences = []
                for i, sentence in enumerate(line['article_original']):
                    sentences.append(sentence.replace('\n', '').strip())
                samples.append([sentences, media, id])
        self.samples = samples
Code Example #10
    def get_tokenizer(cls):
        if cls.tokenizer is None:
            tok_path = "./tokenizer_78b3253a26.model"
            ptr_tokenizer = SentencepieceTokenizer(tok_path)

            cls.tokenizer = Tokenizer(vocab=cls.vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
        return cls.tokenizer
Code Example #11
def chat(model_params, sent='0'):
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    kogptqa.load_parameters(model_params, ctx=ctx)
    sent_tokens = tok(sent)
    while 1:
        q = input('user > ').strip()
        if q == 'quit':
            break
        q_tok = tok(q)
        a = ''
        a_tok = []
        while 1:
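            # Greedy decoding with MXNet NDArray ops: append the argmax token until EOS.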
            input_ids = mx.nd.array([vocab[U_TKN]] + vocab[q_tok] +
                                    vocab[EOS, SENT] + vocab[sent_tokens] +
                                    vocab[EOS, S_TKN] +
                                    vocab[a_tok]).expand_dims(axis=0)
            pred = kogptqa(input_ids.as_in_context(ctx))
            gen = vocab.to_tokens(
                mx.nd.argmax(
                    pred,
                    axis=-1).squeeze().astype('int').asnumpy().tolist())[-1]
            if gen == EOS:
                break
            a += gen.replace('▁', ' ')
            a_tok = tok(a)
        print("Simsimi > {}".format(a.strip()))
Code Example #12
 def __init__(self):
     self.tok_path = get_tokenizer()
     self.sp = SentencepieceTokenizer(self.tok_path)
     self.v_dimension = 300
     self.v_window = 8
     self.hangul = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣]+")
     self.mecab = Mecab()
Code Example #13
def Tokenizer(item):
    item = list(np.array(item.tolist()))
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)

    # max_seqlen is assumed to be defined at module level
    out = torch.empty(0, max_seqlen, dtype=torch.long)

    for i in item:

        toked = tok(i)
        input_ids = torch.tensor([
            vocab[vocab.bos_token],
        ] + vocab[toked]).unsqueeze(0)
        size = input_ids.shape
        # print(input_ids)
        # print(input_ids.shape)
        # right-pad each row with zeros so every row has length max_seqlen
        y = torch.cat(
            [input_ids,
             torch.zeros(1, max_seqlen - size[1], dtype=input_ids.dtype)],
            axis=1)
        out = torch.cat([out, y], axis=0)

        print(out.shape)

    x_np = out.numpy()
    x_df = pd.DataFrame(x_np)
    x_df.to_csv('./data/encoded.csv', mode='w')
Code Example #14
 def __init__(self, name):
     self.name = name
     self.token2index = {}
     self.index2token = {}
     self.n_tokens = 0
     tok_path = get_tokenizer()
     self.sp = SentencepieceTokenizer(tok_path)
Code Example #15
def main():
    nsmc_home_dir = 'NSMC_DIR'
    train_file = nsmc_home_dir + '/ratings_train.txt'  # 150K
    test_file = nsmc_home_dir + '/ratings_test.txt'  # 50K

    model, vocab = get_pytorch_kobert_model(
        ctx='cuda' if torch.cuda.is_available() else 'cpu')

    lr = 5e-5
    batch_size = 16
    epochs = 5
    dropout_rate = 0.1
    max_grad_norm = 1.0
    num_total_steps = math.ceil(150000 / batch_size) * epochs
    num_warmup_steps = num_total_steps // 10
    log_interval = 100
    seed = 2019
    num_workers = 2
    num_classes = 2
    pooler_out_dim = model.pooler.dense.out_features

    torch.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print('device', device)

    tok_path = get_tokenizer()
    sp = SentencepieceTokenizer(tok_path)

    train_loader = torch.utils.data.DataLoader(MovieDataset(
        get_data(train_file, vocab, sp)),
                                               shuffle=True,
                                               batch_size=batch_size,
                                               num_workers=num_workers,
                                               collate_fn=batchify,
                                               pin_memory=True)

    test_loader = torch.utils.data.DataLoader(MovieDataset(
        get_data(test_file, vocab, sp)),
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              collate_fn=batchify,
                                              pin_memory=True)

    linear = torch.nn.Linear(pooler_out_dim, num_classes).to(device)

    all_params = list(model.parameters()) + list(linear.parameters())
    optimizer = AdamW(all_params, lr=lr, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_total_steps)
    for epoch in range(epochs):
        train(train_loader, device, model, linear, all_params, optimizer,
              scheduler, dropout_rate, max_grad_norm, log_interval, epoch)
        print(datetime.now(), 'Testing...')
        test(test_loader, device, model, linear)
Code Example #16
def predict(sentence1, sentence2):
    ptr_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/pretrained"
    data_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/data"
    caseType = "skt"
    model_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/experiments/base_model"
    checkpoint_model_file = "best_skt.tar"
    
    # ptr_dir = "BERT_pairwise_text_classification/pretrained"
    # data_dir = "BERT_pairwise_text_classification/data"
    # caseType = "skt"
    # model_dir = "BERT_pairwise_text_classification/experiments/base_model"
    # checkpoint_model_file = "best_skt.tar"
    
    # ptr_dir = "pretrained"
    # data_dir = "data"
    # caseType = "skt"
    # model_dir = "experiments/base_model"
    # checkpoint_model_file = "best_skt.tar"
    
    ptr_dir = Path(ptr_dir)
    data_dir = Path(data_dir)
    model_dir = Path(model_dir)
    checkpoint_model_file = Path(checkpoint_model_file)
    
    ptr_config = Config(ptr_dir / 'config_skt.json')
    data_config = Config(data_dir / 'config.json')
    model_config = Config(model_dir / 'config.json')
    
    # vocab
    with open(os.path.join(ptr_dir, ptr_config.vocab), mode='rb') as io:
        vocab = pickle.load(io)
    
    
    ptr_tokenizer = SentencepieceTokenizer(os.path.join(ptr_dir, ptr_config.tokenizer))
    pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)
    
    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(checkpoint_model_file)
    config = BertConfig(os.path.join(ptr_dir, ptr_config.config))
    model = PairwiseClassifier(config, num_classes=model_config.num_classes, vocab=preprocessor.vocab)
    model.load_state_dict(checkpoint['model_state_dict'])
    
    device = torch.device('cpu')
    model.to(device)
    
    transform = preprocessor.preprocess
    if model.training:
        model.eval()
        
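    # transform() returns token indices and token-type ids for the sentence pair.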
    indices, token_types = [torch.tensor([elm]) for elm in transform(sentence1, sentence2)]

    with torch.no_grad():
        label = model(indices, token_types)
    label = label.max(dim=1)[1]
    label = label.numpy()[0]

    return label
Code Example #17
def load_module():
    global model, vocab, tok

    if not model or not vocab:
        model, vocab = get_pytorch_kogpt2_model()

    if not tok:
        tok_path = get_tokenizer()
        tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
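A rough usage sketch for the objects that load_module() initialises (it assumes the module sets model, vocab and tok to None beforehand; the sample sentence is ours, and the tuple-style model output follows the older transformers API that the original KoGPT2 package targeted):

import torch

load_module()
toked = tok("오늘 날씨가")  # list of subword pieces
input_ids = torch.tensor([vocab[vocab.bos_token]] + vocab[toked]).unsqueeze(0)
with torch.no_grad():
    logits = model(input_ids)[0]  # shape (1, seq_len, vocab_size)
print(vocab.to_tokens(int(torch.argmax(logits[0, -1]))))  # greedy next subword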
Code Example #18
File: ner.py  Project: connexioh-team/ner
def main():
    cur_path = os.path.dirname(sys.argv[0])
    if cur_path:
        os.chdir(cur_path)

    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')

    # load vocab & tokenizer
    tok_path = "./ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load
    model_dict = model.state_dict()
    checkpoint = torch.load("./model.bin", map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name), file=sys.stderr)
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys, strict=False)
    model.eval()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    #model.to(device)
    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)
    
    try:
        while(True):
            input_text = input()
        
            list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
            x_input = torch.tensor(list_of_input_ids).long()
            list_of_pred_ids = model(x_input)

            list_of_ner_word = decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
            if list_of_ner_word:
                print(",".join(list_of_ner_word))
            else:
                print("/")
    except:
        print("EOF", file=sys.stderr)
Code Example #19
def main(args):
    for arg in vars(args):
        print(arg, getattr(args, arg))
    tokenizer = SentencepieceTokenizer(get_tokenizer())

    lines_len = 0
    src_docs = []
    with jsonlines.open(args.train_file) as f:

        for line in f.iter():
            lines_len += 1
            sentences = []
            for sentence in line['article_original']:
                sentences.append(sentence)
            src_docs.append(" ".join(sentences).replace('\n', '') + "\n")

    lens = []
    tr_max_src = 0
    for i, src_doc in enumerate(src_docs):
        if i % 100 == 0:
            print(i, len(src_docs))
        tokens = tokenizer(src_doc)
        cur_len = len(tokens)
        lens.append(cur_len)
        if tr_max_src < cur_len:
            tr_max_src = cur_len

    src_docs = []
    with jsonlines.open(args.test_file) as f:

        for line in f.iter():
            lines_len += 1
            sentences = []
            for sentence in line['article_original']:
                sentences.append(sentence)
            src_docs.append(" ".join(sentences).replace('\n', '') + "\n")

    max_src = 0
    test_lens = []
    for i, src_doc in enumerate(src_docs):
        if i % 100 == 0:
            print(i, len(src_docs))
        tokens = tokenizer(src_doc)
        cur_len = len(tokens)
        test_lens.append(cur_len)
        if max_src < cur_len:
            max_src = cur_len
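    # Report max / mean / median tokenized source lengths for the train and test sets.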
    print("max source length train", tr_max_src)
    print("max source length test", max_src)
    print(sum(lens) / len(lens))
    print(sum(test_lens) / len(test_lens))
    import numpy as np
    print(np.median(np.array(lens)))
    print(np.median(np.array(test_lens)))
    print("done")
Code Example #20
File: dataloader.py  Project: zzingae/KoBERT
    def __init__(self, filename, vocab, maxlen, use_emotion):

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, header=0, encoding='utf-8')
        # self.df = pd.read_csv(filename, delimiter = '\t')
        self.sp = SentencepieceTokenizer(get_tokenizer())
        self.vocab = vocab
        self.maxlen = maxlen
        self.use_emotion = use_emotion

        self.sp.tokens.index('!')
Code Example #21
    def __init__(self):

        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = 'PAD_TOKEN'
        self.UNK_TOKEN = 'UNK_TOKEN'
        self.tok=Mecab()
        _, self.vocab = get_pytorch_kogpt2_model()
        
        self.tok_path = get_tokenizer()
        self.tok2 = SentencepieceTokenizer(self.tok_path,  num_best=0, alpha=0)
Code Example #22
File: app.py  Project: czangyeob/pytorch-bert-crf-ner
def post():
    value = request.form['input']
    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')
    # load vocab & tokenizer
    tok_path = "ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab,
                          split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn,
                          maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRFViz(config=model_config,
                         num_classes=len(ner_to_index),
                         vocab=vocab)

    # load
    model_dict = model.state_dict()
    checkpoint = torch.load(
        "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
        map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not in model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    decoder_from_res = DecoderFromNamedEntitySequence(
        tokenizer=tokenizer, index_to_ner=index_to_ner)
    input_text = value
    list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids(
        [input_text])
    x_input = torch.tensor(list_of_input_ids).long()
    list_of_pred_ids, _ = model(x_input)
    list_of_ner_word, decoding_ner_sentence = decoder_from_res(
        list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
    return {'word': list_of_ner_word, 'decoding': decoding_ner_sentence}
Code Example #23
File: evaluate.py  Project: shimdx/nlp_classification
def get_preprocessor(ptr_config_info, model_config):
    with open(ptr_config_info.vocab, mode='rb') as io:
        vocab = pickle.load(io)

    if model_config.type == 'etri':
        ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config_info.tokenizer, do_lower_case=False)
        pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer.tokenize, pad_fn=pad_sequence)
    elif model_config.type == 'skt':
        ptr_tokenizer = SentencepieceTokenizer(ptr_config_info.tokenizer)
        pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)
    return preprocessor
Code Example #24
def korean_gpt_long_setence_life_test():
    config = get_config()
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"

    kogpt2_vocab_path = config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    sent = '나는 밥을 먹었'
    toked = tok(sent)
    print(toked)
    sent_cnt = 0

    input_ids = torch.tensor([
        vocab[vocab.bos_token],
    ] + vocab[toked]).unsqueeze(0)
    input_ids = input_ids.to(device)

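    # Sampled beam search; num_return_sequences=3 yields three candidate continuations.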
    outputs = kogpt2model.generate(input_ids=input_ids,
                                   max_length=100,
                                   min_length=50,
                                   repetition_penalty=1.2,
                                   do_sample=True,
                                   num_beams=3,
                                   bos_token_id=0,
                                   pad_token_id=3,
                                   eos_token_id=1,
                                   num_return_sequences=3)

    target = outputs[0]
    print("========수필===========")
    for i in range(3):  # 3 output sequences were generated
        toked = vocab.to_tokens(outputs[i].squeeze().tolist())
        ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                     ''.join(toked).replace('▁', ' ').strip())
        print('Generated {}: {}'.format(i, ret))
Code Example #25
def model_fn(model_dir):
    voc_file_name = glob.glob('{}/*.spiece'.format(model_dir))[0]
    model_param_file_name = glob.glob('{}/*.params'.format(model_dir))[0]

    # check if GPU is available
    if mx.context.num_gpus() > 0:
        ctx = mx.gpu()
    else:
        ctx = mx.cpu()

    model, vocab = get_kogpt2_model(model_param_file_name, voc_file_name, ctx)
    tok = SentencepieceTokenizer(voc_file_name)

    return model, vocab, tok, ctx
Code Example #26
 def __init__(self, model_path=con.NER_MODEL_PATH):
     with open(con.NER_UTIL_PATH["token_to_index"], 'rb') as f:
         self.token_to_index = pickle.load(f)
     with open(con.NER_UTIL_PATH["index_to_token"], 'rb') as f:
         self.index_to_token = pickle.load(f)
     with open(con.NER_UTIL_PATH["entity_to_index"], 'rb') as f:
         self.entity_to_index = pickle.load(f)
     with open(con.NER_UTIL_PATH["index_to_entity"], 'rb') as f:
         self.index_to_entity = pickle.load(f)
     self.tokenizer = SentencepieceTokenizer(con.NER_UTIL_PATH["tokenizer"])
     self.model_config = con.MODEL_CONFIG
     self.model_path = model_path
     self.vocab = con.VOCAB
     self.__load_ner_model()
Code Example #27
 def chat(self, sent='0'):
     self.tok_path
     tok = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
     sent_tokens = tok(sent)
     with torch.no_grad():
         while 1:
             q = input('user > ').strip()
             if q == 'quit':
                 break
             q_tok = tok(q)
             a = ''
             a_tok = []
             timeout = time.time() + 60
             while 1:
                 input_ids = torch.LongTensor([self.vocab[U_TKN]] +
                                              self.vocab[q_tok] +
                                              self.vocab[EOS, SENT] +
                                              self.vocab[sent_tokens] +
                                              self.vocab[EOS, S_TKN] +
                                              self.vocab[a_tok]).unsqueeze(
                                                  dim=0)
                 pred = self(input_ids)
                 gen = self.vocab.to_tokens(
                     torch.argmax(pred,
                                  dim=-1).squeeze().numpy().tolist())[-1]
                 if gen == EOS:
                     break
                 a += gen.replace('▁', ' ')
                 a_tok = tok(a)
                 if time.time() > timeout:
                     break
             answer_list = kss.split_sentences(a)[1:-2]
             Simsimi_answer = "".join(answer_list)
             sentence_list = Simsimi_answer.split('.')
             sentences = []
             for s in sentence_list:
                 word_list = s.split()  # list
                 # sentences=[]
                 for word in word_list:
                     if word.endswith('*님이'):
                         word_list[word_list.index(word)] = word.replace(
                             word, "상담자님이")
                         # print(word)
                     else:
                         pass
                 sentence = " ".join(word_list)
                 sentences.append(sentence)
                 # print(sentence)
             print("Simsimi > ", ".".join(sentences))
Code Example #28
    def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_pytorch_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir
Code Example #29
def Load_Model():
    global vocab_global
    global sent_tokens_global
    global kogptqa_global
    global tok_global
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    kogptqa.load_parameters(r"KoGPT2-chatbot\kogpt2_chat.params", ctx=ctx)
    sent_tokens = tok("0")
    vocab_global = vocab
    sent_tokens_global = sent_tokens
    kogptqa_global = kogptqa
    tok_global = tok
Code Example #30
    def __init__(self,
                 samples,
                 vocab,
                 media_map,
                 word_dropout_prob=0.0,
                 max_word_dropout_ratio=0.0,
                 max_token_cnt=300):
        self.tokenizer = SentencepieceTokenizer(get_tokenizer())
        self.vocab = vocab

        self.samples = samples
        self.targets = [s[1] for s in samples]
        self.media_map = media_map
        self.word_dropout_prob = word_dropout_prob
        self.max_word_dropout_ratio = max_word_dropout_ratio
        self.max_token_cnt = max_token_cnt