def generate_subword_script(dataset_path, new_path, script_prefix, use_pretrain_kobert_tokenizer=False):
    """Encode every transcript below ``dataset_path`` and write it to ``new_path``.

    The dataset is walked two directory levels deep. Each file is read as
    cp949 text, encoded either with the pretrained KoBERT sentencepiece
    tokenizer or with the local ``aihub_sentencepiece.model``, and the
    resulting units are written space-separated to
    ``new_path/<script_prefix><file name without its first 12 characters>``.
    """
    print('create_subword_script...')

    if use_pretrain_kobert_tokenizer:
        sp = SentencepieceTokenizer(get_tokenizer())
        encode_sentence = sp
    else:
        sp = spm.SentencePieceProcessor()
        sp.load("aihub_sentencepiece.model")
        encode_sentence = sp.encode_as_ids

    # folder : {KsponSpeech_01, ..., KsponSpeech_05}
    for folder in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, folder)
        for subfolder in os.listdir(folder_path):
            leaf_dir = os.path.join(dataset_path, folder, subfolder)
            for file in os.listdir(leaf_dir):
                with open(os.path.join(leaf_dir, file), "r", encoding='cp949') as src:
                    sentence = src.read()

                pieces = encode_sentence(sentence)

                target = os.path.join(new_path, script_prefix + file[12:])
                with open(target, "w", encoding='cp949') as dst:
                    dst.write(" ".join(str(piece) for piece in pieces))
def chat(self):
    """Run an interactive question/answer loop on stdin/stdout.

    Each question is wrapped as ``<usr> question </s> <sys>``, passed to
    ``self.kogpt2.generate`` and the text between the first ``<sys>`` marker
    and the following ``</s>`` is printed as the answer. Typing ``quit``
    ends the loop.
    """
    tok = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
    sys_tag = '<sys>'
    eos_tag = '</s>'
    with torch.no_grad():
        while 1:
            q = input('Q: ').strip()
            if q == 'quit':
                break
            q_tok = tok(q)
            input_ids = torch.LongTensor(
                [self.vocab['<usr>']] + self.vocab[q_tok] + self.vocab['</s>', '<sys>']
            ).unsqueeze(dim=0)
            gen = self.kogpt2.generate(input_ids,
                                       num_beams=5,
                                       max_length=self.hparams.max_len,
                                       no_repeat_ngram_size=2,
                                       bad_words_ids=[[47437]])
            gen = self.vocab.to_tokens(gen.squeeze().tolist())
            answer = ''.join(gen)

            # str.find returns -1 when the marker is missing; using that value
            # as a slice bound would silently cut characters off the answer
            # (e.g. the last character when generation stops before '</s>').
            start = answer.find(sys_tag)
            if start != -1:
                answer = answer[start + len(sys_tag):]
            end = answer.find(eos_tag)
            if end != -1:
                answer = answer[:end]

            answer = answer.replace('▁', ' ')
            print("A: {}".format(answer.strip()))
def __init__(self, input_path_or_input_list, output_path):
    """Set up the input source, tokenizer, rewrite rules and numbering phrases.

    Args:
        input_path_or_input_list: a path string, which is opened as UTF-8
            text, or any other object, which is stored unchanged.
        output_path: stored on the instance as ``self.output_path``.
    """
    # load file to process
    if isinstance(input_path_or_input_list, str):
        # A path string: open it. The handle is kept on the instance and is
        # not closed here.
        self.file = open(input_path_or_input_list, 'rt', encoding='utf8')
    else:
        # Not a path string (e.g. a list): keep the object as given.
        self.file = input_path_or_input_list
    self.output_path = output_path
    # TextIOBase, BufferedIOBase and RawIOBase all derive from io.IOBase, so
    # one isinstance check is enough.
    self.is_filetype = lambda x: isinstance(x, io.IOBase)

    # tokenizer
    tok_path = get_tokenizer()
    self.tokenizer = SentencepieceTokenizer(tok_path)

    # rule set: one tab-separated pair per line
    with open(config.post_process_rule_path, 'rt', encoding='utf8') as f:
        self.rules = dict(tuple(line.strip('\n').split('\t')) for line in f)

    # dict to store (x, y, y_pred) triplet
    self.idx_map = ['x', 'y', 'y_pred']
    self.inst_dict = {}

    # numbers / hyphen
    # Raw strings keep the regex escapes ``\(`` and ``\)`` from being read as
    # (invalid) Python string escapes; the key values are unchanged.
    self.num_2_txt = {
        r'^\(1\)': ['우선,', '먼저,', '처음으로,'],
        r'^\(2\)': ['두 번째로,', '이어서,', '다음으로,'],
        r'^\(3\)': ['세 번째로,', '이어서,', '다음으로,'],
        r'^\(4\)': ['네 번째로,', '이어서,', '다음으로,'],
        r'^\(5\)': ['다섯 번째로,', '이어서,', '다음으로,'],
        r'^\(6\)': ['여섯 번째로,', '이어서,', '다음으로,'],
    }
def main(parser):
    """Interactive NER demo: read sentences from stdin and print the tagged result.

    Args:
        parser: an argument parser whose parsed result provides ``model_dir``.

    The loop has no exit condition; it ends when ``input`` raises.
    """
    user_dic_path = 'userdic.txt'
    user_dic = _load_users_dict(user_dic_path)

    args = parser.parse_args()
    model_dir = Path(args.model_dir)
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = "./tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    vocab_file = './kobert_model/kobertvocab_f38b8a4d6d.json'
    # Use a context manager so the vocabulary file handle is closed.
    with open(vocab_file, 'rt') as vocab_fp:
        vocab_of_gluonnlp = nlp.vocab.BERTVocab.from_json(vocab_fp.read())
    token_to_idx = vocab_of_gluonnlp.token_to_idx
    vocab = Vocabulary(token_to_idx=token_to_idx)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # Model
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)
    # map_location lets a checkpoint that was saved from a GPU be restored on
    # a CPU-only machine, matching the CPU fallback chosen above.
    model_state_dict = torch.load('{}/KobertCRF-lr5e-05-bs200/model.state'.format(model_dir),
                                  map_location=device)
    model.load_state_dict(model_state_dict)
    model.eval()
    model.to(device)

    decoder_from_res = DecoderFromNamedEntitySequence(user_dic=user_dic, tokenizer=tokenizer, index_to_ner=index_to_ner)

    while True:
        input_text = input("입력하세요: ")
        list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
        # Keep the input on the same device as the model.
        x_input = torch.tensor(list_of_input_ids).long().to(device)

        # for bert crf
        list_of_pred_ids = model(x_input)

        list_of_ner_word, decoding_ner_sentence = decoder_from_res(input_text,
                                                                   list_of_input_ids=list_of_input_ids,
                                                                   list_of_pred_ids=list_of_pred_ids)
        print("list_of_ner_word:", list_of_ner_word)
        # The slice drops the first 6 and last 5 characters of the decoded
        # sentence (presumably the [CLS]/[SEP] wrappers — confirm against the decoder).
        print("decoding_ner_sentence:", decoding_ner_sentence[6:-5])
def get_kobert_model_and_tokenizer():
    """Return ``(bert_base, kobert_tokenizer)`` built from the pretrained KoBERT assets."""
    sentencepiece_tok = SentencepieceTokenizer(get_tokenizer())
    bert_base, vocab = get_pytorch_kobert_model()
    return bert_base, KoBertTokenizer(sentencepiece_tok, vocab)
def chat(self, sent='0'):
    """Interactive chat loop that decodes the answer greedily, one token at a time.

    Args:
        sent: text tokenized once and inserted after the ``SENT`` marker of
            every prompt.

    For each question the prompt is rebuilt with the answer generated so far
    and the last predicted token is appended, until ``EOS`` is produced or
    the answer reaches 200 tokens. Typing ``quit`` ends the loop.
    """
    tok = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)
    max_answer_tokens = 200
    with torch.no_grad():
        while 1:
            q = input('user > ').strip()
            if q == 'quit':
                break
            q_tok = tok(q)
            a = ''
            a_tok = []
            while 1:
                input_ids = torch.LongTensor(
                    [self.vocab[U_TKN]] + self.vocab[q_tok] +
                    self.vocab[EOS, SENT] + self.vocab[sent_tokens] +
                    self.vocab[EOS, S_TKN] + self.vocab[a_tok]
                ).unsqueeze(dim=0)
                pred = self(input_ids)
                gen = self.vocab.to_tokens(
                    torch.argmax(pred, dim=-1).squeeze().numpy().tolist())[-1]
                # The cap applies to the accumulated answer. Measuring the
                # single token ``gen`` never reaches 200, which left the loop
                # unbounded whenever EOS was not produced.
                if gen == EOS or len(a_tok) >= max_answer_tokens:
                    break
                a += gen.replace('▁', ' ')
                a_tok = tok(a)
            print("Simsimi > {}".format(a.strip()))
def sentencePieceTokenizer():
    """Build a SentencepieceTokenizer from the pretrained model path with ``num_best=0, alpha=0``."""
    return SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)
def chat(kogptqa, sent='0'):
    """Chat with ``kogptqa`` on the console using greedy token-by-token decoding.

    Args:
        kogptqa: callable model that maps a batch of token ids to predictions.
        sent: text tokenized once and placed after the ``SENT`` marker.

    Typing ``quit`` ends the loop.
    """
    tok_path = get_tokenizer()
    vocab = get_pytorch_kogpt2_model()[1]
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)

    with torch.no_grad():
        while True:
            question = input('user > ').strip()
            if question == 'quit':
                break

            # Everything before the answer tokens is the same for every
            # decoding step of this question.
            prompt_ids = ([vocab[U_TKN]] + vocab[tok(question)] +
                          vocab[EOS, SENT] + vocab[sent_tokens] +
                          vocab[EOS, S_TKN])
            a = ''
            a_tok = []
            while True:
                input_ids = torch.LongTensor(prompt_ids + vocab[a_tok]).unsqueeze(dim=0)
                pred = kogptqa(input_ids)
                predicted_ids = torch.argmax(pred, dim=-1).squeeze().numpy().tolist()
                gen = vocab.to_tokens(predicted_ids)[-1]
                if gen == EOS:
                    break
                a += gen.replace('▁', ' ')
                a_tok = tok(a)
            print("Simsimi > {}".format(a.strip()))
def __init__(self, test_file, vocab, max_token_cnt=300):
    """Load test samples from a jsonlines file.

    Each sample is stored as ``[sentences, media, id]`` where ``sentences``
    are the ``article_original`` entries with newlines removed and
    surrounding whitespace stripped.
    """
    self.tokenizer = SentencepieceTokenizer(get_tokenizer())
    self.vocab = vocab
    self.max_token_cnt = max_token_cnt

    # Media names are numbered in this fixed order, starting at 0.
    media_names = [
        '경기일보', '광양신문', '광주매일신문', '광주일보', '국제신문', '기호일보',
        '남도일보', '당진시대', '대구신문', '대구일보', '대전일보',
    ]
    self.media_map = {media_name: idx for idx, media_name in enumerate(media_names)}
    print("medias", self.media_map)

    loaded = []
    with jsonlines.open(test_file) as reader:
        for record in reader.iter():
            media = record['media']
            doc_id = record['id']
            cleaned = [text.replace('\n', '').strip() for text in record['article_original']]
            loaded.append([cleaned, media, doc_id])
    self.samples = loaded
def get_tokenizer(cls):
    """Return the tokenizer cached on ``cls``, building it on first use.

    When ``cls.tokenizer`` is ``None`` a ``Tokenizer`` is created from the
    local sentencepiece model, ``cls.vocab`` and ``model_config.maxlen``
    (``model_config`` comes from the enclosing scope) and stored on ``cls``.
    """
    # Identity check: ``== None`` would defer to the cached object's __eq__.
    if cls.tokenizer is None:
        tok_path = "./tokenizer_78b3253a26.model"
        ptr_tokenizer = SentencepieceTokenizer(tok_path)
        cls.tokenizer = Tokenizer(vocab=cls.vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)
    return cls.tokenizer
def chat(model_params, sent='0'):
    """Console chat using the MXNet KoGPT2 model with greedy decoding.

    Args:
        model_params: parameter file passed to ``KoGPT2Chat.load_parameters``.
        sent: text tokenized once and placed after the ``SENT`` marker.

    Uses the ``ctx`` defined outside this function. Typing ``quit`` ends
    the loop.
    """
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    kogptqa.load_parameters(model_params, ctx=ctx)
    sent_tokens = tok(sent)

    while True:
        question = input('user > ').strip()
        if question == 'quit':
            break

        # Prompt part shared by every decoding step of this question.
        prompt_ids = ([vocab[U_TKN]] + vocab[tok(question)] +
                      vocab[EOS, SENT] + vocab[sent_tokens] +
                      vocab[EOS, S_TKN])
        a = ''
        a_tok = []
        while True:
            input_ids = mx.nd.array(prompt_ids + vocab[a_tok]).expand_dims(axis=0)
            pred = kogptqa(input_ids.as_in_context(ctx))
            predicted_ids = mx.nd.argmax(pred, axis=-1).squeeze().astype('int').asnumpy().tolist()
            gen = vocab.to_tokens(predicted_ids)[-1]
            if gen == EOS:
                break
            a += gen.replace('▁', ' ')
            a_tok = tok(a)
        print("Simsimi > {}".format(a.strip()))
def __init__(self):
    """Create the sentencepiece tokenizer, Mecab analyser and fixed settings."""
    # fixed settings
    self.v_dimension = 300
    self.v_window = 8

    # tokenizers
    self.tok_path = get_tokenizer()
    self.sp = SentencepieceTokenizer(self.tok_path)

    # matches every run of characters outside the listed Hangul ranges
    self.hangul = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣]+")
    self.mecab = Mecab()
def Tokenizer(item):
    """Encode every sentence in ``item`` to a fixed-length id row and save as CSV.

    Args:
        item: an object with ``tolist()`` that yields the sentences
            (presumably a pandas Series or numpy array — confirm with caller).

    Each sentence becomes ``[bos] + token ids``, truncated or padded to
    ``max_seqlen`` (defined outside this function). The matrix shape is
    printed and the matrix is written to ``./data/encoded.csv``.
    """
    sentences = list(np.array(item.tolist()))
    tok_path = get_tokenizer()
    _, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)

    # NOTE(review): assumes the vocab defines ``padding_token`` — confirm.
    # The earlier torch.empty padding left uninitialised memory in the output.
    pad_id = vocab[vocab.padding_token]

    rows = []
    for sentence in sentences:
        ids = [vocab[vocab.bos_token]] + vocab[tok(sentence)]
        # Truncate first so the pad length below can never be negative.
        ids = ids[:max_seqlen]
        ids += [pad_id] * (max_seqlen - len(ids))
        rows.append(ids)

    # Rows are collected in a list and converted once; concatenating onto a
    # plain Python list with torch.cat raises a TypeError.
    out = torch.tensor(rows, dtype=torch.long).reshape(-1, max_seqlen)
    print(out.shape)

    x_df = pd.DataFrame(out.numpy())
    x_df.to_csv('./data/encoded.csv', mode='w')
def __init__(self, name):
    """Start an empty token/index mapping called ``name`` with a sentencepiece tokenizer."""
    self.name = name
    self.n_tokens = 0
    self.token2index, self.index2token = {}, {}
    self.sp = SentencepieceTokenizer(get_tokenizer())
def main():
    """Fine-tune the KoBERT model plus a linear head on the NSMC files.

    Builds train/test loaders from ``NSMC_DIR``, then runs ``train`` followed
    by ``test`` once per epoch.
    """
    # data locations
    nsmc_home_dir = 'NSMC_DIR'
    train_file = nsmc_home_dir + '/ratings_train.txt'  # 150K
    test_file = nsmc_home_dir + '/ratings_test.txt'  # 50K

    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, vocab = get_pytorch_kobert_model(ctx=device_name)

    # hyper-parameters
    lr = 5e-5
    batch_size = 16
    epochs = 5
    dropout_rate = 0.1
    max_grad_norm = 1.0
    num_total_steps = math.ceil(150000 / batch_size) * epochs
    num_warmup_steps = num_total_steps // 10
    log_interval = 100
    seed = 2019
    num_workers = 2
    num_classes = 2
    pooler_out_dim = model.pooler.dense.out_features

    torch.manual_seed(seed)
    device = torch.device(device_name)
    print('device', device)

    sp = SentencepieceTokenizer(get_tokenizer())

    def build_loader(path, shuffle):
        # Both loaders share every setting except the file and the shuffling.
        dataset = MovieDataset(get_data(path, vocab, sp))
        return torch.utils.data.DataLoader(dataset,
                                           shuffle=shuffle,
                                           batch_size=batch_size,
                                           num_workers=num_workers,
                                           collate_fn=batchify,
                                           pin_memory=True)

    train_loader = build_loader(train_file, True)
    test_loader = build_loader(test_file, False)

    linear = torch.nn.Linear(pooler_out_dim, num_classes).to(device)

    all_params = list(model.parameters()) + list(linear.parameters())
    optimizer = AdamW(all_params, lr=lr, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_total_steps)

    for epoch in range(epochs):
        train(train_loader, device, model, linear, all_params, optimizer,
              scheduler, dropout_rate, max_grad_norm, log_interval, epoch)
        print(datetime.now(), 'Testing...')
        test(test_loader, device, model, linear)
def predict(sentence1, sentence2):
    """Classify a sentence pair with the stored pairwise classifier on CPU.

    Loads the configs, vocabulary, tokenizer and checkpoint from hard-coded
    locations, runs the model once and returns the first element of the
    arg-max over dimension 1 of the model output.
    """
    project_root = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification"
    caseType = "skt"
    ptr_dir = Path(project_root + "/pretrained")
    data_dir = Path(project_root + "/data")
    model_dir = Path(project_root + "/experiments/base_model")
    checkpoint_model_file = Path("best_skt.tar")

    ptr_config = Config(ptr_dir / 'config_skt.json')
    data_config = Config(data_dir / 'config.json')
    model_config = Config(model_dir / 'config.json')

    # vocab
    with open(os.path.join(ptr_dir, ptr_config.vocab), mode='rb') as vocab_fp:
        vocab = pickle.load(vocab_fp)

    ptr_tokenizer = SentencepieceTokenizer(os.path.join(ptr_dir, ptr_config.tokenizer))
    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)

    # model (restore)
    checkpoint = CheckpointManager(model_dir).load_checkpoint(checkpoint_model_file)
    config = BertConfig(os.path.join(ptr_dir, ptr_config.config))
    model = PairwiseClassifier(config, num_classes=model_config.num_classes, vocab=preprocessor.vocab)
    model.load_state_dict(checkpoint['model_state_dict'])

    model.to(torch.device('cpu'))
    if model.training:
        model.eval()

    # Each transformed element is wrapped in a list to form a batch of one.
    encoded = preprocessor.preprocess(sentence1, sentence2)
    indices, token_types = [torch.tensor([elm]) for elm in encoded]

    with torch.no_grad():
        scores = model(indices, token_types)
        best = scores.max(dim=1)[1]
        label = best.numpy()[0]

    return label
def load_module():
    """Fill the module-level ``model``, ``vocab`` and ``tok`` if they are still falsy."""
    global model, vocab, tok

    if not (model and vocab):
        model, vocab = get_pytorch_kogpt2_model()

    if tok:
        return
    tok = SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)
def main():
    """Read sentences from stdin and print the named entities found in each.

    For every input line the recognised entity words are printed joined by
    commas, or ``/`` when none are found. End of input is reported as
    ``EOF`` on stderr.
    """
    # Relative paths below are resolved against the script's own directory.
    cur_path = os.path.dirname(sys.argv[0])
    if cur_path:
        os.chdir(cur_path)

    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')

    # load vocab & tokenizer
    tok_path = "./ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    # NOTE: pickle.load executes arbitrary code; only load trusted files.
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

    # load: strip the "module." prefix from checkpoint keys and skip keys the
    # model does not have
    model_dict = model.state_dict()
    checkpoint = torch.load("./model.bin", map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not int model_dict".format(new_key_name), file=sys.stderr)
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys, strict=False)
    model.eval()
    # The model stays on the CPU; it is never moved to another device here.

    decoder_from_res = DecoderFromNamedEntitySequence(tokenizer=tokenizer, index_to_ner=index_to_ner)

    try:
        while True:
            input_text = input()
            list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
            x_input = torch.tensor(list_of_input_ids).long()
            list_of_pred_ids = model(x_input)
            list_of_ner_word = decoder_from_res(list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
            if list_of_ner_word:
                print(",".join(list_of_ner_word))
            else:
                print("/")
    except (EOFError, KeyboardInterrupt):
        # Normal termination: stdin is exhausted or the user interrupted.
        print("EOF", file=sys.stderr)
    except Exception as exc:
        # Still end the loop without a traceback, but report the real cause
        # instead of labelling every failure as "EOF".
        print("error: {!r}".format(exc), file=sys.stderr)
def main(args):
    """Print token-length statistics (max, mean, median) for the train and test files.

    Args:
        args: namespace providing ``train_file`` and ``test_file`` (jsonlines
            files whose records contain ``article_original``).
    """
    for arg in vars(args):
        print(arg, getattr(args, arg))

    tokenizer = SentencepieceTokenizer(get_tokenizer())

    def read_documents(path):
        # One document per record: sentences joined by spaces, newlines
        # removed, a single trailing newline appended.
        docs = []
        with jsonlines.open(path) as reader:
            for record in reader.iter():
                joined = " ".join(list(record['article_original']))
                docs.append(joined.replace('\n', '') + "\n")
        return docs

    def token_lengths(docs):
        # Progress is printed every 100 documents.
        total = len(docs)
        lengths = []
        for i, doc in enumerate(docs):
            if i % 100 == 0:
                print(i, total)
            lengths.append(len(tokenizer(doc)))
        return lengths

    lens = token_lengths(read_documents(args.train_file))
    tr_max_src = max(lens, default=0)

    test_lens = token_lengths(read_documents(args.test_file))
    max_src = max(test_lens, default=0)

    print("max source length train", tr_max_src)
    print("max source length test", max_src)

    print(sum(lens) / len(lens))
    print(sum(test_lens) / len(test_lens))

    import numpy as np
    print(np.median(np.array(lens)))
    print(np.median(np.array(test_lens)))
    print("done")
def __init__(self, filename, vocab, maxlen, use_emotion):
    """Read the CSV at ``filename`` into ``self.df`` and store the given settings."""
    # Store the contents of the file in a pandas dataframe
    self.df = pd.read_csv(filename, header=0, encoding='utf-8')

    self.sp = SentencepieceTokenizer(get_tokenizer())
    self.vocab, self.maxlen, self.use_emotion = vocab, maxlen, use_emotion

    # The looked-up position is discarded; the call only raises if '!' is
    # missing from the tokenizer's token list.
    self.sp.tokens.index('!')
def __init__(self):
    """Set the PAD/UNK constants and create the Mecab and sentencepiece tokenizers."""
    # special indices and their token strings
    self.PAD_IDX, self.UNK_IDX = 0, 1
    self.PAD_TOKEN, self.UNK_TOKEN = 'PAD_TOKEN', 'UNK_TOKEN'

    self.tok = Mecab()
    # only the vocabulary half of the (model, vocab) pair is kept
    self.vocab = get_pytorch_kogpt2_model()[1]
    self.tok_path = get_tokenizer()
    self.tok2 = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
def post():
    """Run NER on the submitted ``input`` form field and return the result.

    Returns:
        dict with ``'word'`` (the entity words) and ``'decoding'`` (the
        decoded sentence) as produced by ``DecoderFromNamedEntitySequence``.

    NOTE(review): vocabulary, tokenizer and model weights are reloaded on
    every call; consider caching them outside the handler.
    """
    value = request.form['input']

    model_dir = Path('./experiments/base_model_with_crf')
    model_config = Config(json_path=model_dir / 'config.json')

    # load vocab & tokenizer
    tok_path = "ptr_lm_model/tokenizer_78b3253a26.model"
    ptr_tokenizer = SentencepieceTokenizer(tok_path)

    # NOTE: pickle.load executes arbitrary code; only load trusted files.
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer,
                          pad_fn=keras_pad_fn, maxlen=model_config.maxlen)

    # load ner_to_index.json
    with open(model_dir / "ner_to_index.json", 'rb') as f:
        ner_to_index = json.load(f)
        index_to_ner = {v: k for k, v in ner_to_index.items()}

    # model
    model = KobertCRFViz(config=model_config,
                         num_classes=len(ner_to_index), vocab=vocab)

    # load: strip the "module." prefix from checkpoint keys and skip keys the
    # model does not have
    model_dict = model.state_dict()
    checkpoint = torch.load(
        "./experiments/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
        map_location=torch.device('cpu'))
    convert_keys = {}
    for k, v in checkpoint['model_state_dict'].items():
        new_key_name = k.replace("module.", '')
        if new_key_name not in model_dict:
            print("{} is not int model_dict".format(new_key_name))
            continue
        convert_keys[new_key_name] = v

    model.load_state_dict(convert_keys)
    model.eval()
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    decoder_from_res = DecoderFromNamedEntitySequence(
        tokenizer=tokenizer, index_to_ner=index_to_ner)

    input_text = value
    list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids(
        [input_text])
    # The input must be on the same device as the model; leaving it on the
    # CPU fails with a device mismatch whenever CUDA is available.
    x_input = torch.tensor(list_of_input_ids).long().to(device)
    list_of_pred_ids, _ = model(x_input)

    list_of_ner_word, decoding_ner_sentence = decoder_from_res(
        list_of_input_ids=list_of_input_ids, list_of_pred_ids=list_of_pred_ids)
    return {'word': list_of_ner_word, 'decoding': decoding_ner_sentence}
def get_preprocessor(ptr_config_info, model_config):
    """Build the ``PreProcessor`` matching ``model_config.type``.

    Args:
        ptr_config_info: provides ``vocab`` (path of a pickled vocabulary)
            and ``tokenizer`` (tokenizer model location).
        model_config: provides ``type`` (``'etri'`` or ``'skt'``) and
            ``length`` (the padding length).

    Returns:
        A ``PreProcessor`` using the selected tokenizer as ``split_fn``.

    Raises:
        ValueError: if ``model_config.type`` is neither ``'etri'`` nor ``'skt'``.
    """
    model_type = model_config.type
    # Reject unknown types up front; otherwise no branch would assign the
    # preprocessor and the return would fail with an UnboundLocalError.
    if model_type not in ('etri', 'skt'):
        raise ValueError(
            "unsupported model type: {!r} (expected 'etri' or 'skt')".format(model_type))

    # NOTE: pickle.load executes arbitrary code; only load trusted files.
    with open(ptr_config_info.vocab, mode='rb') as vocab_fp:
        vocab = pickle.load(vocab_fp)

    if model_type == 'etri':
        ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config_info.tokenizer, do_lower_case=False)
        split_fn = ptr_tokenizer.tokenize
    else:
        # the sentencepiece tokenizer object is itself the split function
        split_fn = SentencepieceTokenizer(ptr_config_info.tokenizer)

    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))
    return PreProcessor(vocab=vocab, split_fn=split_fn, pad_fn=pad_sequence)
def korean_gpt_long_setence_life_test():
    """Generate three sampled continuations of a fixed prompt and print them.

    Loads the fine-tuned KoGPT2 checkpoint from a hard-coded path, builds the
    vocabulary and tokenizer from ``config['kogpt_vocab_path']`` and prints
    each generated sequence with the special tokens removed.
    """
    config = get_config()
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = config['kogpt_vocab_path']

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    # Load the checkpoint once (it used to be read twice, the first result
    # thrown away). map_location makes a GPU-saved checkpoint usable on a
    # CPU-only machine.
    state_dict = torch.load(kogpt2_model_path, map_location=device)
    kogpt2model.load_state_dict(state_dict)

    kogpt2model.to(device)
    kogpt2model.eval()

    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    sent = '나는 밥을 먹었'
    toked = tok(sent)
    print(toked)

    input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
    input_ids = input_ids.to(device)

    outputs = kogpt2model.generate(input_ids=input_ids,
                                   max_length=100,
                                   min_length=50,
                                   repetition_penalty=1.2,
                                   do_sample=True,
                                   num_beams=3,
                                   bos_token_id=0,
                                   pad_token_id=3,
                                   eos_token_id=1,
                                   num_return_sequences=3)

    print("========수필===========")
    for i in range(3):  # 3 output sequences were generated
        toked = vocab.to_tokens(outputs[i].squeeze().tolist())
        ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '', ''.join(toked).replace('▁', ' ').strip())
        print('Generated {}: {}'.format(i, ret))
def model_fn(model_dir):
    """Load the KoGPT2 model, vocabulary and tokenizer found in ``model_dir``.

    Uses the first ``*.spiece`` and the first ``*.params`` file matched by
    glob. Returns ``(model, vocab, tok, ctx)``.
    """
    def first_match(pattern):
        return glob.glob(pattern.format(model_dir))[0]

    voc_file_name = first_match('{}/*.spiece')
    model_param_file_name = first_match('{}/*.params')

    # use the GPU when one is available, otherwise the CPU
    ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

    model, vocab = get_kogpt2_model(model_param_file_name, voc_file_name, ctx)
    return model, vocab, SentencepieceTokenizer(voc_file_name), ctx
def __init__(self, model_path=con.NER_MODEL_PATH):
    """Load the pickled lookup tables and tokenizer, then load the NER model.

    Args:
        model_path: stored as ``self.model_path``.
    """
    # Each table is unpickled from the path registered under its own name.
    for table_name in ("token_to_index", "index_to_token",
                       "entity_to_index", "index_to_entity"):
        with open(con.NER_UTIL_PATH[table_name], 'rb') as handle:
            setattr(self, table_name, pickle.load(handle))

    self.tokenizer = SentencepieceTokenizer(con.NER_UTIL_PATH["tokenizer"])
    self.model_config = con.MODEL_CONFIG
    self.model_path = model_path
    self.vocab = con.VOCAB
    self.__load_ner_model()
def chat(self, sent='0'):
    """Interactive chat loop with a 60-second decoding budget per question.

    The answer is decoded greedily one token at a time until ``EOS`` or the
    time budget is exceeded. The text is then split into sentences with
    ``kss``, the first sentence and the last two are dropped, and every word
    ending in ``*님이`` is replaced before printing. Typing ``quit`` ends
    the loop.
    """
    tok = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)

    with torch.no_grad():
        while True:
            question = input('user > ').strip()
            if question == 'quit':
                break

            # Prompt part shared by every decoding step of this question.
            prompt_ids = ([self.vocab[U_TKN]] + self.vocab[tok(question)] +
                          self.vocab[EOS, SENT] + self.vocab[sent_tokens] +
                          self.vocab[EOS, S_TKN])
            a = ''
            a_tok = []
            timeout = time.time() + 60
            while True:
                input_ids = torch.LongTensor(prompt_ids + self.vocab[a_tok]).unsqueeze(dim=0)
                pred = self(input_ids)
                predicted_ids = torch.argmax(pred, dim=-1).squeeze().numpy().tolist()
                gen = self.vocab.to_tokens(predicted_ids)[-1]
                if gen == EOS:
                    break
                a += gen.replace('▁', ' ')
                a_tok = tok(a)
                if time.time() > timeout:
                    break

            answer_list = kss.split_sentences(a)[1:-2]
            Simsimi_answer = "".join(answer_list)

            sentences = []
            for fragment in Simsimi_answer.split('.'):
                words = ["상담자님이" if word.endswith('*님이') else word
                         for word in fragment.split()]
                sentences.append(" ".join(words))

            print("Simsimi > ", ".".join(sentences))
def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):
    """Store the given vocab/tokenizer, or build both from the pretrained KoBERT assets.

    The pretrained assets are used whenever either ``vocab`` or ``tokenizer``
    is missing; only in that case are ``ptr_tokenizer`` and
    ``ptr_detokenizer`` set.
    """
    if vocab is not None and tokenizer is not None:
        self.vocab = vocab
        self.tokenizer = tokenizer
    else:
        tok_path = get_tokenizer()
        self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
        self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)

        # only the vocabulary half of the (model, vocab) pair is needed
        vocab_of_gluonnlp = get_pytorch_kobert_model()[1]
        self.vocab = Vocabulary(token2idx=vocab_of_gluonnlp.token_to_idx)
        self.tokenizer = Tokenizer(vocab=self.vocab,
                                   split_fn=self.ptr_tokenizer,
                                   pad_fn=keras_pad_fn,
                                   maxlen=maxlen)

    self.maxlen = maxlen
    self.model_dir = model_dir
def Load_Model():
    """Load the KoGPT2 chat model once and publish it through module globals.

    Sets ``vocab_global``, ``sent_tokens_global`` (the tokens of ``"0"``),
    ``kogptqa_global`` and ``tok_global``. Uses the ``ctx`` defined outside
    this function.
    """
    global vocab_global
    global sent_tokens_global
    global kogptqa_global
    global tok_global

    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    # The backslash is escaped explicitly: "\k" is an invalid escape sequence.
    # The resulting path string is unchanged.
    kogptqa.load_parameters("KoGPT2-chatbot\\kogpt2_chat.params", ctx=ctx)
    sent_tokens = tok("0")

    vocab_global = vocab
    sent_tokens_global = sent_tokens
    kogptqa_global = kogptqa
    tok_global = tok
def __init__(self, samples, vocab, media_map, word_dropout_prob=0.0, max_word_dropout_ratio=0.0, max_token_cnt=300):
    """Keep the samples and settings; ``targets`` is element 1 of each sample."""
    # data
    self.samples = samples
    self.targets = [sample[1] for sample in samples]
    self.media_map = media_map
    self.vocab = vocab

    # settings
    self.word_dropout_prob = word_dropout_prob
    self.max_word_dropout_ratio = max_word_dropout_ratio
    self.max_token_cnt = max_token_cnt

    self.tokenizer = SentencepieceTokenizer(get_tokenizer())