def jiayan_cut_nostop(content, load_lm_dir):
    """Segment classical-Chinese ``content`` with Jiayan's char-HMM tokenizer.

    No stop-word filtering is applied (hence "nostop").

    Args:
        content: Text to segment; ``None`` or ``''`` yields ``''``.
        load_lm_dir: Path to the Jiayan kenlm language-model file.

    Returns:
        The tokens joined by single spaces, or ``''`` for empty input.
    """
    # Bail out early: the original built the LM/tokenizer even when there was
    # nothing to segment, which wastes a model load.
    if content is None or content == '':
        return ""
    lm = load_lm(load_lm_dir)
    tokenizer = CharHMMTokenizer(lm)
    # tokenize() yields words; join directly instead of an append loop.
    return " ".join(tokenizer.tokenize(content))
def train_punctuator(lm_path, data_file, cut_model, out_model):
    """Train and evaluate a CRF punctuator.

    Args:
        lm_path: Path to the kenlm language model.
        data_file: Corpus used to build training examples.
        cut_model: Path to the segmentation (cut) model the punctuator needs.
        out_model: Destination path for the trained punctuator model.
    """
    punctuator = CRFPunctuator(load_lm(lm_path), cut_model)
    print('Building data...')
    X, Y = punctuator.build_data(data_file)
    train_x, train_y, test_x, test_y = punctuator.split_data(X, Y)
    # Empty the full data sets in place to free memory before training starts.
    del X[:]
    del Y[:]
    print('Training...')
    punctuator.train(train_x, train_y, out_model)
    punctuator.eval(test_x, test_y, out_model)
def train_sentencizer(lm_path, data_file, out_model):
    """Train and evaluate a CRF sentencizer.

    Args:
        lm_path: Path to the kenlm language model.
        data_file: Corpus used to build training examples.
        out_model: Destination path for the trained sentencizer model.
    """
    sentencizer = CRFSentencizer(load_lm(lm_path))
    print('Building data...')
    X, Y = sentencizer.build_data(data_file)
    train_x, train_y, test_x, test_y = sentencizer.split_data(X, Y)
    # Empty the full data sets in place to free memory before training starts.
    del X[:]
    del Y[:]
    print('Training...')
    sentencizer.train(train_x, train_y, out_model)
    sentencizer.eval(test_x, test_y, out_model)
def jiayan_cut_sample(content, load_lm_dir):
    """Segment ``content`` into a raw token list via Jiayan's char-HMM tokenizer.

    Empty or ``None`` input yields an empty list.  (Stop-word filtering was
    once planned here, per the original draft, but tokens are kept as-is.)

    Args:
        content: Text to segment.
        load_lm_dir: Path to the Jiayan kenlm language-model file.

    Returns:
        list[str]: The segmented tokens.
    """
    tokenizer = CharHMMTokenizer(load_lm(load_lm_dir))
    segmented = []
    if content != '' and content is not None:
        segmented = list(tokenizer.tokenize(content))
    return segmented
def tag_text(root, target_root):
    """Punctuate every raw text file under ``root`` into ``target_root``.

    Loads the Jiayan CRF punctuator ('jiayan.klm', 'cut_model', 'punc_model',
    resolved relative to the working directory), then writes one punctuated
    line per non-blank input line of each file.

    Args:
        root: Source directory prefix (including trailing separator).
        target_root: Output directory prefix (including trailing separator).
    """
    lm = load_lm('jiayan.klm')
    punctuator = CRFPunctuator(lm, 'cut_model')
    punctuator.load('punc_model')
    make_dir(target_root)
    file_ob_list = get_all_files(root)
    for f_name in tqdm.tqdm(file_ob_list, desc="Process unmarked file"):
        # Context managers guarantee both handles close even if punctuate()
        # raises — the original leaked the files on any error.
        with open(root + f_name, "r", encoding='utf-8-sig', errors='ignore') as src, \
             open(target_root + f_name, "w+", encoding='utf-8-sig', errors='ignore') as out:
            # Stream line-by-line rather than readlines() to avoid holding the
            # whole file in memory.
            for line in src:
                stripped = line.strip()
                if not stripped:
                    continue
                out.write(punctuator.punctuate(stripped))
                out.write('\n')
def crf_punctuate(lm_path, cut_model, punc_model, text):
    """Punctuate ``text`` with a pre-trained CRF punctuator and print the result.

    Args:
        lm_path: Path to the kenlm language model.
        cut_model: Path to the segmentation (cut) model.
        punc_model: Path to the trained punctuator model to load.
        text: Unpunctuated input text.
    """
    punctuator = CRFPunctuator(load_lm(lm_path), cut_model)
    punctuator.load(punc_model)
    result = punctuator.punctuate(text)
    print(result)
def crf_sentencize(lm_path: str, cut_model, text):
    """Split ``text`` into sentences with a pre-trained CRF sentencizer and print them.

    Args:
        lm_path: Path to the kenlm language model.
        cut_model: Path to the trained sentencizer model to load.
        text: Unsegmented input text.
    """
    sentencizer = CRFSentencizer(load_lm(lm_path))
    sentencizer.load(cut_model)
    result = sentencizer.sentencize(text)
    print(result)
def hmm_tokenize(lm_path: str, text: str):
    """Tokenize ``text`` with the char-HMM tokenizer and print the token list.

    Args:
        lm_path: Path to the kenlm language model.
        text: Input text to segment.
    """
    tokenizer = CharHMMTokenizer(load_lm(lm_path))
    tokens = list(tokenizer.tokenize(text))
    print(tokens)
def load_tokenizer(self):
    """Lazily build the Jiayan char-HMM tokenizer on first call.

    Once ``self.tokenizer`` is set, subsequent calls are no-ops, so the
    language model at ``source/jiayan.klm`` is loaded at most once.
    """
    if self.tokenizer is not None:
        return
    lm = jiayan.load_lm("source/jiayan.klm")
    self.tokenizer = jiayan.CharHMMTokenizer(lm)
import os
import re
import shutil
from collections import Counter
from pprint import pprint

from tqdm import tqdm

from jiayan import PMIEntropyLexiconConstructor
from jiayan import CharHMMTokenizer
from jiayan import WordNgramTokenizer
from jiayan import CRFSentencizer
from jiayan import CRFPunctuator
from jiayan import CRFPOSTagger
from jiayan import load_lm

# --- Tokenize origin_record.txt line by line with the char-HMM tokenizer ---
lm = load_lm('/home/zy/mnt/nlp_test/Jiayan/jiayan_models/jiayan.klm')
tokenizer = CharHMMTokenizer(lm)
out_ls = []   # one space-separated token string per input line
words = []    # flat list of every token seen
# Keep only ASCII alphanumerics and CJK ideographs.
reg = "[^0-9A-Za-z\u4e00-\u9fa5]"
_reg = re.compile(reg)  # compile once instead of re-parsing the pattern per line
# NOTE(review): fp/fg are deliberately left open — code past this chunk may
# still use them; ideally the whole script would wrap them in `with` blocks.
fp = open("origin_record.txt", "r")
fg = open("tmp.out", "w")
ls = fp.readlines()
for i in ls:
    i = _reg.sub("", i)
    gg = list(tokenizer.tokenize(i))
    words.extend(gg)
    # join runs in O(n); the original `tstr = tstr + j + " "` loop was quadratic
    tstr = "".join(j + " " for j in gg) + "\n"
    out_ls.append(tstr)
f.write("\n") f.close() def list_to_text(p_list): txt = "" for p in p_list: txt = txt + p + " " return txt[:-1] if __name__ == '__main__': lm_path = 'C:/TJlab/Tang/chinese_poetry/jiayan.klm' print('\nTokenizing test text with HMM...') # init_file() lm = load_lm(lm_path) hmm_tokenizer = CharHMMTokenizer(lm) tang_tokenizer = TangCharHMMTokenizer(lm) # f = open("resource/qujiang_raw.txt", encoding='utf-8') # line = f.readline() # while line: # # list_to_file("resource/qujiang_hmm.txt", list(tang_tokenizer.tokenize(line))) # list_to_file("resource/qujiang_tang.txt", tang_tokenizer.intervene_tokenize(line)) # # list_to_file("resource/qujiang_tang_trans.txt", tang_tokenizer.intervene(line)) # line = f.readline() # f.close() text0 = "送春归,三月尽日日暮时。去年杏园花飞御沟绿,何处送春曲江曲。今年杜鹃花落子规啼,送春何处西江西。帝城送春犹怏怏" \ ",天涯送春能不加惆怅。莫惆怅,送春人。冗员无替五年罢,应须准拟再送浔阳春。五年炎凉凡十变,又知此身健不健。" \ "好去今年江上春,明年未死还相见。"