def train(self, filename):
    verbose = False
    mc = 10    # min_count
    ft = 0.3   # force_abs_threshold
    nt = -0.3  # nonspace_threshold
    st = 0.3   # space_threshold

    model = CountSpace()

    rootDirPath = self.util.getRootPath("SmiToText.SmiToText")
    corpus_fname = rootDirPath + os.path.sep + "data" + os.path.sep + "koDetokenizerData" + os.path.sep + "ko_law_common_space.txt"
    model_fname = rootDirPath + os.path.sep + "kosoy-models" + os.path.sep + "soyspacing.model"

    ### Training
    # model.train(corpus_fname)
    # model.save_model(model_fname, json_format=False)

    ## Load the trained model
    model.load_model(model_fname, json_format=False)

    # sent = '이건진짜좋은영화 라라랜드진짜좋은영화'
    # sent = '그일단그구성원인사람들과,,'
    sent = 'DAB는, 결정과 관련한 각 위원들의 모든 일당 수수료와 경비에 대한 청구금액이 완전하게 지급될 때 까지는, 결정문을 발급할 의무를 갖지 아니한다.'
    sent_input = sent.replace(" ", "")

    # correction with explicit parameters
    sent_output_1, tags = model.correct(doc=sent_input,
                                        verbose=verbose,
                                        force_abs_threshold=ft,
                                        nonspace_threshold=nt,
                                        space_threshold=st,
                                        min_count=mc)

    # correction with default parameters
    sent_output_2, tags = model.correct(sent_input)

    print(sent)
    print(sent_output_1)
    print(sent_output_2)
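# A minimal sketch of the training path that the commented-out lines above refer to,
# using the same CountSpace calls (train / save_model / load_model / correct) that
# appear in this file. The corpus and model paths below are hypothetical placeholders.
from soyspacing.countbase import CountSpace

spacing_model = CountSpace()
spacing_model.train('data/ko_spacing_corpus.txt')                      # hypothetical corpus path
spacing_model.save_model('kosoy-models/soyspacing.model', json_format=False)

# reload and sanity-check on a short unspaced sentence
spacing_model.load_model('kosoy-models/soyspacing.model', json_format=False)
corrected, tags = spacing_model.correct('이건진짜좋은영화')
print(corrected)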
class Pixir:
    def __init__(self, max_seq_len):
        self.max_seq_len = max_seq_len
        self.input_text = None
        self.input_tokens = None
        self.input_embedding = None
        self.spacing_model = CountSpace()
        self.stage1_generator = None
        self.bert_model = None

    def load_spacing_model(self, model_path):
        self.spacing_model.load_model(model_path, json_format=False)

    def load_bert_model(self, model_path):
        paths = get_checkpoint_paths(model_path)
        self.bert_model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint,
                                                             training=False, seq_len=self.max_seq_len)

    def load_stage1_generator(self, model_path):
        self.stage1_generator = Stage1WGANGP(768, 64, 0.1, 0.1, 1, 1, 1).generator
        # self.stage1_generator.load_weights(model_path)

    def spacing(self, text):
        sentence_corrected, tags = self.spacing_model.correct(text)
        self.input_text = sentence_corrected
        print(self.input_text)

    def tokenize(self):
        tokenizer = FullTokenizer('vocab.korean.rawtext.list')
        tokens = tokenize(self.input_text, tokenizer, self.max_seq_len)
        self.input_tokens = tokens

    def embedding(self):
        segments = np.ones_like(self.input_tokens)
        self.input_embedding = self.bert_model.predict([self.input_tokens, segments])

    def generate_stage1(self):
        z_noise = np.random.normal(0, 1, (self.input_embedding.shape[0], 100))
        img, _ = self.stage1_generator.predict([self.input_embedding, z_noise])
        img = (img + 1) / 2
        return Image.fromarray(img)

    def text2img(self, input_text):
        self.spacing(input_text)
        self.tokenize()
        self.embedding()
        img = self.generate_stage1()
        return img
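# A usage sketch for the Pixir pipeline above (space -> tokenize -> BERT embedding ->
# stage-1 generator). All model paths below are hypothetical placeholders.
pixir = Pixir(max_seq_len=128)
pixir.load_spacing_model('models/soyspacing.model')
pixir.load_bert_model('models/korbert')                # directory holding BERT config/checkpoint
pixir.load_stage1_generator('models/stage1_generator.h5')

image = pixir.text2img('강아지가공원에서뛰어논다')
image.save('stage1_output.png')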
def run_preprocess(inputPath: str, outputPath: str, modelPath: str, module: str):
    if module == "countSpace":
        model = CountSpace()
        model.load_model(modelPath, json_format=False)

        with open(inputPath, 'r', encoding='utf-8') as inputData, \
                open(outputPath, 'w', encoding='utf-8') as outputData:
            for sentence in inputData:
                sentence = sentence.strip()
                if not sentence:
                    continue
                sentence_corrected, _ = model.correct(sentence)
                outputData.write(sentence_corrected + "\n")
    elif module == "normalizer":
        print("do something")
    elif module == "noun":
        print("do something")
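# A hedged sketch of how run_preprocess might be wired to the command line;
# the argument names are illustrative and not part of the original code.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--model", required=True)
    parser.add_argument("--module", default="countSpace",
                        choices=["countSpace", "normalizer", "noun"])
    args = parser.parse_args()
    run_preprocess(args.input, args.output, args.model, args.module)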
def apply_space_correct(corpus_fname, model_fname, output_corpus_fname, with_label=False):
    model = CountSpace()
    model.load_model(model_fname, json_format=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_corpus_fname, 'w', encoding='utf-8') as f2:
        for sentence in f1:
            if with_label:
                sentence, label = sentence.strip().split("\u241E")
            else:
                sentence = sentence.strip()
                label = None
            if not sentence:
                continue
            sent_corrected, _ = model.correct(sentence)
            if with_label:
                f2.write(sent_corrected + "\u241E" + label + "\n")
            else:
                f2.write(sent_corrected + "\n")
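# Example calls for apply_space_correct, assuming the file paths exist (they are
# hypothetical here). With with_label=True, each input line is expected to hold
# "sentence\u241Elabel", joined by the same U+241E separator used above.
apply_space_correct('corpus_raw.txt', 'soyspacing.model', 'corpus_spaced.txt')
apply_space_correct('corpus_labeled.txt', 'soyspacing.model', 'corpus_labeled_spaced.txt',
                    with_label=True)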
# model.load_model('model_spacing_3.h5', json_format=False)
# model.train('./korquad_3.txt')
# model.save_model('model_spacing_4.h5', json_format=False)

verbose = False
mc = 10    # min_count
ft = 0.4   # force_abs_threshold
nt = -0.3  # nonspace_threshold
st = 0.4   # space_threshold

sentence = '지않고'

# with parameters
sentence_corrected, tags = model.correct(doc=sentence,
                                         verbose=verbose,
                                         force_abs_threshold=ft,
                                         nonspace_threshold=nt,
                                         space_threshold=st,
                                         min_count=mc)

# without parameters
sentence_corrected, tags = model.correct(sentence)

f = open('rules.txt', mode='wt', encoding='utf-8')
# f.write('진짜 101\n')
# f.write('방울 101\n')
# f.write('나는 101\n')
# f.write('너를 101\n')
# f.write('영화 101\n')
# f.write('마리의 1001\n')
# f.write('강아지가 10001\n')
# f.write('저글링을 10001\n')
# f.write('한다 101\n')
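# The commented-out rule lines above pair a word with a digit string one character
# longer than the word (e.g. '마리의 1001'); reading those digits as space(1)/no-space(0)
# decisions around each syllable is an assumption here, not something stated above.
# Closing the handle before RuleDict('rules.txt') reads the file in the next snippet
# ensures the (possibly empty) rule file is flushed to disk.
f.close()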
model = CountSpace()
model.load_model(model_fname, json_format=False)

rule_dict = RuleDict('rules.txt')

text1 = '감사합니다 앞으로도 잘부탁드려요 풍성한토핑 맛난피자로 보답하겠습니다'
text2 = '맛있게 잘 먹었습니다~'
text3 = '마시써효!!!떡볶이도좋아요'
text4 = '불고기는 처음 시켜봤는데 상상 그이상....'
text5 = '냠냠~너무 맛있어용^^ 또 시켜먹어요넘나맛있네여피짜로덤왜인기가잇는지알겟둠원픽예약임툐쿄'
text6 = '영등포피자중 이찌방'
text7 = 'ㅋㅋㅋㅋ 파인애플 당연 추가한줄알고 실수했네요죄송염~~오늘도 맛나게 잘 먹겠습니다^^샐러드가 생각보다 푸짐하게 왔네요'

sent_corrected, tags = model.correct(text1, rules=rule_dict)
sent_corrected2, tags = model.correct(text2, rules=rule_dict)
sent_corrected3, tags = model.correct(text3, rules=rule_dict)
sent_corrected4, tags = model.correct(text4, rules=rule_dict)
sent_corrected5, tags = model.correct(text5, rules=rule_dict)
sent_corrected6, tags = model.correct(text6, rules=rule_dict)
sent_corrected7, tags = model.correct(text7, rules=rule_dict)

print('======soynlp====')
print(sent_corrected)
print(sent_corrected2)
print(sent_corrected3)
print(sent_corrected4)
print(sent_corrected5)
print(sent_corrected6)
print(sent_corrected7)
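# Both call forms appear above; a quick side-by-side with the same model and rule
# dictionary shows what the user rules change relative to the plain statistical model.
baseline, _ = model.correct(text3)                    # model statistics only
with_rules, _ = model.correct(text3, rules=rule_dict)  # statistics + user rules
print(baseline)
print(with_rules)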
class Tag_dict:
    def __init__(self, content):
        self.content = content
        self.komoran = Komoran(userdic=os.getcwd() + '/user_dic.txt')
        self.model = CountSpace()
        self.adjective_dict = dict()    # adjectives (형용사): VA, VCN, VCP
        self.adverb_dict = dict()       # adverbs (부사): MAG
        self.conjunction_dict = dict()  # conjunctions (접속사): MAJ
        self.determiner_dict = dict()   # determiners (관형사): MM
        self.eomi_dict = dict()         # endings (어미): EC, EF, ETM, ETN
        self.josa_dict = dict()         # particles (조사): JC, JKC, JKG, JKV, JKB, JKO, JKQ, JKS, JX
        self.noun_dict = dict()         # nouns (명사): NNG, NNB, NNP, NP, NR
        self.preEomi_dict = dict()      # pre-final endings (선어말어미): EP
        self.suffix_dict = dict()       # affixes (접사): XPN, XSA, XSN, XSV
        self.verb_dict = dict()         # verbs (동사): VV, VX
        self.wordDict = dict()

    def judge_tag(self):
        for text in self.content:
            posList = self.komoran.pos(text)
            for pos in posList:
                # preprocessing
                word = re.sub("[ㄱ-ㅎ|ㅏ-ㅣ|.,?!]", repl="", string=str(pos[0]))
                if word == "":
                    continue
                # separate tag & count
                tagName = tag_switch(pos[1])
                if tagName != -1:
                    if tagName == "adjective":
                        self.adjective_dict = tag_cnt(word, self.adjective_dict)
                    elif tagName == "adverb":
                        self.adverb_dict = tag_cnt(word, self.adverb_dict)
                    elif tagName == "conjunction":
                        self.conjunction_dict = tag_cnt(word, self.conjunction_dict)
                    elif tagName == "determiner":
                        self.determiner_dict = tag_cnt(word, self.determiner_dict)
                    elif tagName == "eomi":
                        self.eomi_dict = tag_cnt(word, self.eomi_dict)
                    elif tagName == "josa":
                        self.josa_dict = tag_cnt(word, self.josa_dict)
                    elif tagName == "noun":
                        self.noun_dict = tag_cnt(word, self.noun_dict)
                    elif tagName == "preEomi":
                        self.preEomi_dict = tag_cnt(word, self.preEomi_dict)
                    elif tagName == "suffix":
                        self.suffix_dict = tag_cnt(word, self.suffix_dict)
                    elif tagName == "verb":
                        self.verb_dict = tag_cnt(word, self.verb_dict)

    def cnt_origin_word(self):
        if type(self.wordDict) is list:
            return
        for text in self.content:
            sent_corrected, tags = self.model.correct(text)
            words = del_special_char(sent_corrected).split(" ")
            for word in words:
                if word not in self.wordDict.keys():
                    self.wordDict[word] = 0
                self.wordDict[word] += 1

    def print_len(self):
        print("text line:", len(self.content))

    def print_noun_list(self):
        self.judge_tag()
        print(self.noun_dict)

    def print_tag_frequency(self, cnt=30):
        """
        print dict values frequency (descending)
        Args:
            :param: cnt(int)
        Returns:
            :param: tagDict(1st ~ until cnt-th) (dict)
        """
        self.judge_tag()
        self.adjective_dict = sorted(self.adjective_dict.items(), key=lambda x: x[1], reverse=True)
        self.adverb_dict = sorted(self.adverb_dict.items(), key=lambda x: x[1], reverse=True)
        self.conjunction_dict = sorted(self.conjunction_dict.items(), key=lambda x: x[1], reverse=True)
        self.determiner_dict = sorted(self.determiner_dict.items(), key=lambda x: x[1], reverse=True)
        self.eomi_dict = sorted(self.eomi_dict.items(), key=lambda x: x[1], reverse=True)
        self.josa_dict = sorted(self.josa_dict.items(), key=lambda x: x[1], reverse=True)
        self.noun_dict = sorted(self.noun_dict.items(), key=lambda x: x[1], reverse=True)
        self.preEomi_dict = sorted(self.preEomi_dict.items(), key=lambda x: x[1], reverse=True)
        self.suffix_dict = sorted(self.suffix_dict.items(), key=lambda x: x[1], reverse=True)
        self.verb_dict = sorted(self.verb_dict.items(), key=lambda x: x[1], reverse=True)

        print("형용사(adjective):")
        print(self.adjective_dict[:cnt])
        print("\n부사(adverb):")
        print(self.adverb_dict[:cnt])
        print("\n접속사(conjunction):")
        print(self.conjunction_dict[:cnt])
        print("\n관형사(determiner):")
        print(self.determiner_dict[:cnt])
        print("\n어미(eomi):")
        print(self.eomi_dict[:cnt])
        print("\n조사(josa):")
        print(self.josa_dict[:cnt])
        print("\n명사(noun):")
        print(self.noun_dict[:cnt])
        print("\n선어말어미(preEomi):")
        print(self.preEomi_dict[:cnt])
        print("\n접사(suffix):")
        print(self.suffix_dict[:cnt])
        print("\n동사(verb):")
        print(self.verb_dict[:cnt])

    def print_origin_frequency(self, cnt=30):
        """
        print origin values frequency (descending)
        Args:
            :param: cnt(int)
        """
        self.cnt_origin_word()
        self.wordDict = sorted(self.wordDict.items(), key=lambda x: x[1], reverse=True)
        print(self.wordDict[:cnt])

    def print_dict(self, tagName):
        self.judge_tag()
        if tagName == "adjective":
            for tag in self.adjective_dict.keys():
                print(tag)
        elif tagName == "adverb":
            for tag in self.adverb_dict.keys():
                print(tag)
        elif tagName == "conjunction":
            for tag in self.conjunction_dict.keys():
                print(tag)
        elif tagName == "determiner":
            for tag in self.determiner_dict.keys():
                print(tag)
        elif tagName == "eomi":
            for tag in self.eomi_dict.keys():
                print(tag)
        elif tagName == "josa":
            for tag in self.josa_dict.keys():
                print(tag)
        elif tagName == "noun":
            for tag in self.noun_dict.keys():
                print(tag)
        elif tagName == "preEomi":
            for tag in self.preEomi_dict.keys():
                print(tag)
        elif tagName == "suffix":
            for tag in self.suffix_dict.keys():
                print(tag)
        elif tagName == "verb":
            for tag in self.verb_dict.keys():
                print(tag)

    def print_morph(self):
        for text in self.content:
            result = self.komoran.morphs(text)
            print(result)

    def print_pos(self):
        for text in self.content:
            result = self.komoran.pos(text)
            print(result)

    def save_compare(self, form):
        result = ""
        if form == "morph":
            for text in self.content:
                result += text + str(self.komoran.morphs(text)) + "\n\n"
        elif form == "pos":
            for text in self.content:
                result += text + str(self.komoran.pos(text)) + "\n\n"
        save_text_file(filename, result, form)

    def save_origin_frequency(self):
        result = ""
        self.judge_tag()
        self.cnt_origin_word()
        if type(self.wordDict) is dict:
            self.wordDict = sorted(self.wordDict.items(), key=lambda x: x[1], reverse=True)

        # save result as .txt
        for key_value in self.wordDict:
            result += str(key_value) + "\n"
        save_text_file(filename, result, "origin")

        # save new word dict to misspell_origin.xlsx
        # load existing values & build a dictionary
        pastData = read_xlsx_file()
        pastDataDict = dict()
        for i in range(pastData.shape[0]):
            valList = list()
            for j in range(1, pastData.shape[1]):
                if type(pastData.loc[i][j]) is str:
                    valList.append(pastData.loc[i][j])
                else:
                    break
            pastDataDict[pastData.loc[i][0]] = valList
        pastData_keyList = list(pastDataDict.keys())  # used to drop duplicate words

        # make current values as a list
        current_data_list = list(dict(self.wordDict).keys())

        # make new dict list (drop duplicate words)
        newDictList = list(set(pastData_keyList + current_data_list))
        newDictList.remove("")  # delete empty element

        # re-write contents (data/misspell_origin.xlsx)
        rewrite_xlxs_file(pastDataDict, newDictList, "misspell_origin.xlsx")
        print("===== Finish: save new word list to data/misspell_origin.xlsx =====")

    def save_noun_standard(self):
        # count origin word frequency
        self.judge_tag()
        self.cnt_origin_word()
        if type(self.wordDict) is dict:
            self.wordDict = sorted(self.wordDict.items(), key=lambda x: x[1], reverse=True)

        # load existing values & build a dictionary
        pastData = read_xlsx_file("noun_standard")
        pastDataDict = dict()
        for i in range(pastData.shape[0]):
            valList = list()
            for j in range(1, pastData.shape[1]):
                if type(pastData.loc[i][j]) is str:
                    valList.append(pastData.loc[i][j])
                else:
                    break
            pastDataDict[pastData.loc[i][0]] = valList
        pastData_keyList = list(pastDataDict.keys())  # used to drop duplicate words

        # make current values as a list
        current_data_list = list()
        for noun in self.noun_dict.keys():
            current_data_list.append(noun)
        for key_value in self.wordDict:
            tmp = key_value[0]
            for noun in current_data_list:
                if noun in key_value[0]:
                    tmp = tmp.replace(noun, "")
            if tmp != "":
                current_data_list.append(tmp)

        # make new dict list (drop duplicate words)
        newDictList = list(set(pastData_keyList + current_data_list))
        if "" in newDictList:
            newDictList.remove("")  # delete empty element

        # re-write contents (data/misspell_noun_standard.xlsx)
        rewrite_xlxs_file(pastDataDict, newDictList, "misspell_noun_standard.xlsx")
        print("===== Finish: save new word list to data/misspell_noun_standard.xlsx =====")
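# Tag_dict relies on helpers that are not shown above (tag_switch, tag_cnt,
# del_special_char, save_text_file, read_xlsx_file, rewrite_xlxs_file). The two below
# are hypothetical reconstructions inferred only from how they are called in
# judge_tag() and from the POS lists in __init__; the real implementations may differ.
def tag_switch(pos_tag):
    # Map a Komoran POS tag to the category names used in judge_tag(), or -1 if ignored.
    mapping = {
        "VA": "adjective", "VCN": "adjective", "VCP": "adjective",
        "MAG": "adverb", "MAJ": "conjunction", "MM": "determiner",
        "EC": "eomi", "EF": "eomi", "ETM": "eomi", "ETN": "eomi",
        "JC": "josa", "JKC": "josa", "JKG": "josa", "JKV": "josa",
        "JKB": "josa", "JKO": "josa", "JKQ": "josa", "JKS": "josa", "JX": "josa",
        "NNG": "noun", "NNB": "noun", "NNP": "noun", "NP": "noun", "NR": "noun",
        "EP": "preEomi",
        "XPN": "suffix", "XSA": "suffix", "XSN": "suffix", "XSV": "suffix",
        "VV": "verb", "VX": "verb",
    }
    return mapping.get(pos_tag, -1)


def tag_cnt(word, tag_dict):
    # Increment the count for word and return the updated dictionary,
    # matching the "dict = tag_cnt(word, dict)" call pattern used above.
    tag_dict[word] = tag_dict.get(word, 0) + 1
    return tag_dict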