import numpy as np
from PIL import Image
from keras_bert import get_checkpoint_paths, load_trained_model_from_checkpoint
from soyspacing.countbase import CountSpace
# Project-local dependencies (not shown here): FullTokenizer, tokenize, Stage1WGANGP.


class Pixir:
    def __init__(self, max_seq_len):
        self.max_seq_len = max_seq_len
        self.input_text = None
        self.input_tokens = None
        self.input_embedding = None
        self.spacing_model = CountSpace()
        self.stage1_generator = None
        self.bert_model = None

    def load_spacing_model(self, model_path):
        self.spacing_model.load_model(model_path, json_format=False)

    def load_bert_model(self, model_path):
        paths = get_checkpoint_paths(model_path)
        self.bert_model = load_trained_model_from_checkpoint(
            paths.config, paths.checkpoint, training=False, seq_len=self.max_seq_len)

    def load_stage1_generator(self, model_path):
        self.stage1_generator = Stage1WGANGP(768, 64, 0.1, 0.1, 1, 1, 1).generator
        # self.stage1_generator.load_weights(model_path)

    def spacing(self, text):
        # Correct the word spacing before tokenization.
        sentence_corrected, tags = self.spacing_model.correct(text)
        self.input_text = sentence_corrected
        print(self.input_text)

    def tokenize(self):
        tokenizer = FullTokenizer('vocab.korean.rawtext.list')
        self.input_tokens = tokenize(self.input_text, tokenizer, self.max_seq_len)

    def embedding(self):
        segments = np.zeros_like(self.input_tokens)  # single-sentence input: segment id 0
        self.input_embedding = self.bert_model.predict([self.input_tokens, segments])

    def generate_stage1(self):
        z_noise = np.random.normal(0, 1, (self.input_embedding.shape[0], 100))
        img, _ = self.stage1_generator.predict([self.input_embedding, z_noise])
        img = (img + 1) / 2  # rescale generator output from [-1, 1] to [0, 1]
        # Image.fromarray needs a single uint8 image, not a float batch.
        return Image.fromarray((img[0] * 255).astype(np.uint8))

    def text2img(self, input_text):
        self.spacing(input_text)
        self.tokenize()
        self.embedding()
        return self.generate_stage1()
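# A minimal usage sketch of the Pixir pipeline above; all paths are
# hypothetical and depend on where the actual checkpoints live.
if __name__ == '__main__':
    pixir = Pixir(max_seq_len=128)
    pixir.load_spacing_model('soyspacing.model')
    pixir.load_bert_model('korbert_checkpoint_dir')
    pixir.load_stage1_generator('stage1_generator_weights.h5')
    image = pixir.text2img('텍스트에서이미지를만든다')
    image.save('stage1_output.png')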
from soyspacing.countbase import CountSpace


def run_preprocess(inputPath: str, outputPath: str, modelPath: str, module: str):
    if module == "countSpace":
        model = CountSpace()
        model.load_model(modelPath, json_format=False)
        with open(inputPath, 'r', encoding='utf-8') as inputData, \
                open(outputPath, 'w', encoding='utf-8') as outputData:
            for sentence in inputData:
                sentence = sentence.strip()
                if not sentence:
                    continue
                sentence_corrected, _ = model.correct(sentence)
                outputData.write(sentence_corrected + "\n")
    elif module == "normalizer":
        print("do something")
    elif module == "noun":
        print("do something")
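# Example invocation; the corpus and model paths are hypothetical.
run_preprocess('raw_corpus.txt', 'spaced_corpus.txt', 'soyspacing.model', 'countSpace')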
def apply_space_correct(corpus_fname, model_fname, output_corpus_fname, with_label=False):
    model = CountSpace()
    model.load_model(model_fname, json_format=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_corpus_fname, 'w', encoding='utf-8') as f2:
        for sentence in f1:
            if with_label:
                sentence, label = sentence.strip().split("\u241E")
            else:
                sentence = sentence.strip()
                label = None
            if not sentence:
                continue
            sent_corrected, _ = model.correct(sentence)
            if with_label:
                f2.write(sent_corrected + "\u241E" + label + "\n")
            else:
                f2.write(sent_corrected + "\n")
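# Example calls for both corpus layouts (paths are hypothetical). In the
# labeled layout each line is "sentence\u241Elabel", so only the sentence
# part is spacing-corrected and the label is written back unchanged.
apply_space_correct('corpus.txt', 'soyspacing.model', 'corpus_spaced.txt', with_label=False)
apply_space_correct('labeled_corpus.txt', 'soyspacing.model', 'labeled_spaced.txt', with_label=True)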
def train(self, filename):
    verbose = False
    mc = 10    # min_count
    ft = 0.3   # force_abs_threshold
    nt = -0.3  # nonspace_threshold
    st = 0.3   # space_threshold

    model = CountSpace()
    rootDirPath = self.util.getRootPath("SmiToText.SmiToText")
    corpus_fname = (rootDirPath + os.path.sep + "data" + os.path.sep
                    + "koDetokenizerData" + os.path.sep + "ko_law_common_space.txt")
    model_fname = rootDirPath + os.path.sep + "kosoy-models" + os.path.sep + "soyspacing.model"

    ### Training
    # model.train(corpus_fname)
    # model.save_model(model_fname, json_format=False)

    ## Load the model
    model.load_model(model_fname, json_format=False)

    # sent = '이건진짜좋은영화 라라랜드진짜좋은영화'
    # sent = '그일단그구성원인사람들과,,'
    sent = 'DAB는, 결정과 관련한 각 위원들의 모든 일당 수수료와 경비에 대한 청구금액이 완전하게 지급될 때 까지는, 결정문을 발급할 의무를 갖지 아니한다.'
    sent_input = sent.replace(" ", "")

    # With explicit parameters
    sent_output_1, tags = model.correct(doc=sent_input, verbose=verbose,
                                        force_abs_threshold=ft,
                                        nonspace_threshold=nt,
                                        space_threshold=st, min_count=mc)
    # Without parameters (library defaults)
    sent_output_2, tags = model.correct(sent_input)

    print(sent)
    print(sent_output_1)
    print(sent_output_2)
from soyspacing.countbase import CountSpace

# Earlier training rounds, kept commented out:
# model.load_model('model_spacing.h5', json_format=False)
# model.train('./korquad_1.txt')
# model.save_model('model_spacing_2.h5', json_format=False)

# model.train(corpus_file_name)
# model.save_model('model_spacing.h5', json_format=False)
# model = CountSpace.load_model('model_spacing.h5', json_format=False)
# model.train()

# model_2_file_name = '../KorQuAD_2.1_train_00/korquad2.1_train_0.json'
# model_2 = CountSpace()
# model.train(model_2_file_name)
# model.save_model('model_2_spacing', json_format=False)

model = CountSpace()
model.load_model('model_spacing', json_format=False)
model.train('korquad.txt')
model.save_model('korean_spacing_model.h5', json_format=False)

# model = CountSpace()
# model.load_model('model_spacing_3.h5', json_format=False)
# model.train('./korquad_3.txt')
# model.save_model('model_spacing_4.h5', json_format=False)

verbose = False
mc = 10    # min_count
ft = 0.4   # force_abs_threshold
nt = -0.3  # nonspace_threshold
st = 0.4   # space_threshold
sentence = '지않고'
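# The snippet breaks off before the parameters above are used; a plausible
# next step, mirroring the parameterized call in train() earlier, would be:
corrected, tags = model.correct(doc=sentence, verbose=verbose,
                                force_abs_threshold=ft,
                                nonspace_threshold=nt,
                                space_threshold=st,
                                min_count=mc)
print(corrected)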
import pickle

import matplotlib.pyplot as plt
from krwordrank.word import summarize_with_keywords
from soykeyword.lasso import LassoKeywordExtractor
from soynlp.noun import LRNounExtractor_v2
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
from soyspacing.countbase import RuleDict, CountSpace
from wordcloud import WordCloud

space = CountSpace()
space.load_model('soyspacing.model', json_format=False)

with open('grouped.pickle', 'rb') as f:
    grouped = pickle.load(f)
with open('nouns.pickle', 'rb') as f:
    nouns = pickle.load(f)
with open('words.pickle', 'rb') as f:
    words = pickle.load(f)

scores = {w: s.score for w, s in nouns.items()}
# scores.update(
#     {w: s.cohesion_forward + scores.get(w, 0) for w, s in words.items()})
# print(scores["가"])

tokenizer = MaxScoreTokenizer(scores)
# tokenizer = LTokenizer(scores)


def keywords(doc):
    # correct() returns (corrected_sentence, tags); use the corrected text.
    doc, _ = space.correct(doc)
    tokens = tokenizer.tokenize(doc, flatten=False)
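# keywords() is cut off above. A minimal sketch of how the otherwise-unused
# imports are typically combined: KR-WordRank scores keywords and WordCloud
# renders them. The corpus and font path below are hypothetical.
sentences = ['감사합니다 앞으로도 잘 부탁드려요', '떡볶이도 좋아요']  # placeholder corpus
keyword_scores = summarize_with_keywords(sentences, min_count=1, max_length=10)
wc = WordCloud(font_path='NanumGothic.ttf', background_color='white')
wc.generate_from_frequencies(keyword_scores)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()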
from soyspacing.countbase import CountSpace
from soyspacing.countbase import RuleDict

# ===== soynlp =====
# Save the model (`model` is assumed to have been trained further up).
model_fname = 'ver1spacing.h5'
model.save_model(model_fname, json_format=False)

model = CountSpace()
model.load_model(model_fname, json_format=False)

rule_dict = RuleDict('rules.txt')

text1 = '감사합니다 앞으로도 잘부탁드려요 풍성한토핑 맛난피자로 보답하겠습니다'
text2 = '맛있게 잘 먹었습니다~'
text3 = '마시써효!!!떡볶이도좋아요'
text4 = '불고기는 처음 시켜봤는데 상상 그이상....'
text5 = '냠냠~너무 맛있어용^^ 또 시켜먹어요넘나맛있네여피짜로덤왜인기가잇는지알겟둠원픽예약임툐쿄'
text6 = '영등포피자중 이찌방'
text7 = 'ㅋㅋㅋㅋ 파인애플 당연 추가한줄알고 실수했네요죄송염~~오늘도 맛나게 잘 먹겠습니다^^샐러드가 생각보다 푸짐하게 왔네요'

sent_corrected, tags = model.correct(text1, rules=rule_dict)
sent_corrected2, tags = model.correct(text2, rules=rule_dict)
sent_corrected3, tags = model.correct(text3, rules=rule_dict)
sent_corrected4, tags = model.correct(text4, rules=rule_dict)
sent_corrected5, tags = model.correct(text5, rules=rule_dict)
sent_corrected6, tags = model.correct(text6, rules=rule_dict)
sent_corrected7, tags = model.correct(text7, rules=rule_dict)

print('======soynlp====')
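# A quick check of the rule dictionary's effect: soyspacing applies the
# entries from rules.txt on top of the statistical model, so correcting
# with and without `rules` can produce different spacings.
without_rules, _ = model.correct(text1)
with_rules, _ = model.correct(text1, rules=rule_dict)
print(without_rules)
print(with_rules)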
import joblib
import numpy as np
import sentencepiece as spm
from gensim.models import FastText
from soyspacing.countbase import CountSpace
# Project-local dependencies (not shown here): hanbon_dict, split_text_cleaner.


def hangul_to_kana_converter(x):
    # str.replace already substitutes every occurrence, so one pass over
    # the mapping is enough.
    output = x
    for key, value in hanbon_dict.items():
        output = output.replace(key, value)
    return output


def kana_to_hangul_converter(x):
    output = x
    for key, value in hanbon_dict.items():
        output = output.replace(value, key)
    return output


spacing_model = CountSpace()
spacing_model.load_model('./embedding/healthnews_spacing_model', json_format=False)

sp = spm.SentencePieceProcessor()
sp.Load('healthcare_hanbon.model')

ftmodel = FastText.load('./embedding/fasttext_healthqna.model')


def ft_dimension_retriever(x):
    try:
        return ftmodel.wv[x]
    except KeyError:
        return np.repeat(0, 200)  # fall back to a zero vector for OOV input


def final_meanvector_retriever(x):
    # spacing correction -> cleaning -> hangul-to-kana -> sentencepiece
    # -> kana-to-hangul -> fasttext lookup -> mean pooling
    corrected = spacing_model.correct(str(x))[0]
    pieces = sp.EncodeAsPieces(hangul_to_kana_converter(split_text_cleaner(corrected)))
    vectors = ft_dimension_retriever([kana_to_hangul_converter(each) for each in pieces])
    return np.mean(vectors, axis=0)


classifier = joblib.load('ensembled_classifier.pkl')


def symptom_classifier(x):
    ...  # body cut off in the source
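# Hypothetical end-to-end use of the pipeline above: embed a query and feed
# the mean vector to the ensembled classifier (the query text is made up,
# and symptom_classifier() above presumably wraps this same call).
query = '머리가 아프고 열이 나요'
prediction = classifier.predict([final_meanvector_retriever(query)])
print(prediction)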