Example #1
# Assumed imports for this snippet (module paths are project-specific and hedged):
#   import numpy as np
#   from PIL import Image
#   from soyspacing.countbase import CountSpace
#   from keras_bert import get_checkpoint_paths, load_trained_model_from_checkpoint
#   FullTokenizer, tokenize, and Stage1WGANGP come from the surrounding project.
class Pixir:
    def __init__(self, max_seq_len):
        self.max_seq_len = max_seq_len

        self.input_text = None
        self.input_tokens = None
        self.input_embedding = None

        self.spacing_model = CountSpace()
        self.stage1_generator = None
        self.bert_model = None

    def load_spacing_model(self, model_path):
        self.spacing_model.load_model(model_path, json_format=False)

    def load_bert_model(self, model_path):
        paths = get_checkpoint_paths(model_path)
        self.bert_model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint,
                                                             training=False, seq_len=self.max_seq_len)

    def load_stage1_generator(self, model_path):
        self.stage1_generator = Stage1WGANGP(768, 64, 0.1, 0.1, 1, 1, 1).generator
        # self.stage1_generator.load_weights(model_path)

    def spacing(self, text):
        sentence_corrected, tags = self.spacing_model.correct(text)
        self.input_text = sentence_corrected
        print(self.input_text)

    def tokenize(self):
        tokenizer = FullTokenizer('vocab.korean.rawtext.list')
        tokens = tokenize(self.input_text, tokenizer, self.max_seq_len)
        self.input_tokens = tokens

    def embedding(self):
        segments = np.ones_like(self.input_tokens)

        self.input_embedding = self.bert_model.predict([self.input_tokens, segments])

    def generate_stage1(self):
        # One 100-dim noise vector per input embedding.
        z_noise = np.random.normal(0, 1, (self.input_embedding.shape[0], 100))

        img, _ = self.stage1_generator.predict([self.input_embedding, z_noise])
        # Rescale the generator output from [-1, 1] to [0, 255] and take the first image
        # in the batch, since Image.fromarray expects a single uint8 array.
        img = ((img[0] + 1) / 2 * 255).astype(np.uint8)
        return Image.fromarray(img)

    def text2img(self, input_text):
        self.spacing(input_text)
        self.tokenize()
        self.embedding()
        img = self.generate_stage1()
        return img
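A minimal driver for the Pixir class above might look like the following; the paths, the sequence length, and the input sentence are placeholder assumptions, not values from the original project.

# Hypothetical usage of the Pixir pipeline above (all paths are placeholders).
pixir = Pixir(max_seq_len=128)
pixir.load_spacing_model('soyspacing.model')        # soyspacing CountSpace model
pixir.load_bert_model('bert_checkpoint_dir')        # BERT checkpoint directory
pixir.load_stage1_generator('stage1_generator.h5')  # stage-1 generator weights
image = pixir.text2img('이건진짜좋은영화')
image.save('stage1_output.png')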
Example #2
def run_preprocess(inputPath: str, outputPath: str, modelPath: str,
                   module: str):
    if module == "countSpace":
        model = CountSpace()
        model.load_model(modelPath, json_format=False)
        with open(inputPath, 'r', encoding='utf-8') as inputData, \
                open(outputPath, 'w', encoding='utf-8') as outputData:
            for sentence in inputData:
                sentence = sentence.strip()
                if not sentence: continue
                sentence_corrected, _ = model.correct(sentence)
                outputData.write(sentence_corrected + "\n")
    elif module == "normalizer":
        print("do something")
    elif module == "noun":
        print("do something")
Example #3
def apply_space_correct(corpus_fname, model_fname, output_corpus_fname, with_label=False):
    model = CountSpace()
    model.load_model(model_fname, json_format=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_corpus_fname, 'w', encoding='utf-8') as f2:
        for sentence in f1:
            if with_label:
                sentence, label = sentence.strip().split("\u241E")
            else:
                sentence = sentence.strip()
                label = None
            if not sentence: continue
            sent_corrected, _ = model.correct(sentence)
            if with_label:
                f2.write(sent_corrected + "\u241E" + label + "\n")
            else:
                f2.write(sent_corrected + "\n")
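A hedged usage sketch for this helper; the file names below are placeholders, and with_label assumes input lines of the form sentence\u241Elabel.

# Hypothetical call with placeholder paths.
apply_space_correct('corpus_raw.txt', 'soyspacing.model', 'corpus_spaced.txt', with_label=False)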
Example #4
    def train(self, filename):
        verbose = False
        mc = 10  # min_count
        ft = 0.3  # force_abs_threshold
        nt = -0.3  # nonspace_threshold
        st = 0.3  # space_threshold

        model = CountSpace()

        rootDirPath = self.util.getRootPath("SmiToText.SmiToText")
        corpus_fname = rootDirPath + os.path.sep + "data" + os.path.sep + "koDetokenizerData" + os.path.sep + "ko_law_common_space.txt"
        model_fname = rootDirPath + os.path.sep + "kosoy-models" + os.path.sep + "soyspacing.model"

        ### Training
        # model.train(corpus_fname)
        # model.save_model(model_fname, json_format=False)

        ## Load the model
        model.load_model(model_fname, json_format=False)

        #sent = '이건진짜좋은영화 라라랜드진짜좋은영화'
        # sent = '그일단그구성원인사람들과,,'
        sent = 'DAB는, 결정과 관련한 각 위원들의 모든 일당 수수료와 경비에 대한 청구금액이 완전하게 지급될 때 까지는, 결정문을 발급할 의무를 갖지 아니한다.'

        sent_input = sent.replace(" ", "")

        # with parameters
        sent_output_1, tags = model.correct(doc=sent_input,
                                            verbose=verbose,
                                            force_abs_threshold=ft,
                                            nonspace_threshold=nt,
                                            space_threshold=st,
                                            min_count=mc)

        # without parameters
        sent_output_2, tags = model.correct(sent_input)

        print(sent)
        print(sent_output_1)
        print(sent_output_2)
Example #5
# model.load_model('model_spacing.h5', json_format=False)
# model.train('./korquad_1.txt')
# model.save_model('model_spacing_2.h5', json_format=False)

# model.train(corpus_file_name)
# model.save_model('model_spacing.h5', json_format=False)
# model = CountSpace.load_model('model_spacing.h5', json_format=False)
# model.train()

# model_2_file_name = '../KorQuAD_2.1_train_00/korquad2.1_train_0.json'
# model_2 = CountSpace()
# model.train(model_2_file_name)
# model.save_model('model_2_spacing', json_format=False)

# Assumed import for this snippet: from soyspacing.countbase import CountSpace
model = CountSpace()
model.load_model('model_spacing', json_format=False)
model.train('korquad.txt')
model.save_model('korean_spacing_model.h5', json_format=False)

# model = CountSpace()
# model.load_model('model_spacing_3.h5', json_format=False)
# model.train('./korquad_3.txt')
# model.save_model('model_spacing_4.h5', json_format=False)

verbose = False
mc = 10  # min_count
ft = 0.4  # force_abs_threshold
nt = -0.3  # nonspace_threshold
st = 0.4  # space_threshold

sentence = '지않고'
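The snippet stops before these parameters are actually used; a plausible continuation, following the correct() signature shown in Example #4, would be (a hedged sketch, not code from the original source):

# Hedged continuation: apply the thresholds defined above via model.correct().
sentence_corrected, tags = model.correct(doc=sentence,
                                         verbose=verbose,
                                         force_abs_threshold=ft,
                                         nonspace_threshold=nt,
                                         space_threshold=st,
                                         min_count=mc)
print(sentence_corrected)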
Example #6
import pickle
from krwordrank.word import summarize_with_keywords
from wordcloud import WordCloud
from soykeyword.lasso import LassoKeywordExtractor
from soynlp.noun import LRNounExtractor_v2
from soynlp.tokenizer import LTokenizer, MaxScoreTokenizer
from soyspacing.countbase import RuleDict, CountSpace

import matplotlib.pyplot as plt

space = CountSpace()
space.load_model('soyspacing.model', json_format=False)

with open('grouped.pickle', 'rb') as f:
    grouped = pickle.load(f)
with open('nouns.pickle', 'rb') as f:
    nouns = pickle.load(f)
with open('words.pickle', 'rb') as f:
    words = pickle.load(f)

scores = {w: s.score for w, s in nouns.items()}
#scores.update(
#    {w:s.cohesion_forward+scores.get(w, 0) for w,s in words.items()})
#print(scores["가"])
tokenizer = MaxScoreTokenizer(scores)


#tokenizer = LTokenizer(scores)
def keywords(doc):
    doc, _ = space.correct(doc)  # keep the corrected text; correct() does not modify its argument in place
    tokens = tokenizer.tokenize(doc, flatten=False)
Example #7
from soyspacing.countbase import CountSpace
from soyspacing.countbase import RuleDict


# ===== soynlp: save / load the spacing model =====
model_fname = 'ver1spacing.h5'
# model.save_model(model_fname, json_format=False)  # saving only makes sense once a trained model exists

model = CountSpace()
model.load_model(model_fname, json_format=False)

rule_dict = RuleDict('rules.txt')


text1 = '감사합니다 앞으로도 잘부탁드려요 풍성한토핑 맛난피자로 보답하겠습니다'
text2 = '맛있게 잘 먹었습니다~'
text3 = '마시써효!!!떡볶이도좋아요'
text4 = '불고기는 처음 시켜봤는데 상상 그이상....'
text5 = '냠냠~너무 맛있어용^^ 또 시켜먹어요넘나맛있네여피짜로덤왜인기가잇는지알겟둠원픽예약임툐쿄'
text6 = '영등포피자중 이찌방'
text7 = 'ㅋㅋㅋㅋ 파인애플 당연 추가한줄알고 실수했네요죄송염~~오늘도 맛나게 잘 먹겠습니다^^샐러드가 생각보다 푸짐하게 왔네요'

sent_corrected, tags = model.correct(text1, rules=rule_dict)
sent_corrected2, tags = model.correct(text2, rules=rule_dict)
sent_corrected3, tags = model.correct(text3, rules=rule_dict)
sent_corrected4, tags = model.correct(text4, rules=rule_dict)
sent_corrected5, tags = model.correct(text5, rules=rule_dict)
sent_corrected6, tags = model.correct(text6, rules=rule_dict)
sent_corrected7, tags = model.correct(text7, rules=rule_dict)

print('======soynlp====')
Example #8
def hangul_to_kana_converter(x) :  # def line restored; the name is inferred from its use in final_meanvector_retriever below
    output = x
    for idx, each_output in enumerate(output) :
        for key, value in hanbon_dict.items() :
            output = output.replace(key, value)
    return output


def kana_to_hangul_converter(x) :
    output = x
    for idx, each_output in enumerate(output) :
        for key, value in hanbon_dict.items() :
            output = output.replace(value, key)
    return output

# Assumed imports elsewhere in the original file: numpy as np, sentencepiece as spm,
# joblib, gensim.models.FastText, and soyspacing.countbase.CountSpace.
spacing_model = CountSpace()
spacing_model.load_model('./embedding/healthnews_spacing_model', json_format=False)
sp = spm.SentencePieceProcessor()
sp.Load('healthcare_hanbon.model')
ftmodel = FastText.load('./embedding/fasttext_healthqna.model')
def ft_dimension_retriever(x) :
    try :
        return ftmodel.wv[x]
    except :
        return np.repeat(0,200)
    
def final_meanvector_retriever(x) :
    # Same pipeline as the original one-liner, unrolled for readability.
    corrected = split_text_cleaner(spacing_model.correct(str(x))[0])
    pieces = sp.EncodeAsPieces(hangul_to_kana_converter(corrected))
    vectors = ft_dimension_retriever([kana_to_hangul_converter(each) for each in pieces])
    return np.mean(vectors, axis=0)

classifier = joblib.load('ensembled_classifier.pkl')

def symptom_classifier(x) :