Code Example #1
import os
import shutil
import sqlite3
from collections import Counter

import matplotlib.pyplot as plt
from django.shortcuts import render
from kiwipiepy import Kiwi
from wordcloud import WordCloud


def index(request):
    context = {}

    # Build the path to Chrome's History file
    homepath = os.path.expanduser("~")
    abs_chrome_path = os.path.join(homepath, 'AppData', 'Local', 'Google', 'Chrome', 'User Data', 'Default', 'History')
    # Copy the History file
    shutil.copyfile(abs_chrome_path, abs_chrome_path+"_sample")
    # Extract data from the copy
    con = sqlite3.connect(abs_chrome_path+"_sample")
    cursor = con.cursor()
    cursor.execute("SELECT term FROM keyword_search_terms")
    term_data = cursor.fetchall()

    # Morphological analysis
    kiwi = Kiwi()
    kiwi.prepare()
    word_list = []
    for term in term_data:
        for word, tag, _, _ in kiwi.analyze(term[0], top_n=1)[0][0]:
            if tag in ['NNG','NNP','NNB','SL']:
                word_list.append(word)
    
    # Count word frequencies
    counts = Counter(word_list)
    tags = counts.most_common()

    # Word cloud
    mask = plt.imread("./static/images/mask.jpg")
    wc = WordCloud(font_path='./static/webfonts/NanumBarunGothicBold.ttf',
                    background_color='white', 
                    width=800, 
                    height=800,
                    mask=mask)

    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud, interpolation="bilinear")
    plt.savefig("./static/images/wordcloud_keyword.png", dpi=300, bbox_inches='tight')

    # Top 9 words
    top9_list = []
    for rank in range(9):
        top9 = {}
        top9['rank'] = rank+1
        top9['word'] = tags[rank][0]
        top9['count'] = tags[rank][1]
        top9_list.append(top9)

    context['top9'] = top9_list

    return render(request, 'mainapp/index.html', context)
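For reference, all of these examples use the pre-0.10 kiwipiepy API (hence the `kiwi.prepare()` calls), where `analyze()` returns a list of (token list, score) pairs and each token is a `(form, tag, start, length)` tuple. A minimal sketch of that shape, with the sample sentence as an assumption:

# Minimal sketch of the old (pre-0.10) kiwipiepy analyze() result shape.
from kiwipiepy import Kiwi

kiwi = Kiwi()
kiwi.prepare()
tokens, score = kiwi.analyze('형태소 분석 예시', top_n=1)[0]  # best analysis and its score
for form, tag, start, length in tokens:
    print(form, tag)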
Code Example #2
def test_kiwi(environ):
    # Compare kiwi's analysis with a reference tagger on every example line.
    _, tagger = environ
    kiwi = Kiwi()
    kiwi.prepare()

    for _, line in EXAMPLES:
        res1 = tagger.tagSentence(line)[0]
        res2 = kiwi.analyze(line)

        res1 = [(m.surface, m.originalTag) for w in res1 for m in w]
        res2 = [m[:2] for m in res2[0][0]]

        assert res1 == res2
Code Example #3
def prepare_kiwi(train_file):
    """
    Input: training file, i.e. a corpus
    Output: a prepared Kiwi model
    """
    numThread = 4
    kiwi = Kiwi(numThread)
    reader = ReaderExam(train_file)
    minCount = 5
    maxWordLength = 6
    minScore = 0.25
    # Extract new-word candidates from the corpus before building the model
    kiwi.extractWords(reader.read, minCount, maxWordLength, minScore)
    kiwi.prepare()
    return kiwi
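A hypothetical call, assuming a plain-text corpus file and the project's `ReaderExam` helper are available (the file name below is an assumption):

# Hypothetical usage of prepare_kiwi; 'corpus.txt' is an assumed file name.
kiwi = prepare_kiwi('corpus.txt')
tokens, score = kiwi.analyze('새로운 단어를 포함한 문장입니다.')[0]
print(tokens)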
Code Example #4
from kiwipiepy import Kiwi, Option


class kiwi_dictionary_n_fuction:
    def __init__(self, path):
        self.kiwi = Kiwi(options=Option.LOAD_DEFAULT_DICTIONARY
                         | Option.INTEGRATE_ALLOMORPH)
        self.kiwi.load_user_dictionary(path)
        self.kiwi.prepare()

        self.josa = [
            'JK', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC'
        ]

    def get_noun(self, sen):
        _, nn_list, _, _ = self.generate_morp_word(sen, 1)
        return nn_list

    # Return the whole sentence as a list of space-separated tokens
    def get_all_token(self, sen):
        morp_list, _, _, _ = self.generate_morp_word(sen, 1)
        return morp_list

    # Tokenize the whole sentence and return it as a string
    def get_token_str(self, sen):
        morp_list, _, _, _ = self.generate_morp_word(sen, 1)
        string = ''.join(morp_list)
        # if '\\' in self.string:
        #     self.string = self.string.translate({ord('\\'):'\\\\'})
        return string

    def get_vv(self, sen):
        _, _, vv_list, _ = self.generate_morp_word(sen, 1)
        return vv_list

    def get_nn_list(self, sen):
        _, nn_list, _, _ = self.generate_morp_word(sen, 1)
        return nn_list

    # Strip particles (josa) and return the remainder as a string.
    def get_no_josa_token(
        self, sen
    ):  # e.g. 관찰 가능 하 고 처리 가능 하 ᆫ 범위 내 문장 입력 받 어 정해진 형태 출력 제한 되 ᆫ 시간 내 출력 하 어야 하 ᆫ다는 제약 적 용도 고려 하 ᆫ 관점 이 다 .
        _, _, _, no_josa_list = self.generate_morp_word(sen, 1)
        string = ''.join(no_josa_list)
        return string

    # Return a list of (word, tag) tuples
    def k_pos(
        self, sentence
    ):  # [('관찰', 'NNG'), ('가능', 'NNG'), ('하', 'XSA'), ('고', 'EC'), ('처리', 'NNG'), ('가능', 'NNG'), ('하', 'XSA'), ('ᆫ', 'ETM'), ('범위', 'NNG')]
        tuple_list = []
        result = self.kiwi.analyze(sentence, 1)
        for i in result[0][0]:
            word, pos = i[0], i[1]
            new_tuple = (word, pos)
            tuple_list.append(new_tuple)
        return tuple_list

    def k_analyze(self, sentence):
        return self.kiwi.analyze(sentence, 1)

    # Return just the surface forms as a list
    def k_morphs(self, sen):  # ['관찰', '가능', '하', '고', '처리', '가능', '하', ...]
        token_list = []
        result = self.kiwi.analyze(sen, 1)
        for i in result[0][0]:
            token_list.append(i[0])
        return token_list

    # Extract morphemes from a sentence
    def generate_morp_word(self, sentence, analyze_num):
        try:
            result = self.kiwi.analyze(sentence, analyze_num)
            morp_word_list = []
            morp_nn_list = []
            morp_vv_list = []
            morp_not_josa_list = []
            for i in range(0, analyze_num):
                morp_word = ''
                morp_nn = ''
                morp_vv = ''
                morp_not_josa = ''
                nn = []
                for word in result[i][0]:
                    morp_word += word[0]
                    morp_word += ' '

                    if word[1] not in self.josa:
                        morp_not_josa += word[0]
                        morp_not_josa += ' '
                        if word[1] in ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL']:
                            morp_nn += word[0]
                            morp_nn += ' '
                            nn.append(word[0])
                        elif word[1] in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                            morp_vv += word[0]
                            morp_vv += ' '
                    else:
                        pass
                morp_word_list.append(morp_word)
                morp_nn_list.append(morp_nn)
                morp_vv_list.append(morp_vv)
                morp_not_josa_list.append(morp_not_josa)

            return morp_word_list, morp_nn_list, morp_vv_list, morp_not_josa_list

        except Exception as e:
            print(e)
            print("### ERROR: something went wrong in the morphological analysis ###")

    def __del__(self):
        print("EXIT kiwi")
Code Example #5
File: changer.py  Project: kosohae/AIpjt-1
import re

import hgtk
from kiwipiepy import Kiwi
# Project-local helpers; these module paths are assumptions based on this repository.
from dictionary import formaldic, informaldic, abnormaldic
from utils import Utils


class Changer(object):
    def __init__(self):    
        try:
            self.kiwi = Kiwi()
            self.kiwi.prepare()
        except:
            print("[INFO] please install kiwipiepy")
            
        self.replace = formaldic()
        self.utils = Utils()

    def dechanger(self, stc):
        """
        Change formal speech to informal.
        Args: stc (str)
        """
        pattern = r'하세요|이예요|이에요|에요|예요|시겠어요|죠|합니까|습니까'
        pattern = re.compile(pattern)

        result = []

        stc = self.utils._remove_blank(stc)
        stc = self.utils._clean_up_tokenization(stc)

        if len(re.findall(pattern, stc)) > 0:
            tokens = self.kiwi.analyze(stc.replace(" ","|"))
            
            key = informaldic().keys()
            lk = list(key)
            key2 = abnormaldic().keys()
            ak = list(key2)
            
            tmp = []
            for token in tokens[0][0]:
                if token[:2] in lk:
                    # look up the replacement value by key
                    token = informaldic().get(token[:2])
                if token[:2] in ak:
                    token = abnormaldic().get(token[:2])
                tmp.append(token)

            changed = ''
            for t in tmp:
                if isinstance(t[0], tuple):
                    for i in range(len(t[0])):
                        changed += hgtk.text.decompose(t[i][0])
                else:
                    changed += hgtk.text.decompose(t[0])
                    
            one_char = re.compile('ᴥ[ㅂㄴㄹ]ᴥ')
            if one_char.search(changed):
                words = changed.split('ᴥ')
                for idx in range(1,len(words)):
                    # previous syllable has no final consonant (jongseong)
                    if len(words[idx]) == 1 and len(words[idx-1].replace('|',"")) == 2:
                        # merge into the previous syllable
                        words[idx - 1] = words[idx-1]+words[idx]
                        words[idx] = ""
                    # previous syllable has a final consonant
                    elif len(words[idx]) == 1 and len(words[idx-1].replace('|',"")) == 3:
                        shp = ['ㅆ','ㅍ','ㄱ','ㅄ','ㄶ']
                        ep = ['ㄹ']
                        if words[idx] == 'ㅂ' and len(words[idx - 1].replace('|', "")) == 3 :
                            if words[idx - 1][-1] in shp :
                                if words[idx].count("|") > 0:
                                    words[idx] = "|습"
                                else:
                                    words[idx ] = "습"
                                continue
                            else :
                                if words[idx].count("|") > 0:
                                    words[idx] = "|입"
                                else:
                                    words[idx] = "입"
                                # words[idx] = ""
                        elif words[idx] =='ㄴ' and len(words[idx-1].replace('|',"")) == 3 and words[idx - 1].endswith('ㄹ'):
                            if words[idx-1].count("|") >0 :
                                words[idx - 1] = "|" + words[idx - 1].replace("|","")[:2] + words[idx]
                            else :
                                words[idx - 1] = words[idx - 1][:2] + words[idx]
                            # delete it
                            words[idx] = ""
                        elif words[idx] =='ㄹ':
                            if words[idx].count("|") > 0:
                                words[idx] = "|일"
                            else:
                                words[idx] = "일"

                changed = "ᴥ".join([x for x in words if x is not ""])+"ᴥ"
            # For cases that weren't covered above
            changed = self._makePretty(changed)
            changed = hgtk.text.compose(changed).replace("|"," ")
            # exception handling
            try:
                if changed[-1] == '요':
                    changed = re.sub('요', '', changed)
                changed = re.sub('그렇죠', '', changed)
            except:
                pass
            result.append(changed)

        else:
            try:
                result.append(stc)
            except:
                pass
        return result[0]
        

    def _makePretty(self, line):
        """
        Convert jaso orderings that weren't properly covered by the jaso
        restructuring step of the function Mal_Gillge_Haeraing.
        :param line: jaso ordering that wasn't properly covered
        :return: converted jaso ordering
        """
        test = line
        test = test.replace("ᴥㅎㅏᴥㅇㅏᴥ", "ᴥㅎㅐᴥ")
        test = test.replace("ㅎㅏᴥㅇㅏᴥㅇㅛᴥ", "ᴥㅎㅐᴥ")
        test = test.replace("ㅎㅏᴥㄴㅣᴥㄷㅏᴥ", "ㅎㅏㅂᴥㄴㅣᴥㄷㅏᴥ")
        test = test.replace("ㅎㅏᴥㅇㅏㅆᴥ", "ᴥㅎㅐㅆᴥ")
        test = test.replace("ㄴㅏᴥㅇㅏㅆᴥ", "ᴥㅎㅐㅆᴥ")
        test = test.replace("ㄱㅏᴥㅇㅏㅆᴥ", "ᴥㄱㅏㅆᴥ")
        test = test.replace("ㅇㅣᴥㄴㅣᴥ", "ᴥㄴㅣᴥ")
        test = test.replace("ㄴㅓㄹㄴᴥ","ㄴㅓㄴᴥ")
        test = test.replace("ㄱㅡᴥㄹㅓㅎᴥㅇㅓᴥ","ㄱㅡᴥㄹㅐᴥ")
        test = test.replace("ㅡᴥㅇㅏᴥ","ㅏᴥ")
        test = test.replace("ㄱㅓㄹᴥㄴㅏᴥㅇㅛᴥ", "ㄱㅓㄴᴥㄱㅏᴥㅇㅛᴥ")
        return test

    def changer(self, text):
        """
        Change informal speech to formal.
        Args: text (str)
        """
        tokens = self.kiwi.analyze(text.replace(" ","|"))
        
        key = formaldic().keys()
        key2 = abnormaldic().keys()
        lk = list(key)
        ak = list(key2)
        
        num = len(tokens[0][0])
        result = []
        for idx, token in enumerate(tokens[0][0]):
            if idx > int(num*0.8):        
                if token[:2] in lk:
                    # look up the replacement value by key
                    token = formaldic().get(token[:2])
                    result.append(token)
                else:
                    if token[:2] in ak:
                        token = abnormaldic().get(token[:2])
                        result.append(token)
                    else:
                        result.append(token[:2])
            else:
                if token[:2] in ak:
                    token = abnormaldic().get(token[:2])
                    result.append(token)
                else:
                    result.append(token[:2])
                
        # change tuple to text
        changed = ''
        for t in result:
            if isinstance(t[0], tuple):
                for i in range(len(t[0])):
                    changed += hgtk.text.decompose(t[i][0])
            else:
                changed += hgtk.text.decompose(t[0])

        # Restructuring sentence from jaso ordering.
        one_char = re.compile('ᴥ[ㅂㄴㄹ]ᴥ')
        if one_char.search(changed):
            words = changed.split('ᴥ')
            for idx in range(1,len(words)):
                # previous syllable has no final consonant (jongseong)
                if len(words[idx]) == 1 and len(words[idx-1].replace('|',"")) == 2:
                    # merge into the previous syllable
                    words[idx - 1] = words[idx-1]+words[idx]
                    words[idx] = ""
                # previous syllable has a final consonant
                elif len(words[idx]) == 1 and len(words[idx-1].replace('|',"")) == 3:
                    shp = ['ㅆ','ㅍ','ㄱ','ㅄ','ㄶ']
                    ep = ['ㄹ']
                    if words[idx] == 'ㅂ' and len(words[idx - 1].replace('|', "")) == 3 :
                        if words[idx - 1][-1] in shp :
                            if words[idx].count("|") > 0:
                                words[idx] = "|습"
                            else:
                                words[idx ] = "습"
                            continue
                        else :
                            if words[idx].count("|") > 0:
                                words[idx] = "|입"
                            else:
                                words[idx] = "입"
                            # words[idx] = ""
                    elif words[idx] =='ㄴ' and len(words[idx-1].replace('|',"")) == 3 and words[idx - 1].endswith('ㄹ'):
                        if words[idx-1].count("|") >0 :
                            words[idx - 1] = "|" + words[idx - 1].replace("|","")[:2] + words[idx]
                        else :
                            words[idx - 1] = words[idx - 1][:2] + words[idx]
                        # delete it
                        words[idx] = ""
                    elif words[idx] =='ㄹ':
                        if words[idx].count("|") > 0:
                            words[idx] = "|일"
                        else:
                            words[idx] = "일"

            changed = "ᴥ".join([x for x in words if x is not ""])+"ᴥ"
        # For cases that weren't covered above
        changed = self._makePretty(changed)
        changed = hgtk.text.compose(changed).replace("|"," ")
        return changed
        
    def addData(self, key, val):
        """
        Add a new entry to dictionary.py, updating the changer dictionary.
        :param key: key to add to the dictionary self.replace
        :param val: value to add to the dictionary self.replace
        :return: None
        """
        with open('dictionary.py', 'r', encoding='utf-8') as f:
            data = f.read()

        lines = data.split("\n")
        lines[-2] += ','
        lines[-1] = "                    " + str(key) + ": " + str(val)
        with open('dictionary.py', 'w', encoding='utf-8') as f:
            for i in range(len(lines)):
                f.write(lines[i] + "\n")
            f.write("                    }")

    def checker(self, result):
        """
        Check for abnormal sentences and remove them.
        Args: result (list)
        Returns: (updated, idxes)
        """
        updated = []
        idxes = []
        normal = ['요', '까', '다', '죠', '가']
        for idx, stc in enumerate(result):
            try:
                if stc[-1] not in normal:
                    print(f"[INFO] Abnormal Sentence, remove {idx}....")
                    idxes.append(idx)
                else:
                    updated.append(stc)
            except:
                idxes.append(idx)

        return updated, idxes
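A hedged usage sketch, assuming the project's `dictionary.py` (providing `formaldic`, `informaldic`, `abnormaldic`) and `Utils` are importable:

# Hypothetical usage; depends on the project's dictionary and utils modules.
changer = Changer()
print(changer.changer('오늘 날씨 진짜 좋아'))      # informal -> formal
print(changer.dechanger('오늘 날씨 진짜 좋아요'))  # formal -> informal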
Code Example #6
# 'data' is assumed to be a pandas DataFrame built earlier in this notebook
data['token'].isnull().sum()
# Fill null values in the token column with an empty string
data['token'] = data['token'].fillna('')

#%% Build training vectors from the questions after morphological analysis
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['token'])
# Run TF-IDF over the token column
print(tfidf_matrix.shape)

# %%
from kiwipiepy import Kiwi
kiwi = Kiwi()
kiwi.load_user_dictionary(r'./userDict.txt')
kiwi.prepare()
def generate_morp_word(sentence, analyze_num):
    try:
        result = kiwi.analyze(sentence, analyze_num)
        print(result)
        morp_word_list = []
        morp_nn_list = []
        morp_vv_list = []

        for i in range(analyze_num):
            morp_word = ''
            morp_nn = ''
            morp_vv = ''
            for word in result[i][0]:
                morp_word += word[0] + ' '
                # The original snippet was cut off here; the remainder below
                # follows the fuller version of this function in Code Example #4.
                if word[1] in ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL']:
                    morp_nn += word[0] + ' '
                elif word[1] in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                    morp_vv += word[0] + ' '
            morp_word_list.append(morp_word)
            morp_nn_list.append(morp_nn)
            morp_vv_list.append(morp_vv)

        return morp_word_list, morp_nn_list, morp_vv_list
    except Exception as e:
        print(e)
Code Example #7
from kiwipiepy import Kiwi, Option
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import codecs

tagger = Kiwi()
tagger.prepare()

# def flat(content):
#     return ["{}/{}".format(word,tag) for word, tag in tagger.pos(content)

rsc = r'E:\Programming\python\창회선배스터디\Morpheme_Cloud\자료\토지2.txt'

target_corpus = codecs.open(rsc, 'r', encoding='utf-8')

# Tag the text
tagged_temp = []

# with open(rsc, 'r', encoding="utf8") as kr_f:
#     for line in kr_f:
#         line = line.strip()
#         tagged_temp += flat(line)

for line in target_corpus:
    line = line.strip()
    # top_n=1 keeps only the best analysis; each token is (form, tag, start, length)
    temp_tagging = [x[0] for x in tagger.analyze(line, top_n=1)]
    inner_temp = ["{}/{}".format(word, tag) for word, tag, start, length in temp_tagging[0]]
    tagged_temp.append(tuple(inner_temp))
target_corpus.close()

print(tagged_temp[:3])
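Given the `WordCloud`, `Counter`, and `matplotlib` imports at the top, a plausible continuation is to frequency-count the word/tag tokens and render a word cloud; a sketch, with the font file and figure handling as assumptions:

# Hypothetical continuation: count the tagged tokens and draw a word cloud.
counts = Counter(token for sent in tagged_temp for token in sent)
wc = WordCloud(font_path='NanumBarunGothicBold.ttf',  # assumed Korean font file
               background_color='white', width=800, height=800)
cloud = wc.generate_from_frequencies(dict(counts.most_common(200)))
plt.axis('off')
plt.imshow(cloud, interpolation='bilinear')
plt.show()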