Example #1
def load_w2v(section, target, max_n, season='all'):
    model = word2vec.Word2Vec.load(f'{save_dir}/{section}_{season}.model')
    li = model.wv.most_similar(positive=[target], topn=max_n)
    komoran = Komoran()
    word_list = []
    dist_list = []
    for word, dist in li:
        temp = [tt[1] for tt in komoran.pos(word)]
        if len(set(temp).intersection(["NNG", "NNP"])) != 0:
            word_list.append(word)
            dist_list.append(dist)

    # Save the nearest words and their distances as CSV
    df = pd.DataFrame({'word': word_list, 'dist': dist_list})
    df.to_csv(f'{save_dir}/{section}_{target}_{season}.csv', encoding='ms949')

    # Save a t-SNE visualization of the words as an image
    word_list.append('미세먼지')
    x = model.wv[word_list]  # index the KeyedVectors; indexing the model directly was removed in gensim 4
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2000)
    x_tsne = tsne.fit_transform(x)
    df2 = pd.DataFrame(x_tsne, index=word_list, columns=['x', 'y'])

    plt.figure(figsize=(16, 9))
    plt.scatter(df2['x'], df2['y'])
    for word, pos in df2.iterrows():
        if word == '미세먼지':
            plt.annotate(word, pos, color='red')
        else:
            plt.annotate(word, pos, va='bottom')
    plt.savefig(f'{save_dir}/{section}_{target}_{season}.png')
    plt.close()
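A minimal usage sketch for load_w2v() above; save_dir and the saved model file are assumptions, since the snippet does not show the training step:

# Hypothetical: save_dir must exist as a module-level variable, and a model
# trained for this section/season must already be saved on disk.
save_dir = './models'
load_w2v('society', '미세먼지', max_n=50, season='all')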
Example #2
def sentences_komoran(filelist):
    komoran = Komoran()
    sentences = []
    for file in filelist:
        with open(file, 'r', encoding='utf-8') as fp:
            while True:
                try:
                    line = fp.readline()
                    if not line: 
                        break

                    line = re.sub("\xa0", " ", line).strip()
                    if line == "" : 
                        continue

                    tokens = komoran.nouns(line)
                    if len(tokens) == 0: 
                        continue

                    sentences.append(tokens)

                except Exception as e:
                    print(e)
                    continue
    return sentences
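For context, a sketch (gensim 4 API) of how these noun sentences could produce the Word2Vec model that load_w2v() in Example #1 loads; the file paths and hyperparameters are assumptions:

from glob import glob
from gensim.models import word2vec

sentences = sentences_komoran(glob('./articles/society/*.txt'))  # hypothetical corpus location
model = word2vec.Word2Vec(sentences, vector_size=100, window=5, min_count=5)
model.save('./models/society_all.model')  # matches the f'{save_dir}/{section}_{season}.model' pattern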
Example #3
 def __init__(self, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         from konlpy.tag import Komoran
         self.tagger = Komoran()
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #4
 def __init__(self, filepath, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         self.tagger = Komoran()
     self.filepath = filepath
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #5
def words_check(request):

    # Initialize the required objects and variables
    data = request.POST.get('data')
    komoran = Komoran()
    words = Counter(komoran.nouns(data))
    print(words.keys())
    # Filter out single-character words
    nouns = dict()
    for word in words.keys():
        if len(word) != 1:
            nouns[word] = words.get(word)
    nouns = sorted(nouns.items(), key=lambda x: x[1], reverse=True)
    hashing = random.choice(range(100))
    context = {
        'nouns': nouns,
        'hashing': hashing,
    }
    # Word cloud
    taglist = pytagcloud.make_tags(nouns, minsize=10, maxsize=60)
    link = 'static/wordcloud/wordcloud' + str(hashing) + '.jpg'
    #link = 'static/wordcloud/wordcloud.jpg'
    pytagcloud.create_tag_image(taglist,
                                link,
                                size=(600, 600),
                                layout=3,
                                fontname='CookieRun',
                                rectangular=True)

    return HttpResponse(json.dumps(context), content_type='application/json')
Example #6
 def __init__(self, textIter, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         self.tagger = Komoran()
     if isinstance(textIter, str): self.textIter = textIter.split('\n')
     else: self.textIter = textIter
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #7
def parse_sentence_pos(line):
    komoran = Komoran()
    idx, raw, label = line.split('\t')
    pos = ""
    for elem in komoran.pos(raw):  # tag the raw sentence, not the whole tab-separated line
        pos += elem[0] + '/' + elem[1] + '|'
    pos = pos[:-1]  # strip the trailing '|' (the original pos[-1] kept only the last character)
    return idx, pos, raw, label
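A quick illustration of the expected input, assuming tab-separated lines of index, sentence, and label (the actual dataset format is not shown):

# Hypothetical TSV line: "<idx>\t<sentence>\t<label>"
idx, pos, raw, label = parse_sentence_pos('1\t오늘 날씨가 좋다\t1')
print(pos)  # roughly '오늘/NNG|날씨/NNG|가/JKS|좋/VA|다/EF'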
Example #8
 def __init__(self, file_path, tagger=None):
     if tagger:
         self.tagger = tagger
     else:
         from konlpy.tag import Komoran
         self.tagger = Komoran(userdic='./text_rank/dic.txt')
     self.file_path = file_path
     self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
Example #9
    def __init__(self,
                 path="curse_detection/dataset/long.txt",
                 one_hot=False,
                 max_len=30):
        self.path = path
        self.one_hot = one_hot  # True: [0~1, 0~1]  False: 0~1
        self.max_len = max_len

        self.komoran = Komoran()
Example #10
    def __init__(self, driver, db, taggedSen):
        #self.searchInput = searchInput
        #self.searchInputAnnex = searchInputAnnex
        self.driver = driver
        self.db = db
        self.taggedSen = taggedSen

        from konlpy.tag import Komoran
        self.Komoran = Komoran()
Example #11
def token(doc):
    # kkma = Kkma()
    km = Komoran()
    pos_doc = []
    for doc_item in doc:
        for pos in km.pos(doc_item):
            if pos[1][0] == 'M':
                pos_doc.append(pos[0])
    return pos_doc
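Usage sketch: token() keeps only tokens whose POS tag starts with 'M' (modifiers such as MAG/MAJ/MM), given an iterable of documents:

docs = ['정말 빨리 끝났다', '매우 좋은 결과']  # hypothetical input documents
print(token(docs))  # roughly ['정말', '빨리', '매우']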
Example #12
 def __init__(self, userdic=None):
     from konlpy.tag import Komoran
     import os
     if userdic is not None:
         print("user dict " + str(os.path.abspath(userdic)))
         self.inst = Komoran(userdic=os.path.abspath(userdic))
     else:
         self.inst = Komoran()
     self.OUT_TYPE = [list, tuple]
Example #13
def samerank(db, emotion_dict):  # Write preprocessed output for NULL rows - handles ties between 1st and 2nd place

    komoran = Komoran()

    cursor = db.cursor()

    emotion = ['happy', 'enjoy', 'comfort', 'horror', 'angry', 'sad']

    sql = "SELECT DISTINCT title, artist, lyrics FROM musicl WHERE (DATE, ranking) IN (SELECT DATE, ranking FROM emoti_test WHERE rank1 IS NULL)"
    cursor.execute(sql)
    null_data = cursor.fetchall()
    null_data = pd.DataFrame(null_data, columns=['제목', '가수', '가사'])

    null_data_rating = pd.DataFrame(columns=['제목', '가수', '순위', '수치'])

    for title, singer, lyrics in null_data.values:

        # Split the lyrics into morphemes
        lyrics = lyrics.replace('\n', '')
        words_temp = komoran.morphs(lyrics)

        # Counters for the six emotion categories
        happy = 0
        enjoy = 0
        comfort = 0
        angry = 0
        horror = 0
        sad = 0

        # Sentiment analysis of the lyrics
        lyrics_emotion = pd.DataFrame(index=emotion)

        for word in words_temp:
            if word in emotion_dict['happy']: happy += 1
            if word in emotion_dict['enjoy']: enjoy += 1
            if word in emotion_dict['comfort']: comfort += 1
            if word in emotion_dict['angry']: angry += 1
            if word in emotion_dict['horror']: horror += 1
            if word in emotion_dict['sad']: sad += 1

        # Sort to see which emotions appeared most often
        result_emotion = [happy, enjoy, comfort, horror, angry, sad]  # order must match the emotion index defined above
        lyrics_emotion[0] = result_emotion
        rating = lyrics_emotion[0].sort_values(ascending=False).index
        value = lyrics_emotion[0].sort_values(ascending=False).values

        # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat instead
        row = pd.DataFrame([{
            '제목': title.strip(),
            '가수': singer,
            '순위': list(rating),
            '수치': list(value)
        }])
        null_data_rating = pd.concat([null_data_rating, row], ignore_index=True)

    null_data_rating.to_excel('data/samepointSong.xlsx')  # recent pandas removed to_excel's encoding argument
Example #14
def noun_tokenizer(df_col_name, preprocessed_df):
    from konlpy.tag import Komoran
    komoran = Komoran()
    globals()["noun_" + str(df_col_name)] = []
    for i, words in enumerate(preprocessed_df, start=1):
        globals()["noun_" + str(df_col_name)].append(komoran.nouns(words))
        print('row ' + str(i) + ' finished')
    print("noun_tokenized")
Example #15
 def __init__(self, word_model, rec_dao, search_dao, movie_info_dao):
     self.word_model = word_model
     self.rec_dao = rec_dao
     self.search_dao = search_dao
     self.movie_info_dao = movie_info_dao
     self.komoran = Komoran()
     self.flatten = itertools.chain.from_iterable
     self.pos_targets = {
         'NNG', 'NNP', 'VV', 'VA', 'MAG', 'VX', 'NF', 'NV', 'XR'
     }
Example #16
 def sentence_pos(sentence):
     print('# before user dic')
     komo = Komoran()
     result = komo.pos(sentence)
     print('전체 확인하기')
     for myitem in result:
         somedata = '단어 : %s, 품사 : %s' % (myitem[0], myitem[1])
         print(somedata)
     print('-' * 30)
     return result
Example #17
def tokenize_komoran(doc):

    komoran = Komoran()
    result = []

    for token in komoran.pos(doc):
        if token[1] not in ['SP', 'SF', 'SE', 'SO']:  # skip punctuation tags
            result.append('/'.join(token))

    return result
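Usage sketch; tokens come back as word/TAG strings with the punctuation tags filtered out, a common input format for gensim models:

tokens = tokenize_komoran('미세먼지가 심하다.')
print(tokens)  # roughly ['미세먼지/NNG', '가/JKS', '심하/VA', '다/EF'] - the final '.' (SF) is dropped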
Example #18
def kor_tokenizer(list_sentences):
    komoran = Komoran(max_heap_size=1024)
    list_output = []
    for sentence in list_sentences:
        sentence = re.sub(r"[^가-힣\s]", "", sentence)  # keep only Hangul and whitespace
        tokenized_sentence = komoran.morphs(sentence)
        list_output.append(tokenized_sentence)
    # dump once after the loop instead of rewriting the pickle on every iteration
    with open('./result/tokens.pickle', 'wb') as f:
        pickle.dump(list_output, f, pickle.HIGHEST_PROTOCOL)
    return list_output
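A minimal call, assuming a ./result directory already exists for the pickle file:

sentences = ['안녕하세요!', '미세먼지 123 심해요']  # hypothetical input
print(kor_tokenizer(sentences))  # non-Hangul characters are stripped before tokenizing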
Example #19
def morpheme(sentence):
    komoran = Komoran()
    morphs = komoran.pos(sentence)
    #print(morphs)

    noun_morph = []
    for morph in morphs:
        if morph[1] == 'NNP':  # keep only proper nouns
            noun_morph.append(morph[0])

    return noun_morph
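Usage sketch: only proper nouns (NNP) survive, so common nouns are dropped:

print(morpheme('서울에서 열린 회의'))  # roughly ['서울'] - '회의' is a common noun (NNG)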
Example #20
    def interactive_shell(self, tags, processing_word):
        komoran = Komoran()  # renamed from 'kkma': this tagger is Komoran, not Kkma
        idx_to_tag = {idx: tag for tag, idx in tags.items()}
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, self.config.model_output)
            self.logger.info("""
This is an interactive mode.
To exit, enter 'exit'. 
You can enter a sentence like
input> I love Paris""")
            while True:
                try:
                    try:
                        # for python 2
                        sentence = raw_input("input> ")
                    except NameError:
                        # for python 3
                        sentence = input("input> ")

                    if "exit" in sentence:
                        break
                    print(sentence)

                    sentence = sentence.split(" ")
                    print(sentence)

                    words_raw = []
                    words_list = []
                    positions = []
                    for i, sen in enumerate(sentence):
                        if sen is None:
                            continue
                        poses = komoran.pos(sen)

                        for pos in poses:
                            words_raw.append(pos[0] + "/" + pos[1])
                            words_list.append(pos[0])
                            positions.append(i + 1)
                    print(words_raw)

                    words = [processing_word(w) for w in words_raw]
                    for w in words:
                        print(w)
                    if type(words[0]) == tuple:
                        words = zip(*words)
                    print("go batch")
                    pred_ids, _ = self.predict_batch(sess, [words],
                                                     [positions])
                    preds = [idx_to_tag[idx] for idx in list(pred_ids[0])]
                    print_sentence(self.logger, {"x": words_list, "y": preds})

                except Exception:
                    pass
Example #21
def process(sentence):
    ko = Komoran()
    pos = ko.pos(sentence)
    # print(pos)
    exceptions = ['J', 'E', 'JX', 'EC', 'JKS', 'EF', 'NA']  # deduplicated: 'EC' was listed twice
    s = ''
    for x in pos:
        if x[1] not in exceptions:
            s += x[0]
            s += ' '
    return s
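Usage sketch: morphemes whose tags appear in the exception list are dropped, and the rest are joined with spaces (note the trailing space):

print(process('날씨가 좋다'))  # roughly '날씨 좋 ' - '가' (JKS) and '다' (EF) are removed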
Example #22
def get_recommend_query(query):
    all_products = SpProduct.objects.values('name')
    results = []
    # Query word exists (exact form): find product names that contain the query string.
    for i in range(len(all_products)):
        if query in all_products[i]['name']:
            results.append(all_products[i]['name'])
    # Tokenizer
    komoran = Komoran()
    query_tokens = komoran.pos(query)
    all_products_tokens = []
    for i in range(len(all_products)):
        all_products_tokens.append(komoran.pos(all_products[i]['name']))
    # Query word exists (inexact form): find product names whose tokens contain the query's first token.
    for i in range(len(all_products_tokens)):
        if query_tokens[0] in all_products_tokens[i]:
            results.append(all_products[i]['name'])
    # Apply the same matching to NspProduct.
    all_nsp_products = NspProduct.objects.values('name')
    for i in range(len(all_nsp_products)):
        if query in all_nsp_products[i]['name']:
            results.append(all_nsp_products[i]['name'])
    all_products_nsp_tokens = []
    for i in range(len(all_nsp_products)):
        all_products_nsp_tokens.append(komoran.pos(
            all_nsp_products[i]['name']))
    for i in range(len(all_products_nsp_tokens)):
        if query_tokens[0] in all_products_nsp_tokens[i]:
            results.append(all_nsp_products[i]['name'])
    # Related query and target (same NspProduct class): find values that belong to the same NspProduct.
    results_copy = results.copy()
    for r in results_copy:
        sp_product_ = SpProduct.objects.filter(name=r)
        nsp_product_ = NspProduct.objects.filter(name=r)
        if len(sp_product_) > 0:
            sp_product_nsp_id = sp_product_.values()[0]['product_id']
            related_nsp = NspProduct.objects.filter(id=sp_product_nsp_id)
            # Add the related NspProduct
            results.append(related_nsp.values()[0]['name'])
            # Add the SpProducts related to that NspProduct
            related_sp = SpProduct.objects.filter(
                product_id=sp_product_nsp_id).values()
            for sp in related_sp:
                results.append(sp['name'])
        if len(nsp_product_) > 0:
            nsp_product_id = nsp_product_.values()[0]['id']
            related_sp = SpProduct.objects.filter(
                product_id=nsp_product_id).values()
            # Add the related SpProducts
            for sp in related_sp:
                results.append(sp['name'])
    # Combine the results of the three cases above and return them.
    results = list(set(results))
    return results
Example #23
def question_generator(text):
    komo = Komoran()

    question = []
    for value, tag in set(komo.pos(text)):
        if tag == 'NNP' and value != '제가':
            question.append('왜 ' + value + '인가요?')
            question.append(value + '에 대해서 자신이 아는대로 설명해주세요.')
            question.append(value + '의 장단점이 무엇이라고 생각하시나요?')
    picked = sample(question, 3)  # sample once so the printed and returned questions match
    print(picked)
    return picked
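Usage sketch, assuming the input contains at least one proper noun (each NNP contributes three candidate questions, and sample() needs at least three):

print(question_generator('저는 서울에서 공부했습니다.'))
# roughly ['왜 서울인가요?', '서울의 장단점이 무엇이라고 생각하시나요?', ...]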
Example #24
class Tagger:
    def __init__(self, mode: str = "nouns"):
        """
        konlpy pos tagger
        """
        self.tagger = Komoran()
        self.mode = mode  # nouns, morphs

    def __call__(self, *args, **kwargs) -> list:
        if self.mode == "nouns":
            return self.tagger.nouns(*args, **kwargs)
        elif self.mode == "morphs":
            return self.tagger.morphs(*args, **kwargs)
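Usage sketch for the Tagger wrapper:

tagger = Tagger(mode="nouns")
print(tagger('미세먼지가 심하다'))  # roughly ['미세먼지']
tagger.mode = "morphs"
print(tagger('미세먼지가 심하다'))  # roughly ['미세먼지', '가', '심하', '다']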
Example #25
    def __init__(self, fpath):
        self.dataframe = pd.read_csv(fpath, encoding='utf-8')
        self.dataframe.keyword = self.dataframe.keyword.apply(literal_eval)

        plt.style.use('seaborn-darkgrid')  # renamed to 'seaborn-v0_8-darkgrid' in matplotlib >= 3.6

        font_name = './static/fonts/AppleSDGothicNeo.ttc'
        font_family = fm.FontProperties(fname=font_name).get_name()

        plt.rcParams['font.family'] = font_family
        plt.rcParams['font.size'] = 18

        self.komoran = Komoran(userdic='./data/user_dic.tsv')
Example #26
def question_generator(text):
    komo = Komoran()

    question = []
    for value, tag in set(komo.pos(text)):
        if tag == 'NNP' and value != '제가' and '!' not in value:
            question.append('왜 ' + value + '인가요?')
            question.append(value + '에 대해서 설명해주세요.')
            question.append(value + '의 장단점이 무엇이라고 생각하시나요?')
            question.append(value + '에 대해 영어로 묘사해주세요.')
    # print(sample(question, 3))
    return sample(question, 3) if len(question) >= 3 else []
Example #27
    def calculate_result(self, text):
        komoran = Komoran()
        pos_l = list(komoran.pos(text))

        # change format to compare with polarity.xlsx
        for i in range(len(pos_l)):
            pos_l[i] = "/".join(list(pos_l[i]))

        POS = 0
        NEG = 0

        l = []

        i = 0

        while i < len(pos_l):
            if pos_l[i] in self.sentiments:
                text = pos_l[i]
                if i + 1 < len(pos_l):
                    if pos_l[i + 1] in self.sentiments[pos_l[i]]:
                        text = pos_l[i] + ";" + pos_l[i + 1]
                        i += 1
                        if i + 1 < len(pos_l):
                            if pos_l[i] + ";" + pos_l[i + 1] in self.sentiments[pos_l[i - 1]]:
                                text = pos_l[i - 1] + ";" + pos_l[i] + ";" + pos_l[i + 1]
                                index = self.sentiments[pos_l[i - 1]][0] + self.sentiments[pos_l[i - 1]].index(
                                    pos_l[i] + ";" + pos_l[i + 1])
                                i += 1
                            else:
                                index = self.sentiments[pos_l[i - 1]][0] + self.sentiments[pos_l[i - 1]].index(pos_l[i])
                        else:
                            index = self.sentiments[pos_l[i - 1]][0] + self.sentiments[pos_l[i - 1]].index(pos_l[i])
                    else:
                        index = self.sentiments[pos_l[i]][0]
                else:
                    index = self.sentiments[pos_l[i]][0]

                neg = int(float(self.sentiment_file.cell(row=index, column=2).value) * 100) / 100
                pos = int(float(self.sentiment_file.cell(row=index, column=3).value) * 100) / 100
                POS += pos
                NEG += neg
                l.append((text, pos, neg))
            i += 1

        POS = int(POS * 100) / 100  # truncate to two decimal places
        NEG = int(NEG * 100) / 100
        mood = (POS + NEG)

        print("CURRENT MOOD IS POS:{},NEG:{}, TOTAL:{}".format(POS, NEG, mood))

        return pos_l, POS, NEG, mood
Example #28
 def __init__(self, word2index_dic='', userdic=None):
     # Initialize the morphological analyzer
     self.komoran = Komoran(userdic=userdic)
     # POS tags to exclude: particles, symbols, endings, and suffixes
     # Reference: http://docs.komoran.kr/firststep/postypes.html
     self.exclusion_tags = [
         'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC', 'SF', 'SP', 'SS', 'SE', 'SO', 'EP', 'EF', 'EC', 'ETN', 'ETM',
         'XSN', 'XSV', 'XSA'
     ]
     if word2index_dic != '':
         with open(word2index_dic, "rb") as f:
             self.word_index = pickle.load(f)
     else:
         self.word_index = None
Example #29
    def __init__(self, userdic=None):
        # Initialize the morphological analyzer
        self.komoran = Komoran(userdic=userdic)
        print(self.komoran)

        # POS tags to exclude:
        # particles and symbols,
        # endings,
        # and suffixes
        self.exclusion_tags = [
            'JKS', 'JKC', 'JKG', 'JKB', 'JKV', 'JKQ', 'JKO', 'JC', 'SF', 'SP',
            'SS', 'SE', 'SO', 'EP', 'EF', 'EC', 'ETN', 'ETM', 'XSN', 'XSV',
            'XSA'
        ]  # 'JK' in the original is not a Komoran tag; 'JKO' (object particle) is the likely intent
Example #30
def run():

    tr = TextRank()
    from konlpy.tag import Komoran

    # Text summarization
    tagger = Komoran()
    stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV')])
    tr.loadSents(
        RawSentence(sys.argv[1]), lambda sent: filter(
            lambda x: x not in stopword and x[1] in
            ('NNG', 'NNP', 'VV', 'VA'), tagger.pos(sent)))
    tr.build()
    print(json.dumps(tr.summarize()))
Example #31
# -*- coding:utf8 -*-
# Ported from Python 2: the reload/setdefaultencoding hack and urllib2 are gone;
# requests (already imported in the original) handles the fetch.
import json
import operator
import collections

import requests

from konlpy.tag import Komoran

komoran = Komoran()

# Fetch all titles (one per line) and reduce each title to its nouns
allTitle = requests.get("http://polatics.news/all").text.split('\n')
allTitle = "\n".join([" ".join(komoran.nouns(t)) for t in allTitle])

vocaAll = allTitle.split()
print(len(vocaAll))

# Just excluding tokens of length 1 or shorter already removes a lot of noise
vocaAll = [v.strip() for v in vocaAll if len(v) > 1]

# voca_topK: a list of (word, count) tuples
# There is no need to do this for every token;
# rare tokens would only add unhelpful relations.
voca_topK = collections.Counter(vocaAll).most_common(300)
Example #32
VERBOSE = 0

# Ported from Python 2: the reload/setdefaultencoding hack is gone,
# and the imports the script actually uses are added.
import sys
from optparse import OptionParser

from konlpy.tag import Komoran


if __name__ == '__main__':

    parser = OptionParser()
    parser.add_option("--verbose", action="store_const", const=1, dest="verbose", help="verbose mode")
    (options, args) = parser.parse_args()

    if options.verbose:
        VERBOSE = 1

    komoran = Komoran()

    # Read lines from stdin and print one CoNLL-style row per morpheme
    while True:
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break
        if not line:
            break

        analyzed = komoran.pos(line)
        seq = 1
        for morph, tag in analyzed:
            tp = [seq, morph, morph, tag, tag, '_', 0, '_', '_', '_']
            print('\t'.join([str(e) for e in tp]))
            seq += 1
Example #33
from konlpy.tag import Komoran
komoran = Komoran()  # renamed from 'twitter': this is a Komoran tagger, not Twitter

text = open('190747347803005_191149761096097', 'r', encoding='utf8').read()
print(text)
print('---------------')
sentence = []
for i in komoran.pos(text):
    # if i[1] == 'Unknown' or i[1] == 'Punctuation':  # leftover checks from a Twitter-tagger version
    #     continue
    if i[1] == 'SF' or i[1] == 'SE' or i[1] == 'SP' or i[1] == 'SS':  # skip punctuation tags
        continue
    sentence.append(i[0])
    if i[1] == 'EF':  # a sentence-final ending marks the end of a sentence
        print(''.join(sentence))
        sentence = []