def load_w2v(section, target, max_n, season='all'):
    model = word2vec.Word2Vec.load(f'{save_dir}/{section}_{season}.model')
    li = model.wv.most_similar(positive=[target], topn=max_n)
    komoran = Komoran()
    word_list = []
    dist_list = []
    for word, dist in li:
        temp = [tt[1] for tt in komoran.pos(word)]
        # Keep only words containing a common (NNG) or proper (NNP) noun
        if len(set(temp).intersection(["NNG", "NNP"])) != 0:
            word_list.append(word)
            dist_list.append(dist)
    # Save the nearest words and their distances to CSV
    df = pd.DataFrame({'word': word_list, 'dist': dist_list})
    df.to_csv(f'{save_dir}/{section}_{target}_{season}.csv', encoding='ms949')
    # Save a scatter plot of the embeddings
    word_list.append('미세먼지')
    x = model.wv[word_list]  # index via .wv; direct model indexing was removed in gensim 4
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=2000)
    x_tsne = tsne.fit_transform(x)
    df2 = pd.DataFrame(x_tsne, index=word_list, columns=['x', 'y'])
    plt.figure(figsize=(16, 9))
    plt.scatter(df2['x'], df2['y'])
    for word, pos in df2.iterrows():
        if word == '미세먼지':
            plt.annotate(word, pos, color='red')
        else:
            plt.annotate(word, pos, va='bottom')
    plt.savefig(f'{save_dir}/{section}_{target}_{season}.png')
    plt.close()

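# Hypothetical call for load_w2v() above; 'society' is an illustrative
# section name, and save_dir / word2vec / TSNE / pd / plt are assumed to
# be module-level globals in the original source.
load_w2v('society', '미세먼지', max_n=100, season='winter')
# -> writes {save_dir}/society_미세먼지_winter.csv and a matching .png
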
def sentences_komoran(filelist):
    komoran = Komoran()
    sentences = []
    for i, file in enumerate(filelist):
        with open(file, 'r', encoding='utf-8') as fp:
            while True:
                try:
                    line = fp.readline()
                    if not line:
                        break
                    line = re.sub("\xa0", " ", line).strip()
                    if line == "":
                        continue
                    tokens = komoran.nouns(line)
                    if len(tokens) == 0:
                        continue
                    sentences.append(tokens)
                except Exception as e:
                    print(e)
                    continue
    return sentences

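# A minimal sketch (not from the original source) of training the
# Word2Vec model that load_w2v() above loads; the corpus glob, the
# 'society_all' file name, and the hyperparameters are assumptions,
# and save_dir is the same module-level global load_w2v() uses.
# gensim >= 4 uses vector_size; releases before 4.0 call it size.
from glob import glob
from gensim.models import word2vec

sentences = sentences_komoran(glob('./corpus/*.txt'))
model = word2vec.Word2Vec(sentences, vector_size=100, window=5, min_count=5)
model.save(f'{save_dir}/society_all.model')
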
def __init__(self, tagger=None):
    if tagger:
        self.tagger = tagger
    else:
        from konlpy.tag import Komoran
        self.tagger = Komoran()
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

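# A standalone sketch of what the rgxSplitter pattern in these __init__
# snippets does: re.split() with the capturing group keeps each
# sentence-final mark, and the (?![0-9]) lookahead avoids splitting
# decimals like 3.14. The sample text is illustrative.
import re

rgx = re.compile('([.!?:](?:["\']|(?![0-9])))')
parts = rgx.split('첫 문장입니다. 소수점 3.14는 안 쪼갭니다! 끝.')
# Re-attach each captured delimiter to the chunk before it
sents = [(parts[i] + parts[i + 1]).strip() for i in range(0, len(parts) - 1, 2)]
print(sents)  # ['첫 문장입니다.', '소수점 3.14는 안 쪼갭니다!', '끝.']
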
def __init__(self, filepath, tagger=None):
    if tagger:
        self.tagger = tagger
    else:
        self.tagger = Komoran()
    self.filepath = filepath
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

def words_check(request):
    # Initialize the tagger and read the request payload
    data = request.POST.get('data')
    komoran = Komoran()
    words = Counter(komoran.nouns(data))
    print(words.keys())
    # Filter out single-character words
    nouns = dict()
    for word in words.keys():
        if len(word) != 1:
            nouns[word] = words.get(word)
    nouns = sorted(nouns.items(), key=lambda x: x[1], reverse=True)
    hashing = random.choice(range(100))
    context = {
        'nouns': nouns,
        'hashing': hashing,
    }
    # Render the word cloud
    taglist = pytagcloud.make_tags(nouns, minsize=10, maxsize=60)
    link = 'static/wordcloud/wordcloud' + str(hashing) + '.jpg'
    #link = 'static/wordcloud/wordcloud.jpg'
    pytagcloud.create_tag_image(taglist, link, size=(600, 600), layout=3,
                                fontname='CookieRun', rectangular=True)
    return HttpResponse(json.dumps(context), content_type='application/json')

def __init__(self, textIter, tagger=None):
    if tagger:
        self.tagger = tagger
    else:
        self.tagger = Komoran()
    if type(textIter) == str:
        self.textIter = textIter.split('\n')
    else:
        self.textIter = textIter
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

def parse_sentence_pos(line):
    komoran = Komoran()
    idx, raw, label = line.split('\t')
    pos = ""
    # Tag the raw sentence, not the whole tab-separated line
    for elem in komoran.pos(raw):
        pos += elem[0] + '/' + elem[1] + '|'
    pos = pos[:-1]  # strip the trailing '|'
    return idx, pos, raw, label

def __init__(self, file_path, tagger=None):
    if tagger:
        self.tagger = tagger
    else:
        from konlpy.tag import Komoran
        self.tagger = Komoran(userdic='./text_rank/dic.txt')
    self.file_path = file_path
    self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

def __init__(self, path="curse_detection/dataset/long.txt", one_hot=False, max_len=30): self.path = path self.one_hot = one_hot # True: [0~1, 0~1] False: 0~1 self.max_len = max_len self.komoran = Komoran()
def __init__(self, driver, db, taggedSen):
    #self.searchInput = searchInput
    #self.searchInputAnnex = searchInputAnnex
    self.driver = driver
    self.db = db
    self.taggedSen = taggedSen
    from konlpy.tag import Komoran
    self.Komoran = Komoran()

def token(doc):
    # kkma = Kkma()
    km = Komoran()
    pos_doc = []
    for doc_item in doc:
        for pos in km.pos(doc_item):
            # Keep only modifiers (tags starting with 'M': MM, MAG, MAJ)
            if pos[1][0] == 'M':
                pos_doc.append(pos[0])
    return pos_doc

def __init__(self, userdic=None):
    from konlpy.tag import Komoran
    import os
    if userdic is not None:
        print("user dict " + str(os.path.abspath(userdic)))
        self.inst = Komoran(userdic=os.path.abspath(userdic))
    else:
        self.inst = Komoran()
    self.OUT_TYPE = [list, tuple]

def samerank(db, emotion_dict):
    # Output a preprocessing file for NULL rows - resolves ties between 1st and 2nd place
    komoran = Komoran()
    cursor = db.cursor()
    emotion = ['happy', 'enjoy', 'comfort', 'horror', 'angry', 'sad']
    sql = "SELECT DISTINCT title, artist, lyrics FROM musicl WHERE (DATE, ranking) IN (SELECT DATE, ranking FROM emoti_test WHERE rank1 IS NULL)"
    cursor.execute(sql)
    null_data = cursor.fetchall()
    null_data = pd.DataFrame(null_data, columns=['제목', '가수', '가사'])
    null_data_rating = pd.DataFrame(columns=['제목', '가수', '순위', '수치'])
    for title, singer, lyrics in null_data.values:
        # Split the lyrics into morphemes
        lyrics = lyrics.replace('\n', '')
        words_temp = komoran.morphs(lyrics)
        # Counters for the six emotions
        happy = 0
        enjoy = 0
        comfort = 0
        angry = 0
        horror = 0
        sad = 0
        # Sentiment analysis of the lyrics
        lyrics_emotion = pd.DataFrame(index=emotion)
        for word in words_temp:
            if word in emotion_dict['happy']:
                happy += 1
            if word in emotion_dict['enjoy']:
                enjoy += 1
            if word in emotion_dict['comfort']:
                comfort += 1
            if word in emotion_dict['angry']:
                angry += 1
            if word in emotion_dict['horror']:
                horror += 1
            if word in emotion_dict['sad']:
                sad += 1
        # Sort to see which emotion appeared most
        # (counts must follow the same order as the index above)
        result_emotion = [happy, enjoy, comfort, horror, angry, sad]
        lyrics_emotion[0] = result_emotion
        rating = lyrics_emotion[0].sort_values(ascending=False).index
        value = lyrics_emotion[0].sort_values(ascending=False).values
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        new_row = pd.DataFrame([{
            '제목': title.strip(),
            '가수': singer,
            '순위': list(rating),
            '수치': list(value)
        }])
        null_data_rating = pd.concat([null_data_rating, new_row], ignore_index=True)
    null_data_rating.to_excel('data/samepointSong.xlsx')

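# A sketch of the emotion_dict shape samerank() consumes: one entry per
# emotion key, each a container of morphemes testable with `in`.
# The example words are illustrative, not from the real lexicon.
emotion_dict = {
    'happy':   {'행복', '기쁘'},
    'enjoy':   {'즐겁', '신나'},
    'comfort': {'편안', '포근'},
    'horror':  {'무섭', '두렵'},
    'angry':   {'화나', '분노'},
    'sad':     {'슬프', '눈물'},
}
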
def noun_tokenizer(df_col_name, preprocessed_df):
    from konlpy.tag import Komoran
    komoran = Komoran()
    # Accumulate results in a module-level list named after the column
    globals()["noun_" + str(df_col_name)] = []
    i = 0
    for words in preprocessed_df:
        i += 1
        globals()["noun_" + str(df_col_name)].append(komoran.nouns(words))
        print('row ' + str(i) + ' finished')
    print("noun_tokenized")

def __init__(self, word_model, rec_dao, search_dao, movie_info_dao):
    self.word_model = word_model
    self.rec_dao = rec_dao
    self.search_dao = search_dao
    self.movie_info_dao = movie_info_dao
    self.komoran = Komoran()
    self.flatten = itertools.chain.from_iterable
    self.pos_targets = {
        'NNG', 'NNP', 'VV', 'VA', 'MAG', 'VX', 'NF', 'NV', 'XR'
    }

def sentence_pos(sentence):
    print('# before user dic')
    komo = Komoran()
    result = komo.pos(sentence)
    print('checking all tokens')
    for myitem in result:
        somedata = 'word : %s, POS : %s' % (myitem[0], myitem[1])
        print(somedata)
    print('-' * 30)
    return result

def tokenize_komoran(doc):
    komoran = Komoran()
    result = []
    for token in komoran.pos(doc):
        # Drop punctuation tags (SP, SF, SE, SO)
        if not token[1] in ['SP', 'SF', 'SE', 'SO']:
            result.append('/'.join(token))
    return result

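# A minimal sketch of plugging tokenize_komoran() into scikit-learn;
# the documents are illustrative. Note that each call constructs a new
# Komoran instance, so vectorizing a large corpus this way is slow.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['오늘 날씨가 좋다', '내일은 비가 온다']
vectorizer = TfidfVectorizer(tokenizer=tokenize_komoran, lowercase=False)
tfidf = vectorizer.fit_transform(docs)
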
def kor_tokenizer(list_sentences):
    komoran = Komoran(max_heap_size=1024)
    list_output = []
    for sentence in list_sentences:
        # Keep only Hangul and whitespace before tokenizing
        sentence = re.sub(r"[^가-힣\s]", "", sentence)
        tokenized_sentence = komoran.morphs(sentence)
        list_output.append(tokenized_sentence)
    with open('./result/tokens.pickle', 'wb') as f:
        pickle.dump(list_output, f, pickle.HIGHEST_PROTOCOL)
    return list_output

def morpheme(sentence):
    komoran = Komoran()
    morphs = komoran.pos(sentence)
    #print(morphs)
    noun_morph = []
    for morph in morphs:
        # Collect proper nouns (NNP) only
        if morph[1] == 'NNP':
            noun_morph.append(morph[0])
    return noun_morph

def interactive_shell(self, tags, processing_word):
    komoran = Komoran()  # the original named this 'kkma', but it is a Komoran tagger
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, self.config.model_output)
        self.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
You can enter a sentence like
input> I love Paris""")
        while True:
            try:
                try:  # for python 2
                    sentence = raw_input("input> ")
                except NameError:  # for python 3
                    sentence = input("input> ")
                if "exit" in sentence:
                    break
                print(sentence)
                sentence = sentence.split(" ")
                print(sentence)
                words_raw = []
                words_list = []
                positions = []
                for i, sen in enumerate(sentence):
                    if sen is None:
                        continue
                    poses = komoran.pos(sen)
                    for pos in poses:
                        words_raw.append(pos[0] + "/" + pos[1])
                        words_list.append(pos[0])
                        positions.append(i + 1)
                print(words_raw)
                words = [processing_word(w) for w in words_raw]
                for w in words:
                    print(w)
                if type(words[0]) == tuple:
                    words = zip(*words)
                print("go batch")
                pred_ids, _ = self.predict_batch(sess, [words], [positions])
                preds = [idx_to_tag[idx] for idx in list(pred_ids[0])]
                print_sentence(self.logger, {"x": words_list, "y": preds})
            except Exception:
                pass

def process(sentence):
    ko = Komoran()
    pos = ko.pos(sentence)
    # print(pos)
    # Tags to drop: particles (J*), endings (E*), and unanalyzable tokens (NA)
    exceptions = ['J', 'E', 'JX', 'EC', 'JKS', 'EF', 'NA']
    s = ''
    for x in pos:
        if x[1] not in exceptions:
            s += x[0]
            s += ' '
    return s

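# Usage sketch for process() above: tags listed in `exceptions` are
# dropped and the surviving morphemes come back space-joined (the
# exact morphemes depend on the installed Komoran model).
cleaned = process('저는 학교에 갑니다')
print(cleaned)
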
def get_recommend_query(query):
    all_products = SpProduct.objects.values('name')
    results = []
    # Exact match: collect names that literally contain the query string
    for i in range(len(all_products)):
        if query in all_products[i]['name']:
            results.append(all_products[i]['name'])
    # Tokenizer
    komoran = Komoran()
    query_tokens = komoran.pos(query)
    all_products_tokens = []
    for i in range(len(all_products)):
        all_products_tokens.append(komoran.pos(all_products[i]['name']))
    # Token match: collect names whose token list contains the first query token
    for i in range(len(all_products_tokens)):
        if query_tokens[0] in all_products_tokens[i]:
            results.append(all_products[i]['name'])
    # Apply the same logic to NspProduct
    all_nsp_products = NspProduct.objects.values('name')
    for i in range(len(all_nsp_products)):
        if query in all_nsp_products[i]['name']:
            results.append(all_nsp_products[i]['name'])
    all_products_nsp_tokens = []
    for i in range(len(all_nsp_products)):
        all_products_nsp_tokens.append(komoran.pos(all_nsp_products[i]['name']))
    for i in range(len(all_products_nsp_tokens)):
        if query_tokens[0] in all_products_nsp_tokens[i]:
            results.append(all_nsp_products[i]['name'])
    # Related items: collect products that share the same NspProduct class
    results_copy = results.copy()
    for r in results_copy:
        sp_product_ = SpProduct.objects.filter(name=r)
        nsp_product_ = NspProduct.objects.filter(name=r)
        if len(sp_product_) > 0:
            sp_product_nsp_id = sp_product_.values()[0]['product_id']
            related_nsp = NspProduct.objects.filter(id=sp_product_nsp_id)
            # Add the related NspProduct
            results.append(related_nsp.values()[0]['name'])
            # Add the SpProducts related to that NspProduct
            related_sp = SpProduct.objects.filter(
                product_id=sp_product_nsp_id).values()
            for sp in related_sp:
                results.append(sp['name'])
        if len(nsp_product_) > 0:
            nsp_product_id = nsp_product_.values()[0]['id']
            related_sp = SpProduct.objects.filter(
                product_id=nsp_product_id).values()
            # Add the related SpProducts
            for sp in related_sp:
                results.append(sp['name'])
    # De-duplicate and return the union of all three cases above
    results = list(set(results))
    return results

def question_generator(text):
    komo = Komoran()
    question = []
    for value, tag in set(komo.pos(text)):
        if tag == 'NNP' and value != '제가':
            question.append('왜 ' + value + '인가요?')
            question.append(value + '에 대해서 자신이 아는대로 설명해주세요.')
            question.append(value + '의 장단점이 무엇이라고 생각하시나요?')
    # Sample once so the printed questions match the returned ones
    picked = sample(question, 3)
    print(picked)
    return picked

class Tagger:
    def __init__(self, mode: str = "nouns"):
        """konlpy pos tagger"""
        self.tagger = Komoran()
        self.mode = mode  # nouns, morphs

    def __call__(self, *args, **kwargs) -> list:
        if self.mode == "nouns":
            return self.tagger.nouns(*args, **kwargs)
        elif self.mode == "morphs":
            return self.tagger.morphs(*args, **kwargs)

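# Usage sketch for Tagger above; the sentence is illustrative.
noun_tagger = Tagger(mode="nouns")
print(noun_tagger("오늘 서울 날씨가 좋다"))   # noun tokens only
morph_tagger = Tagger(mode="morphs")
print(morph_tagger("오늘 서울 날씨가 좋다"))  # full morpheme sequence
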
def __init__(self, fpath):
    self.dataframe = pd.read_csv(fpath, encoding='utf-8')
    self.dataframe.keyword = self.dataframe.keyword.apply(literal_eval)
    plt.style.use('seaborn-darkgrid')
    font_name = './static/fonts/AppleSDGothicNeo.ttc'
    font_family = fm.FontProperties(fname=font_name).get_name()
    plt.rcParams['font.family'] = font_family
    plt.rcParams['font.size'] = 18
    self.komoran = Komoran(userdic='./data/user_dic.tsv')

def question_generator(text):
    komo = Komoran()
    question = []
    for value, tag in set(komo.pos(text)):
        if tag == 'NNP' and value != '제가' and '!' not in value:
            question.append('왜 ' + value + '인가요?')
            question.append(value + '에 대해서 설명해주세요.')
            question.append(value + '의 장단점이 무엇이라고 생각하시나요?')
            question.append(value + '에 대해 영어로 묘사해주세요.')
    # print(sample(question, 3))
    return sample(question, 3) if len(question) >= 3 else []

def calculate_result(self, text):
    komoran = Komoran()
    pos_l = list(komoran.pos(text))
    # change format to compare with polarity.xlsx
    for i in range(len(pos_l)):
        pos_l[i] = "/".join(list(pos_l[i]))
    POS = 0
    NEG = 0
    l = []
    i = 0
    while i < len(pos_l):
        if pos_l[i] in self.sentiments:
            text = pos_l[i]
            # Try to extend the match to two- and three-token expressions
            if i + 1 < len(pos_l):
                if pos_l[i + 1] in self.sentiments[pos_l[i]]:
                    text = pos_l[i] + ";" + pos_l[i + 1]
                    i += 1
                    if i + 1 < len(pos_l):
                        if pos_l[i] + ";" + pos_l[i + 1] in self.sentiments[pos_l[i - 1]]:
                            text = pos_l[i - 1] + ";" + pos_l[i] + ";" + pos_l[i + 1]
                            index = self.sentiments[pos_l[i - 1]][0] + self.sentiments[pos_l[i - 1]].index(
                                pos_l[i] + ";" + pos_l[i + 1])
                            i += 1
                        else:
                            index = self.sentiments[pos_l[i - 1]][0] + self.sentiments[pos_l[i - 1]].index(pos_l[i])
                    else:
                        index = self.sentiments[pos_l[i - 1]][0] + self.sentiments[pos_l[i - 1]].index(pos_l[i])
                else:
                    index = self.sentiments[pos_l[i]][0]
            else:
                index = self.sentiments[pos_l[i]][0]
            # Truncate the polarity scores to two decimal places
            neg = int(float(self.sentiment_file.cell(row=index, column=2).value) * 100) / 100
            pos = int(float(self.sentiment_file.cell(row=index, column=3).value) * 100) / 100
            POS += pos
            NEG += neg
            l.append((text, pos, neg))
        i += 1
    POS = int(POS * 100) / 100
    NEG = int(NEG * 100) / 100
    mood = (POS + NEG)
    print("CURRENT MOOD IS POS:{},NEG:{}, TOTAL:{}".format(POS, NEG, mood))
    return pos_l, POS, NEG, mood

def __init__(self, word2index_dic='', userdic=None):
    # Initialize the morphological analyzer
    self.komoran = Komoran(userdic=userdic)

    # POS tags to exclude: particles, symbols, endings, and suffixes
    # Reference: http://docs.komoran.kr/firststep/postypes.html
    self.exclusion_tags = [
        'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC',
        'SF', 'SP', 'SS', 'SE', 'SO',
        'EP', 'EF', 'EC', 'ETN', 'ETM',
        'XSN', 'XSV', 'XSA'
    ]

    if word2index_dic != '':
        with open(word2index_dic, "rb") as f:
            self.word_index = pickle.load(f)
    else:
        self.word_index = None

def __init__(self, userdic=None):
    # Initialize the morphological analyzer
    self.komoran = Komoran(userdic=userdic)
    print(self.komoran)

    # POS tags to exclude: particles, symbols, endings, and suffixes
    self.exclusion_tags = [
        'JKS', 'JKC', 'JKG', 'JKB', 'JKV', 'JKQ', 'JKO', 'JC',
        'SF', 'SP', 'SS', 'SE', 'SO',
        'EP', 'EF', 'EC', 'ETN', 'ETM',
        'XSN', 'XSV', 'XSA'
    ]

def run():
    tr = TextRank()
    from konlpy.tag import Komoran
    # Text summarization
    tagger = Komoran()
    stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV')])
    tr.loadSents(
        RawSentence(sys.argv[1]),
        lambda sent: filter(
            lambda x: x not in stopword and x[1] in ('NNG', 'NNP', 'VV', 'VA'),
            tagger.pos(sent)))
    tr.build()
    print(json.dumps(tr.summarize()))

# -*- coding:utf8 -*-
import collections
from urllib.request import urlopen

from konlpy.tag import Komoran

komoran = Komoran()

allTitle = urlopen("http://polatics.news/all").read().decode('utf-8').split('\n')
allTitle = "\n".join([" ".join(komoran.nouns(t)) for t in allTitle])
vocaAll = allTitle.split()
print(len(vocaAll))

# Just dropping tokens of length 1 or less removes a surprising amount of noise
vocaAll = [v.strip() for v in vocaAll if len(v) > 1]

# result: list of tuples
# There is no need to do this for every token -
# it would only add relations that are not useful.
voca_topK = collections.Counter(vocaAll).most_common(300)

VERBOSE = 0

import sys
from optparse import OptionParser

from konlpy.tag import Komoran

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("--verbose", action="store_const", const=1,
                      dest="verbose", help="verbose mode")
    (options, args) = parser.parse_args()
    if options.verbose:
        VERBOSE = 1

    komoran = Komoran()
    while 1:
        try:
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break
        if not line:
            break
        analyzed = komoran.pos(line)
        # Emit one CoNLL-style row per morpheme
        seq = 1
        for morph, tag in analyzed:
            tp = [seq, morph, morph, tag, tag, '_', 0, '_', '_', '_']
            print('\t'.join([str(e) for e in tp]))
            seq += 1

from konlpy.tag import Komoran

komoran = Komoran()

text = open('190747347803005_191149761096097', 'r', encoding='utf-8').read()
print(text)
print('---------------')

sentence = []
for i in komoran.pos(text):
    #if i[1] == 'Unknown' or i[1] == 'Punctuation':
    #    continue
    # Skip punctuation tags
    if i[1] == 'SF' or i[1] == 'SE' or i[1] == 'SP' or i[1] == 'SS':
        continue
    sentence.append(i[0])
    # A sentence-final ending (EF) closes the current sentence
    if i[1] == 'EF':
        print(''.join(sentence))
        sentence = []