import re
import operator

from konlpy.tag import Kkma
from textblob import TextBlob


def keyword_extractor(title, highlight):
    konl = Kkma()
    # English noun phrases; non-letters are replaced with spaces before
    # parsing (the original used ',' for the title, almost certainly a typo)
    eng_title = TextBlob(re.sub("[^A-Za-z]", " ", title.strip())).noun_phrases
    eng_highlight = TextBlob(re.sub("[^A-Za-z]", " ", highlight.strip())).noun_phrases
    # Korean nouns
    title_nouns = konl.nouns(title)
    highlight_nouns = konl.nouns(highlight)

    # Title nouns are weighted 2; each highlight occurrence adds 1
    keyword_list = {i: 2 for i in title_nouns}
    for i in highlight_nouns:
        keyword_list[i] = keyword_list.get(i, 0) + 1
    for i in eng_title:
        keyword_list[i] = 2
    for i in eng_highlight:
        keyword_list[i] = keyword_list.get(i, 0) + 1

    # Keep the ten highest-scoring keywords
    keyword_list = sorted(keyword_list.items(), key=operator.itemgetter(1), reverse=True)[:10]
    # `keywords` is built but, as in the original, the sorted list is returned
    keywords = {}
    for i, k in enumerate(keyword_list):
        keywords["k" + str(i)] = k[0]
    return keyword_list
def get_nouns(file_path):
    from konlpy.tag import Kkma
    km = Kkma()
    noun_list = []
    stopwords = get_stopwords()
    with open(file_path, 'r') as f:
        # Kkma.nouns() errors out on the whole file at once,
        # so the text is processed line by line
        for line in f.readlines():
            for noun in km.nouns(line):
                if noun not in stopwords:
                    noun_list.append(noun)
    return noun_list
def tokenize(self, text):
    try:
        from konlpy.tag import Kkma
    except ImportError:
        raise ValueError("Korean tokenizer requires konlpy. "
                         "Install it with 'pip install konlpy'.")
    kkma = Kkma()
    return kkma.nouns(text)
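# A minimal standalone sketch of the same import-guard pattern, outside a
# class (the helper name `korean_nouns` is hypothetical, not from the source):
def korean_nouns(text):
    try:
        from konlpy.tag import Kkma
    except ImportError:
        raise ValueError("Korean tokenizer requires konlpy. "
                         "Install it with 'pip install konlpy'.")
    return Kkma().nouns(text)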
def get_tags(text, ntags=50, multiplier=10):
    # h = Hannanum()
    h = Kkma()
    nouns = h.nouns(text)
    count = Counter(nouns)
    # color() is defined elsewhere in the source module
    return [{'color': color(), 'tag': n, 'size': c * multiplier * 2}
            for n, c in count.most_common(ntags)]
import numpy as np
from konlpy.tag import Kkma
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def atemedic(df701, ingredients):
    # Columns: ('item_seq', 'patient', 'prohibit_ingredient_id')
    df801 = df701.iloc[:, [0, 5, 6]]
    df801 = df801.drop_duplicates()
    list5 = df801.values.tolist()
    doc3_list = np.array(list5).flatten().tolist()

    # Keep every second element (the text column)
    doc4_list = doc3_list[1::2]

    # Extract nouns from each document and re-join them for TF-IDF
    kkma = Kkma()
    list7 = []
    for doc in doc4_list[1:]:
        list7.append(' '.join(kkma.nouns(doc)))

    tfidf_vect_simple = TfidfVectorizer()
    list7.insert(0, ingredients)
    feature_vect_simple = tfidf_vect_simple.fit_transform(list7)

    # Cosine similarity between the query (row 0) and every document.
    # NOTE: as in the original, index 0 of the similarity row is the query
    # itself, so it always passes the nonzero test.
    similarity_simple_pair = cosine_similarity(feature_vect_simple[0], feature_vect_simple)
    list8 = [i for i in range(len(list5)) if similarity_simple_pair[0, i] != 0.0]
    return doc3_list, list8
def set_knlpy(self):
    """
    Kkma.nouns() lists each word only once and is slow, but its results
    are clean. Hannanum.nouns() lists a word on every occurrence (so
    frequencies can be counted) and is fast, but its results are rougher.
    """
    k = Kkma()
    # k = Hannanum()
    if not k:
        logger.error("must have konlpy!")
        exit()
    # The first call loads the dictionary, so time it separately
    startTime = time.time()
    k.nouns("initial ")
    self.k = k
    checkTime = time.time() - startTime
    logger.debug("initial time : %f", checkTime)
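# A minimal sketch contrasting the two taggers on one sentence, assuming both
# konlpy backends are installed; the dedup/speed trade-off is the one claimed
# in the docstring above, and exact output varies by konlpy version.
from konlpy.tag import Kkma, Hannanum

sentence = u'자연어 처리는 재미있다. 자연어 처리를 배우자.'
print(Kkma().nouns(sentence))      # each noun listed once; slower, cleaner
print(Hannanum().nouns(sentence))  # one entry per occurrence; faster, rougher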
def search_keyword():
    url = 'https://openapi.naver.com/v1/search/book.json?'
    client_id = "QpvvkiISGC1mn16KVb3d"
    client_secret = "ukKNKa8DVk"
    keyword = request.form.get('keyword')
    query_string = "query=" + keyword + "&display=10&start=1&sort=count"
    header = {
        "X-Naver-Client-Id": client_id,
        "X-Naver-Client-Secret": client_secret
    }
    r = requests.get(url + query_string, headers=header)
    books = json.loads(r.text)['items']

    all_text = ''
    for book in books:
        all_text += book['description']

    # Split into sentences, then keep nouns longer than two characters
    # that do not start with a digit
    kkma = Kkma()
    ex_sent = kkma.sentences(all_text)
    nouns = []
    for sent in ex_sent:
        for noun in kkma.nouns(sent):
            if len(str(noun)) > 2 and not match('^[0-9]', noun):
                nouns.append(noun)

    # Sort by frequency so chart labels and values stay aligned (the original
    # sorted the noun strings themselves, which misordered the chart)
    nouns_count = Counter(nouns)
    chart_index = [n for n, _ in nouns_count.most_common()]
    chart_values = [c for _, c in nouns_count.most_common()]
    return jsonify({'result': 'success', 'books': books,
                    'chart_index': chart_index, 'chart_values': chart_values})
import pandas as pd
from konlpy.tag import Kkma


class bot(object):
    def __init__(self):
        self.kkma = Kkma()
        self.Prime_Word = pd.read_csv('1. Word Table.csv', encoding='CP949')
        self.Answer_DB = pd.read_csv('2. Info DB.csv', encoding='CP949')
        self.Sentence_DB = pd.read_csv('3. Info DB Sentence.csv', encoding='CP949')

    def Conversation(self, chat_text):
        # Extract the nouns from the user's utterance
        return self.kkma.nouns(chat_text)

    def Translating_Word(self, Words):
        # Map each extracted word onto its canonical form
        for i in range(len(Words)):
            for j in range(len(self.Prime_Word)):
                if Words[i] == self.Prime_Word.loc[j, 'Word']:
                    Words[i] = self.Prime_Word.loc[j, 'Mapping']
        return Words

    def Answering(self, Words):
        # Return the first answer template whose slot matches a word
        for i in range(len(Words)):
            for j in range(len(self.Answer_DB)):
                if Words[i] == self.Answer_DB.loc[j, '항목']:
                    Answer_Sentence = self.Sentence_DB.loc[j, '문장1']
                    Answer_Sentence = Answer_Sentence.replace(
                        '[' + self.Answer_DB.loc[j, '항목'] + ']',
                        self.Answer_DB.loc[j, '#1'])
                    return Answer_Sentence
        return '없음'
def get_q_type(dbconn, cursor, u_text):
    # rows defaults to empty so a failed query degrades gracefully
    rows = []
    try:
        cursor.execute("""
            SELECT Q_TEXT, A_TEXT, Q_MORPHEMES, Q_TYPE
            FROM TBL_QNA_CHAT_SET_LIST
        """)
        rows = cursor.fetchall()
    except Exception as e:
        print(f'error! >> get_q_type >> {e}')

    q_type = 0
    all_values = [row for row in rows]

    # Full-text similarity: collect the Q_TYPE of every stored question whose
    # text matches the user input at 55% or better
    match_q_type_list = []
    for all_value in all_values:
        matchPer = SequenceMatcher(None, u_text, all_value[0]).ratio() * 100
        if matchPer >= 55:
            match_q_type_list.append(all_value[3])
            print(f'[{matchPer}% match][Q_type : {all_value[3]}] {all_value[0]}')
    if len(match_q_type_list) > 0:
        q_type = sorted(match_q_type_list, reverse=True)[0]

    # No predefined question matched: fall back to noun overlap so a similar
    # existing question can still be surfaced (and the user can be guided to
    # register an answer directly)
    if q_type == 0:
        kkma = Kkma()
        # Nouns from the user's question
        u_text_nouns = kkma.nouns(u_text)
        q_text_nouns_group = []
        for all_value in all_values:
            # Nouns stored for each predefined question
            if all_value[2] != '[]':
                q_text_nouns_group.append([ast.literal_eval(all_value[2]), all_value[3]])

        point_list = []
        for q_text_nouns in q_text_nouns_group:
            match_point = 0
            for q_noun in q_text_nouns[0]:
                for u_noun in u_text_nouns:
                    if q_noun == u_noun:
                        match_point += 1
            if match_point > 0:
                point_list.append([match_point, q_text_nouns[1]])
        if len(point_list) > 0:
            point_list = sorted(point_list, reverse=True)
            print(point_list[0][0])
            q_type = point_list[0][1]

    print(q_type)
    return [q_type, all_values]
from konlpy.tag import Kkma


def tokenizer(text_data, stopword_list):
    kkma = Kkma()
    texts = []
    for t in text_data:
        # Keep only the nouns that are not stopwords
        tmp = [n for n in kkma.nouns(t) if n not in stopword_list]
        texts.append(tmp)
    return texts
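# Example call for tokenizer() above, assuming konlpy is installed; the exact
# noun segmentation depends on the Kkma dictionary version.
docs = [u'도서관은 우측에 있습니다.', u'도서관에서 책을 읽습니다.']
print(tokenizer(docs, stopword_list=[u'우측']))  # stopwords are filtered out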
import numpy as np
from konlpy.tag import Kkma
from sklearn.feature_extraction.text import TfidfVectorizer


class Doc2Vec:
    def __init__(self):
        # Despite the name, documents are vectorized with TF-IDF
        self.doc2vec = TfidfVectorizer()
        self.kkma = Kkma()

    def fit_doc2vec(self, doc_nouns):
        self.doc2vec.fit(doc_nouns)

    def get_nouns(self, doc):
        return self.kkma.nouns(doc)

    def get_split(self, doc):
        return doc.split(' ')

    def get_vec(self, doc):
        return self.doc2vec.transform([doc]).todense()

    def cos_similarity(self, vect1, vect2):
        dot_product = np.dot(vect1, vect2.reshape(-1, 1))
        l2_norm = (np.sqrt(np.sum(np.square(vect1), axis=-1)) *
                   np.sqrt(np.sum(np.square(vect2), axis=-1)))
        return dot_product / l2_norm

    def get_score(self, doc1, doc2):
        vect1 = self.get_vec(doc1)
        vect2 = self.get_vec(doc2)
        return self.cos_similarity(vect1, vect2)

    def get_similarity(self, doc1, doc2):
        vec1 = self.get_vec(doc1)
        vec2 = self.get_vec(doc2)
        return np.dot(vec1, vec2.T)
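# A small usage sketch for the class above: documents are reduced to
# space-joined nouns, the TF-IDF vocabulary is fitted, and two documents are
# compared by cosine similarity (assumes konlpy and scikit-learn installed).
model = Doc2Vec()
docs = [u'한국어 형태소 분석은 재미있다', u'형태소 분석기는 명사를 추출한다']
doc_nouns = [' '.join(model.get_nouns(d)) for d in docs]
model.fit_doc2vec(doc_nouns)
print(model.get_score(doc_nouns[0], doc_nouns[1]))  # 1x1 matrix holding the cosine similarity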
def find_most_mentioned(self):
    """
    Count how many times each person is mentioned in the conversations.
    :return: list of (name, mention count) tuples, most common first
    """
    self._rewind()
    # get all conversations
    all_conversations = self.get_all_conversations()

    # word parser objects
    mecab = Mecab()
    kkma = Kkma()

    # parse all conversation words, and keep only the nouns
    all_nouns = list()
    for conversation in all_conversations:
        all_nouns += mecab.nouns(conversation)

    # exclude the family name from each name
    names_list = list()
    for name in self.get_all_names():
        preprocessed_name = kkma.nouns(name)
        for data in preprocessed_name:
            if len(data) != 1:
                names_list.append(data)

    # keep only the nouns that are names
    mentioned_people = [
        person for person in all_nouns if person in names_list
    ]

    # count using Counter and return
    cnt = Counter(mentioned_people)
    return cnt.most_common(len(cnt))
def konlpy():
    # value = request.form['konlpy_tag']
    value = request.args.get('search')
    kkma = Kkma()
    a = kkma.pos(value)
    noun = kkma.nouns(value)
    word = []
    pos = []
    for i in a:
        word.append(i[0])
        pos.append(i[1])
    print(noun)
    print(word)
    print(pos)
    result = {'word': word, 'pos': pos, 'noun': noun}
    print(result)
    return result
def run_kkma():
    kkma = Kkma()
    start_time = time.time()
    print('kkma start')
    kkma_morphs = kkma.morphs(news1)
    kkma_nouns = kkma.nouns(news1)
    kkma_pos = kkma.pos(news1)
    end_time = time.time()
    print('kkma done - %s s' % str(end_time - start_time))
    kkma_sentences = kkma.sentences(news1)

    with open('kkma.txt', 'w', encoding='utf-8') as fstream:
        fstream.write('kkma time : %s s\n' % str(end_time - start_time))
        fstream.write('kkma_morphs\n')
        write_list(kkma_morphs, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_nouns\n')
        write_list(kkma_nouns, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_pos\n')
        write_pos(kkma_pos, fstream)
        fstream.write('\n\n')
        fstream.write('kkma_sentences\n')
        write_list(kkma_sentences, fstream)
        fstream.write('\n')
def parse_data(self, label, h, i):
    Y = self.y_vocab.get(label)
    if Y is None and self.div in ['dev', 'test']:
        Y = 0
    if Y is None and self.div != 'test':
        return [None] * 2
    product = h['product'][i]
    image = h['img_feat'][i]
    bcate = h['bcateid'][i]
    mcate = h['mcateid'][i]
    scate = h['scateid'][i]
    dcate = h['dcateid'][i]
    maker = h['maker'][i]
    brand = h['brand'][i]
    price = h['price'][i]
    model = h['model'][i]

    # Append brand and maker unless they hold placeholder values
    # ('참조' reference, '기타' other, '없음' none, '미분류' unclassified)
    if not (('참조' in brand) or ('기타' in brand) or ('없음' in brand) or ('미분류' in brand)):
        product += ' ' + brand
    if not (('참조' in maker) or ('기타' in maker) or ('없음' in maker) or ('미분류' in maker)):
        product += ' ' + maker
    print(product)

    # Append the nouns extracted from the product text
    kkma = Kkma()
    p_model = kkma.nouns(product)
    for pStr in p_model:
        product += ' ' + pStr

    product = re_sc.sub(' ', product).strip().split()
    words = [w.strip() for w in product]
    words = [w for w in words
             if len(w) >= opt.min_word_length and len(w) < opt.max_word_length]
    if not words:
        return [None] * 2

    # Hash each word into the unigram space for the requested category type
    if cate_type == 'bm':
        x = [hash(w) % opt.bm_unigram_hash_size + 1 for w in words]
    elif cate_type == 's':
        x = [hash(w) % opt.s_unigram_hash_size + 1 for w in words]
    elif cate_type == 'd':
        x = [hash(w) % opt.d_unigram_hash_size + 1 for w in words]

    xv = Counter(x).most_common(opt.max_len)
    x = np.zeros(opt.max_len, dtype=np.float32)
    v = np.zeros(opt.max_len, dtype=np.int32)
    # The original reused `i` here, shadowing the row-index parameter
    for j in range(len(xv)):
        x[j] = xv[j][0]
        v[j] = xv[j][1]
    return Y, (x, v, image, bcate, mcate, scate, dcate)
def wordcloud(news_content_list, page_info_list, img_url):
    for i in range(len(news_content_list)):
        try:
            kkma = Kkma()
            tokens_ko = kkma.nouns(news_content_list[i])
            ko = nltk.Text(tokens_ko, name=page_info_list[i][0])
            data = ko.vocab().most_common(100)
            tmp_data = dict(data)
            korea_coloring = np.array(Image.open(r"C:\Study\project_idol\web_crawl\Korea.png"))
            image_colors = ImageColorGenerator(korea_coloring)
            wordcloud = WordCloud(font_path=r'c:\windows\fonts\NanumGothic.ttf',
                                  relative_scaling=0.1,
                                  mask=korea_coloring,
                                  background_color='black',
                                  min_font_size=4,
                                  max_font_size=40,
                                  ).generate_from_frequencies(tmp_data)
            plt.figure(figsize=(12, 12))
            plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
            # Strip characters that are not allowed in file names
            title = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', page_info_list[i][1])
            wordcloud.to_file(r"C:\Study\project_idol\static\wordcloud" + "\\" + title + ".png")
            plt.axis("off")
            plt.cla()
            # Equivalent to the original "\/" plus re.sub('/', '') dance
            url = "wordcloud\\" + title + ".png"
            img_url.append(url)
            # plt.show()
        except Exception as e:
            no_img = "wordcloud\\noimage.jpg"
            img_url.append(no_img)
            print("exception is ", e)

# Commented-out driver kept from the original:
# input = input('검색할 단어 > ')
# page_info_list = []
# url_list = []
# title_list = []
# news_content_list = []
# img_url = []
# value = []
# content_summarize_list = []
# call_and_print(input, 1, page_info_list)
# for i in range(len(page_info_list)):
#     url_list.append(page_info_list[i][1])
# print(url_list)
# make_content(url_list, news_content_list, content_summarize_list, title_list)
# wordcloud(news_content_list, page_info_list, img_url)
# for i in range(len(title_list)):
#     values = (title_list[i], url_list[i], img_url[i], content_summarize_list[i])
#     value.append(values)
# print(value)
# print(page_info_list)
# print(title_list)
from konlpy.tag import Kkma


class Analyze:
    def __init__(self, string):
        self.string = u"%s" % string
        self.kkma = Kkma()

    def parse_phrase_to_morphemes(self):
        return self.kkma.morphs(self.string)

    def noun_extractor(self):
        return self.kkma.nouns(self.string)
def get_tags(text, ntags=50, multiplier=30):
    # Adjust `multiplier` to control the font size
    # h = Hannanum()
    r = lambda: random.randint(0, 255)
    color = lambda: (r(), r(), r())
    h = Kkma()
    # Under Python 3, `text` is already unicode, so the original Python 2
    # decode step `text = unicode(text, 'utf-8')` is dropped
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'color': color(), 'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
from collections import Counter

from konlpy.tag import Kkma


def get_tags(text, ntags=50):
    # Frequency-count helper
    spliter = Kkma()
    nouns = spliter.nouns(text)   # extract the nouns from `text`
    count = Counter(nouns)        # count each noun's frequency
    return_list = []
    for n, c in count.most_common(ntags):
        return_list.append({'tag': n, 'count': c})
    return return_list
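# Example call for get_tags() above, assuming konlpy is installed; the counts
# depend on how Kkma segments the text.
tags = get_tags(u'파이썬으로 한국어 텍스트를 분석한다. 파이썬은 편리하다.', ntags=5)
print(tags)  # e.g. [{'tag': '파이썬', 'count': 2}, ...]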
def _chat_with_mybot(text):
    # Keyword lists used for intent matching
    words = {
        'jobs': [],
        'names': ['이름', '성함'],
        'age': ['나이', '살', '쌀'],
        'question': ['취업정보', '질문'],
        'greetings': ['안녕', '반가워', '하이', '방가']
    }
    jobs = _read_file_job_category('job_category.txt')
    anw = []
    check = True
    if text == '<@UL9K54M32>':
        anw.append(u'안녕나는 챗봇이얌~~!! 취업정보를 알려주는 봇이얌 ^_^ 난 한국말만 알아들어~!!')
        return u'\n'.join(anw)

    kkma = Kkma()
    keywords = kkma.nouns(text)
    words['jobs'] = list(jobs.keys())
    # print(keywords)  # keywords extracted from the user input, for debugging

    for i in range(len(words['greetings'])):
        if words['greetings'][i] in keywords:
            anw.append('안녕~ 나도' + words['greetings'][i] + '^_^\n')
            check = False
    for i in range(len(words['names'])):
        if words['names'][i] in keywords:
            anw.append('내 이름은 봇이얌 봇봇봇~!!\n')
            check = False
    for i in range(len(words['question'])):
        if words['question'][i] in keywords:
            anw.append(words['question'][i] + '??' + '알았어~~\n')
            check = False
    for i in range(len(words['age'])):
        if words['age'][i] in keywords:
            anw.append('내 나이는 20살이야~ 아주 젊지 ^_*\n')
            check = False
    for i in range(len(words['jobs'])):
        if words['jobs'][i] in keywords:
            # Return the crawled job listings for the matched category
            _jobs = crawl._crawl_newbie_info(jobs[words['jobs'][i]])
            return _jobs
    if check:
        anw.append('뭐라는 거야 ~~ -3- 그건 몰라~\n')
    return u'\n'.join(anw)
import re
import itertools

from konlpy.tag import Kkma


class input_preprocessing:
    def __init__(self):
        self.textfile = 'url_text.txt'
        self.kkma = Kkma()

    def get_text(self):
        # The text is read from a file because it could not be pasted in
        # directly; later this can simply start from a plain string.
        with open('.\\Dataset\\' + self.textfile, 'rb') as f:
            tmp = f.read().decode('utf-8')
        return tmp

    def extractkor(self, _s):
        # Keep only Hangul characters and spaces, then split into words
        try:
            hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
            result = hangul.sub('', _s)
            result2 = result.split(' ')
            result2 = [item for item in result2 if item not in (' ', '')]
            return result2
        except Exception as error:
            # print("error!", error)
            return 1  # kept from the original: 1 signals failure

    def splitkor_kornouns(self, _list):
        # Extract the nouns from each word and flatten the result
        _temp_korbag = []
        try:
            # for item in tqdm(_list, ascii=True, desc='noun extraction'):
            for item in _list:
                tplist = self.kkma.nouns(str(item))
                _temp_korbag.append(tplist)
        except Exception as error:
            print("loop error", error)
        _temp_korbag = list(itertools.chain(*_temp_korbag))
        return _temp_korbag

    def length_join(self, _bodytext):
        # Drop words shorter than two characters, then re-join into one string
        junk = [word for word in _bodytext if len(word) < 2]
        for word2 in junk:
            try:
                _bodytext.remove(word2)
            except ValueError:
                pass
        return [" ".join(_bodytext)]
def newpost(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        qu = request.POST.get('text')  # the submitted sentences
        sentList = []

        kkma = Kkma()
        texts = kkma.sentences(qu)
        for text in texts:
            text = text.strip("\n")
            print('start initializing : ', text)
            kkma = Kkma()
            kkma.nouns('initializing')  # warm-up call before analysis
            # Start the morphological analysis
            answer, rowVerb, rightVerb, subj = start(text)
            temp = [answer, rowVerb, rightVerb, subj]
            sentList.append(temp)
        # e.g. [['주문하신 커피가 ', '나오셨습니다.', '나왔습니다']]

        if form.is_valid():
            return render(request, 'elections/output.html', {'sentList': sentList})
    else:
        form = PostForm()
    return render(request, 'elections/index.html', {'form': form})
class AnalysisDiction:
    """
    This class analyzes Korean text using the kkma and twitter dictionaries.
    """
    def __init__(self, on_kkma=False, on_twitter=False):  # maybe move to init of analysis_app
        """
        Allocate a kkma or twitter dictionary instance.
        :param on_kkma: whether to create a kkma instance
        :param on_twitter: whether to create a twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        Analyze string data with kkma; behavior depends on the mode.
        :param string_data: string to analyze
        :param mode: one of 'morphs', 'nouns', 'pos'
        :return: the analysis result, or False for an unknown mode
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        # Modes are compared with ==; the original used `is`, which is not a
        # reliable string comparison
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        Analyze string data with twitter; behavior depends on the mode.
        :param string_data: string to analyze
        :param mode: one of 'morphs', 'nouns', 'pos', 'posmore'
        :return: the analysis result, or False for an unknown mode
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
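# Hypothetical usage of AnalysisDiction above (assumes konlpy is installed):
# a known mode returns the analysis, any other mode returns False.
analyzer = AnalysisDiction(on_kkma=True)
print(analyzer.analyzer_kkma(u'저는 대학교에 다닙니다.', 'nouns'))
print(analyzer.analyzer_kkma(u'저는 대학교에 다닙니다.', 'tags'))  # False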
import numpy as np
from konlpy.tag import Kkma


def summarize(contents, radius, k):
    kkma = Kkma()
    # Hash every noun in the document to build the feature space
    nouns = np.array([hash(elem) for elem in kkma.nouns(contents)], np.int64)
    size = nouns.size
    points = np.zeros([1000, size], int)

    # One row per sentence: counts of each document noun in that sentence
    n_sent = 0
    sentences = contents.replace('\n', '').split('.')
    for sentence in sentences:
        hash_words = np.array([hash(elem) for elem in kkma.nouns(sentence)])
        for j in range(size):
            points[n_sent][j] = np.count_nonzero(hash_words == nouns[j])
        n_sent += 1

    # DBSCAN-style grouping: a sentence with at least `k` neighbors within
    # `radius` pulls those neighbors into its group. The original shadowed the
    # loop variable and only compared against earlier sentences; all pairs are
    # compared here.
    group = np.random.randn(n_sent)
    for i in range(n_sent):
        arr = []
        for j in range(n_sent):
            if i != j:
                distance = np.sum((points[i] - points[j]) ** 2) ** 0.5
                if distance <= radius:
                    arr.append(j)
        if len(arr) >= k:
            for elem in arr:
                group[elem] = group[i]

    # Keep the first sentence from each group as the summary
    result = []
    for elem in np.unique(group):
        try:
            result.append(sentences[np.where(group == elem)[0][0]])
        except IndexError:
            pass
    return result
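# Hypothetical call for summarize() above: sentences whose noun-count vectors
# lie within Euclidean distance `radius` of at least `k` neighbors are merged
# into one group, and one sentence is kept per group.
article_text = u'오늘 주가가 올랐다. 주가 상승의 원인은 수출 증가이다. 내일은 전국에 비가 온다.'
print(summarize(article_text, radius=2, k=1))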
def insert_qna_content(dbconn, cursor):
    file_path = 'ap_qna_set2.xlsx'
    load_wb = load_workbook(file_path, data_only=True)
    load_ws = load_wb['chat4']

    all_values = []
    for row in load_ws.rows:
        row_value = [cell.value for cell in row]
        all_values.append(row_value)

    kkma = Kkma()
    for idx, values in enumerate(all_values):
        if idx == 0:
            continue  # skip the header row
        q_text = remove_sc(values[0])
        a_text = remove_sc(values[1])
        intent = values[2]
        img_src = values[3]
        entities = values[4]
        intent_tag = values[5]
        a_type = values[6]
        home_btn = values[7]

        # Store the question's nouns alongside the question text
        q_nouns = kkma.nouns(q_text)
        q_nouns = str(q_nouns) if len(q_nouns) > 0 else '[]'
        print(q_nouns)

        try:
            cursor.execute(f"""
                INSERT IGNORE INTO TEST_TBL_AP_QNA_CHAT_SET_LIST (
                    Q_TEXT, INTENT, A_TYPE, A_TEXT, PREV_BTN, HOME_BTN,
                    Q_MORPHEMES, IMG_SRC, ENTITIES, INTENT_TAG, UPDATE_DATE
                ) VALUES (
                    "{q_text}", "{intent}", "{a_type}", "{a_text}", 3, {home_btn},
                    "{q_nouns}", "{img_src}", "{entities}", "{intent_tag}", NOW()
                )
            """)
        except Exception as e:
            print(f'error! >> insert_qna_content >> {e}')
        finally:
            print(f'[{idx}/{len(all_values)}'
                  f'({round((idx / len(all_values) * 100), 2)}%)] complete!!')
            dbconn.commit()
            time.sleep(0.1)
def make_tags(words):
    result_tag = ""
    try:
        kkma = Kkma()
        nouns = kkma.nouns(words)
        # Join the nouns as "#noun, #noun, ..." hashtags
        for index, n in enumerate(nouns):
            if index > 0:
                result_tag = result_tag + ", "
            result_tag = result_tag + ("#" + n)
    except Exception as e:
        error_print(e, "make_tags", "", "M")
    finally:
        return result_tag
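# Example call for make_tags() above, assuming konlpy is installed; every
# extracted noun becomes a comma-separated hashtag.
print(make_tags(u'오늘 맛있는 커피를 마셨다'))  # e.g. "#오늘, #커피"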
def ceshi(i):
    # Create the database connection
    conn = pymysql.connect(
        host='gujiakai.softether.net',  # host of the database server
        user='******',                  # database user name
        password='******',              # database password
        database='library',             # database name; can also be selected later with cursor.execute('use test_db')
        charset='utf8mb4'               # charset; note it must not be written as 'utf-8'
    )
    cursor = conn.cursor()
    t = Kkma()
    cursor.execute("select introduction from book_info where book_id=%s", i)
    res = cursor.fetchall()
    # Stringify the result rows before noun extraction
    res = str(res)
    nouns = t.nouns(res)
    return nouns
def chat_response():
    kkma = Kkma()
    query = request.args.get('q')
    response = None
    query_noun = kkma.nouns(str(query))
    print(query_noun)
    # The original wrote `"교열" and "가격" in query_noun`, which only tests the
    # second keyword; both keywords are now checked explicitly.
    if "안녕" in query_noun:
        response = '안녕하세요!'
    elif "교열" in query_noun and "가격" in query_noun:
        response = "3장 기준으로 5천원입니다.\n3장 이상의 경우에는 8천원입니다."
    elif "교열" in query_noun and "언어" in query_noun:
        response = "현재는 중국학생들을 대상으로 한국어 문장을 교열하고 있습니다.\n영어 보고서 교열 서비스도 준비중에 있습니다."
    elif "교열" in query_noun and '얼마' in query_noun:
        response = "3장 기준으로 5천원입니다.\n3장 이상의 경우에는 8천원입니다."
    elif "교열" in query_noun and "견적" in query_noun:
        response = "3장 기준으로 5천원입니다.\n3장 이상의 경우에는 8천원입니다."
    elif "교열가" in query_noun:
        response = "교열가 정보는 실제 의뢰를 해주신 경우에 매칭되는 시스템입니다."
    elif "비밀" in query_noun:
        response = "물론입니다.\n교열이 완료된 문서에 대해 2주간 보관후 폐기를 원칙으로 하고 있습니다."
    elif "감사" in query_noun:
        response = "저희 바오가오 서비스를 이용해 주셔서 감사합니다.\n더욱 완성도 높은 서비스로 보답하겠습니다."
    elif "결제" in query_noun:
        response = "6월 1일까지는 무료로 이용이 가능합니다.\n유료화 이후에는 신용카드와 체크카드로 결제가 가능합니다."
    elif "교정" in query_noun and "번역" in query_noun and "차이" in query_noun:
        response = "번역은 원문만 제공해주시면 번역요청언어로 번역 서비스를 제공합니다.\n교정은 스스로 문장을"
    return json.jsonify(response=response)
def _mk_word_cloud_korean(self):
    target = ' '.join(self.text)
    kkma = Kkma()
    n = kkma.nouns(target)
    # Drop single-character nouns and pure numbers
    n = [temp for temp in n if len(temp) != 1 and not temp.isdecimal()]
    text = nltk.Text(n)
    data = text.vocab()
    data500 = data.most_common(500)
    dic = dict(data500)
    # Make the word cloud object
    wc = WordCloud(font_path='/Library/Fonts/Arial Unicode.ttf',
                   max_font_size=80,
                   min_font_size=10,
                   background_color=self.color,
                   mask=self.mask)
    self.wordcloud = wc.generate_from_frequencies(dic)
def process(info, sim):
    # Similarity sensitivity of the input; smaller values make unrelated
    # results more likely, so keep it at 50 or above
    input_sim = sim
    input_text = info
    input_text_list = input_text.split(' ')  # split the input on spaces
    # The original character class 'a-zA-z' also matched punctuation; fixed
    eng_text = re.sub('[^a-zA-Z]', ' ', input_text).strip()

    # Kkma can emit overlapping substrings for one word, which must be removed
    kkma = Kkma()
    copy = []
    for txt in input_text_list:
        txt_ = kkma.nouns(txt)
        if len(txt_) > 1:  # the noun was split into pieces
            max_string = max(txt_, key=len)
            txt_.remove(max_string)  # drop the longest value (the duplicate)
        copy += txt_

    if len(copy) > 3:
        # When kkma splits 'abc' into a, ab, abc, b, c: 'abc' was dropped
        # above, and this step drops 'ab'
        del_list = []
        for i in range(math.ceil(len(copy) - 2)):
            overlap_txt = ''.join(itemgetter(i, i + 2)(copy))
            if overlap_txt in copy:
                del_list.append(overlap_txt)
        # Order-preserving set difference
        [i for i in del_list if not i in copy or copy.remove(i)]

    text = ' '.join(copy)
    if input_sim > 45:
        # Appending ',' forces the text to be treated as a company
        # description, which yields more restrictive slogans
        text += ','

    # If an English slogan is present, fall back to the original input
    if eng_text:
        if eng_text in input_text:
            text = input_text
    return text
def insert_qna_content(dbconn, cursor):
    file_path = 'qna_set.xlsx'
    load_wb = load_workbook(file_path, data_only=True)
    load_ws = load_wb['chat4']

    all_values = []
    for row in load_ws.rows:
        row_value = [cell.value for cell in row]
        all_values.append(row_value)

    kkma = Kkma()
    for idx, values in enumerate(all_values):
        q_text = values[0]
        a_text = values[1]
        q_type = values[2]

        # Store the question's nouns alongside the question text
        q_nouns = kkma.nouns(q_text)
        q_nouns = str(q_nouns) if len(q_nouns) > 0 else '[]'
        print(q_nouns)

        try:
            cursor.execute(f"""
                INSERT IGNORE INTO TBL_QNA_CHAT_SET_LIST (
                    Q_TEXT, A_TEXT, Q_MORPHEMES, Q_TYPE, UPDATE_DATE
                ) VALUES (
                    "{q_text}", "{a_text}", "{q_nouns}", {q_type}, NOW()
                )
            """)
        except Exception as e:
            print(f'error! >> insert_qna_content >> {e}')
        finally:
            print(f'[{idx}/{len(all_values)}'
                  f'({round((idx / len(all_values) * 100), 2)}%)] complete!!')
            dbconn.commit()
            time.sleep(0.1)
def getKeywords(src):
    kkma = Kkma()
    words = list(set(kkma.nouns(src)))
    words_calc = []
    for word in words:
        # Skip pure numbers, words containing digits, and anything with '서울'
        if not word.isdigit() and u'서울' not in word and re.match(r'(.*)?\d+(.*)?', word) is None:
            word_count = src.count(word)
            # Weight the frequency by the log of the word's length
            word_idf = word_count * math.log(len(word))
            if word_idf > 1:
                words_calc.append((word, word_idf))
    words_sort = sorted(words_calc, key=lambda w: w[1], reverse=True)
    words_real = [word for word, _ in words_sort]
    print(" / ".join(words_real[:5]))
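# Worked example of the weighting above: frequency times the log of the word's
# length, so at similar counts longer words rank higher.
import math
print(3 * math.log(2))  # a length-2 word seen 3 times scores ~2.079
print(2 * math.log(3))  # a length-3 word seen 2 times scores ~2.197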
import os


def SortNoun(filename):
    # Read the file as UTF-8
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read()

    # Tag the nouns
    from konlpy.tag import Kkma
    kkma = Kkma()
    print('now extracting nouns...')
    tagged = kkma.nouns(text)

    # Write the tagged file next to the input
    (path, fnameExt) = os.path.split(filename)
    (fname, fext) = os.path.splitext(fnameExt)
    tagged_file = fname + '_' + 'noun' + fext
    with open(tagged_file, 'w', encoding='utf-8') as fw:
        for line in tagged:
            fw.write(line + "\n")
    print('%s is created' % tagged_file)
class Crawler:
    def __init__(self):
        self.kkma = Kkma()
        self.conn = sqlite3.connect('yebi.db')
        self.cursor = self.conn.cursor()
        self.count = 20
        # The Python 2 reload(sys)/sys.setdefaultencoding('utf-8') hack is
        # unnecessary under Python 3 and has been dropped

    def do(self):
        print('Scanning the Twitter timeline.')
        for x in TwitterFetcher().get_time_line(self.count):
            user_id = x['user']['id']
            print('')
            print('=' * 80)
            print('... @%s: %s' % (x['user']['name'], x['text']))
            t = (user_id, )
            self.cursor.execute('select count(*) from users where id=?', t)
            count_user = self.cursor.fetchone()[0]
            if count_user == 0:  # the user is not in the DB yet
                name = x['user']['name']
                screen_name = x['user']['screen_name']
                profile_image = x['user']['profile_image_url_https']
                t = (user_id, name, screen_name, profile_image)
                self.cursor.execute('insert into users values(?, ?, ?, ?)', t)
                self.conn.commit()
                print("... adding user %s to the users DB" % x['user']['name'])

            i = 1
            tweet_id = x['id']
            t = (tweet_id, )
            self.cursor.execute('select count(*) from tweets where id=?', t)
            count_tweets = self.cursor.fetchone()[0]
            print("... searching the tweets DB")
            if count_tweets == 0:
                print("... not in the DB yet.")
                text = x['text']
                created_at = x['created_at']
                t = (tweet_id, text, created_at, user_id)
                self.cursor.execute('insert into tweets values(?, ?, ?, ?)', t)
                self.conn.commit()
                print('... adding %s' % x['text'])
                for n in self.kkma.nouns(x['text']):
                    t = (user_id, n)
                    self.cursor.execute('select count from user_nouns where user_id=? and noun=?', t)
                    count_noun = self.cursor.fetchone()
                    screen_name = x['user']['screen_name']
                    if count_noun is not None:
                        print('... %s has used the noun "%s" %d times.' %
                              (screen_name, n, count_noun[0]))
                        self.cursor.execute('update user_nouns set count=count+1 where user_id=? and noun=?', t)
                    else:
                        print('... %s used the noun "%s" for the first time.' % (screen_name, n))
                        self.cursor.execute('insert into user_nouns values(?, ?, 1)', t)
            else:
                print("... already in the DB. (Nouns will still be analyzed.)")
                for n in self.kkma.nouns(x['text']):
                    t = (user_id, n)
                    self.cursor.execute('select count from user_nouns where user_id=? and noun=?', t)
                    count_noun = self.cursor.fetchone()
                    screen_name = x['user']['screen_name']
                    if count_noun is not None:
                        print('... %s has used the noun "%s" %d times.' %
                              (screen_name, n, count_noun[0]))
            i += 1
# -*- coding: utf-8 -*-
__author__ = 'woojin'

from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences('네, 안녕하세요. 반갑습니다.'))
pprint(kkma.nouns('질문이나 건의사항은 깃허브 이슈 트래커에 남겨주세요.'))
pprint(kkma.pos('오류보고는 실행환경, 에러메시지와 함께 설명을 최대한 상세히!!^^'))
for line in data:
    i += 1
    if line[8].strip().isdigit():
        obj = {
            'name': line[7].strip(),
            'sum': int(line[8].strip()) * 1000,
            'categories': [
                line[2].strip(), line[3].strip(),
                line[4].strip(), line[5].strip()
            ]
        }
        # Under Python 3, csv fields are already str, so the original
        # Python 2 decode/encode round-trip is dropped
        obj['words'] = kkma.nouns(line[7].strip())
        print(str(i) + ' / 4014')
        output.append(obj)

jsonfile = open('./data/services.json', 'w')
json.dump(output, jsonfile)
print('--- finished ---')
kkma = Kkma()  # moved up: the original used kkma before defining it

# Accumulate noun counts over every review sentence
c = Counter()
poets = get_reviews()
for poet in poets:
    sentences = poet.split('\n')
    for sentence in sentences:
        try:
            c += Counter(kkma.nouns(sentence))
        except Exception:
            pass

# poets = get_poets()
poets = get_reviews()
for idx, poet in enumerate(poets):
    # Keep only the nouns that are known tags
    tags = []
    for noun in kkma.nouns(poet):
        if noun in TAGS:
            tags.append(noun)
    # Use a SHA-1 hash of the text to avoid inserting duplicates
    hash_object = hashlib.sha1(poet.encode('utf-8', 'ignore'))
    hex_dig = hash_object.hexdigest()
    results = collection.find_one({'hex': hex_dig})
    if not results:
        document = {'text': poet, 'index': idx, 'tags': tags,
                    'hex': hex_dig, 'like': 0,
                    'date': datetime.datetime.utcnow()}
        collection.insert(document)
from collections import Counter

from konlpy.tag import Kkma


def get_tags(text, ntags=40, multiplier=1):
    # `multiplier` is unused but kept for signature compatibility
    h = Kkma()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'tag': n, 'count': c}
            for n, c in count.most_common(ntags)]
def excel_noun():
    def excel_write(row_val, column_val, data):
        new_sheet.cell(row=row_val, column=column_val, value="%s" % data)

    wb = load_workbook('reference.xlsx')
    sheet = wb[wb.sheetnames[0]]      # modern openpyxl API
    row_count = sheet.max_row
    new_sheet = wb.create_sheet(title='extraction')
    kkma = Kkma()  # hoisted: the original re-created Kkma on every row

    for i in range(2, row_count):
        # Skip hidden rows, and hide them in the output sheet as well
        if not sheet.row_dimensions[i].visible:
            excel_write(i, 1, '')
            new_sheet.row_dimensions[i].hidden = True
            # new_sheet.row_dimensions[i].outlineLevel = 1
            continue

        noun_val = ""
        full_qua = ""
        cellValue_name = sheet.cell(row=i, column=1).value
        cellValue = sheet.cell(row=i, column=2).value
        try:
            QUA = cellValue.count(u'\u201c')  # left double quotation mark
        except Exception:
            continue
        # str.count() never returns -1, so the original `if QUA != -1`
        # guard always passed and is dropped

        if QUA == 1:
            # Take the text between the first left and right quotation marks
            START_QUA = cellValue.find(u"\u201c") + 1
            cellValue_re = cellValue[START_QUA:len(cellValue)]
            END_QUA = cellValue_re.find(u"\u201d")
            cellValue_final = cellValue_re[0:END_QUA]
            print(str(i) + " " + cellValue_name + " " + cellValue_final)
            s = kkma.nouns(cellValue_final)
            for j in range(len(s)):
                noun_val = noun_val + s[j] + ','
            excel_write(i, 1, cellValue_name)
            excel_write(i, 2, cellValue_final)
            excel_write(i, 3, noun_val)
        elif QUA == 0:
            # Fall back to plain double quotes
            ANOTHER_QUA = cellValue.find("\"") + 1
            another_cellValue = cellValue[ANOTHER_QUA:len(cellValue)]
            ANOTHER_END_QUA = another_cellValue.find("\"")
            another_cellValue_final = another_cellValue[0:ANOTHER_END_QUA]
            s = kkma.nouns(another_cellValue_final)
            for j in range(len(s)):
                noun_val = noun_val + s[j] + ','
            excel_write(i, 1, cellValue_name)
            excel_write(i, 2, another_cellValue_final)
            excel_write(i, 3, noun_val)
        elif QUA > 1:
            # Several quoted spans: join the text inside each pair
            for q in range(0, QUA):
                arr = cellValue.split(u"\u201d")
                arr_start_qua = arr[q].find(u"\u201c") + 1
                arr_cellValue = arr[q][arr_start_qua:len(arr[q])]
                full_qua = full_qua + arr_cellValue
                s = kkma.nouns(arr_cellValue)
                for j in range(len(s)):
                    noun_val = noun_val + s[j] + ','
            excel_write(i, 1, cellValue_name)
            excel_write(i, 2, full_qua)
            excel_write(i, 3, noun_val)

    wb.save('reference.xlsx')
import time

import zmq
from konlpy.tag import Kkma
from konlpy.utils import pprint
from multiprocessing import Pool  # unused in this excerpt, kept from the original

kkma = Kkma()

port = 46000
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind('tcp://127.0.0.1:%s' % port)

while True:
    print('in the loop')
    # Wait for the next request from a client; recv() returns bytes under
    # Python 3, so decode before analysis
    message = socket.recv().decode('utf-8')
    result = kkma.nouns(message)
    result = ', '.join(result)
    print('------')
    print(result)
    # socket.send() does not accept unicode, so use send_string
    socket.send_string(result)

"""
string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'
string2 = u'5학년되니까 학교근처엔 도저히 먹을게없다'
string3 = u'카이리스님께 사과문올립니다'
start = time.time()
pprint(kkma.nouns(string))
pprint(kkma.nouns(string2))
pprint(kkma.nouns(string3))
"""
# print(soup.prettify())
# print(soup)
newsbody = soup.find(id="articleBodyContents")
# print(newsbody.contents)

bodystr = ""
try:
    # Collect the article's plain text, skipping HTML comments
    for child in newsbody.children:
        if isinstance(child, NavigableString) and not isinstance(child, Comment):
            bodystr += child.string.strip()

    # Morphological analysis
    kkma = Kkma()
    # pprint(kkma.nouns(bodystr))
    # pprint(kkma.pos(bodystr))
    wordList = kkma.nouns(bodystr)

    print('k : ', k)
    if k == 0:
        # The first article becomes the held-out test document
        testEntry = wordList
        testIssueDate = issueDate
        testTitle = soup.title.string
        k = k + 1
    else:
        # Label the article by whether the closing price ('종가') rose or
        # fell around the article's issue date ('날짜' is the date column)
        if (int(df[df['날짜'] >= issueDate].tail(1)['종가']) >
                int(df[df['날짜'] < issueDate].head(1)['종가'])):
            print('up')
            docList.append(wordList)
            classList.append(1)
        else:
            if (int(df[df['날짜'] >= issueDate].tail(1)['종가']) <
                    int(df[df['날짜'] < issueDate].head(1)['종가'])):
                print('down')
                docList.append(wordList)
# NOTE: the excerpt ends here; the matching except clause (and the 'down'
# class label) lie outside it
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
pprint(kkma.sentences(u'네, 안녕하세요. 의류매장 입니다'))
pprint(kkma.nouns(u'구입하실 물건 있으시면 말씀해주세요.'))
pprint(kkma.pos(u'하하하 즐거운 쇼핑입니다.'))
from konlpy.tag import Kkma
from konlpy.utils import pprint

kkma = Kkma()
string = u'안녕하세요. 건국대학교에 오신걸 환영합니다. 도서관은 우측에 있습니다.'
pprint(kkma.nouns(string))
def excel_noun():
    def excel_write(row_val, column_val, data):
        new_sheet.cell(row=row_val, column=column_val, value="%s" % data)

    wb = load_workbook(REFERENCE_EXCEL)
    sheet = wb[wb.sheetnames[0]]      # modern openpyxl API
    row_count = sheet.max_row
    new_sheet = wb.create_sheet(title='extraction')
    news_info = {}
    kkma = Kkma()  # hoisted: the original re-created Kkma on every row

    for i in range(1, row_count):
        noun_val = ""
        full_qua = ""
        cellValue_name = sheet.cell(row=i, column=1).value
        cellValue = sheet.cell(row=i, column=2).value
        cellValue_id = sheet.cell(row=i, column=3).value
        # u201c 'LEFT DOUBLE QUOTATION MARK'
        # u201d 'RIGHT DOUBLE QUOTATION MARK'
        try:
            QUA = cellValue.count(u'\u201c')
        except Exception:
            continue
        # str.count() never returns -1, so the original `if QUA != -1`
        # guard always passed and is dropped

        if QUA == 1:
            # Take the text between the first left and right quotation marks
            START_QUA = cellValue.find(u"\u201c") + 1
            cellValue_re = cellValue[START_QUA:len(cellValue)]
            END_QUA = cellValue_re.find(u"\u201d")
            cellValue_final = cellValue_re[0:END_QUA]
            s = kkma.nouns(cellValue_final)
            for j in range(len(s)):
                noun_val = noun_val + s[j] + ','
            news_tuple = (cellValue_name, cellValue, noun_val, cellValue_id)
            news_info[i] = {news_tuple}
            MyPrettyPrinter().pprint(news_info[i])
            excel_write(i, 1, cellValue_name)
            excel_write(i, 2, cellValue_final)
            excel_write(i, 3, noun_val)
            excel_write(i, 4, cellValue_id)
        elif QUA == 0:
            # Fall back to plain double quotes
            ANOTHER_QUA = cellValue.find("\"") + 1
            another_cellValue = cellValue[ANOTHER_QUA:len(cellValue)]
            ANOTHER_END_QUA = another_cellValue.find("\"")
            another_cellValue_final = another_cellValue[0:ANOTHER_END_QUA]
            s = kkma.nouns(another_cellValue_final)
            for j in range(len(s)):
                noun_val = noun_val + s[j] + ','
            news_tuple = (cellValue_name, cellValue, noun_val, cellValue_id)
            news_info[i] = {news_tuple}
            MyPrettyPrinter().pprint(news_info[i])
            excel_write(i, 1, cellValue_name)
            excel_write(i, 2, another_cellValue_final)
            excel_write(i, 3, noun_val)
            excel_write(i, 4, cellValue_id)
        elif QUA > 1:
            # Several quoted spans: join the text inside each pair
            for q in range(0, QUA):
                arr = cellValue.split(u"\u201d")
                try:
                    arr_start_qua = arr[q].find(u"\u201c") + 1
                except Exception:
                    continue
                arr_cellValue = arr[q][arr_start_qua:len(arr[q])]
                full_qua = full_qua + arr_cellValue
                s = kkma.nouns(arr_cellValue)
                for j in range(len(s)):
                    noun_val = noun_val + s[j] + ','
            news_tuple = (cellValue_name, cellValue, noun_val, cellValue_id)
            news_info[i] = {news_tuple}
            MyPrettyPrinter().pprint(news_info[i])
            excel_write(i, 1, cellValue_name)
            excel_write(i, 2, full_qua)
            excel_write(i, 3, noun_val)
            excel_write(i, 4, cellValue_id)

    wb.save(REFERENCE_EXCEL)
    nt.saveObjectBinaryFast(news_info, DICT_NEWS_INFO)
# This is a script to test KoNLPy.
# Project started 01/18/2016. Author: Jaehyun Ahn([email protected])
__author__ = 'Sogo'

from konlpy.tag import Kkma
from collections import Counter

print('Number of lines in document:')
k = Kkma()
f = open('test.txt', 'r')
lines = f.read().splitlines()
nlines = len(lines)
print(nlines)

# Extract nouns line by line, then count them all
nouns = [k.nouns(lines[i]) for i in range(0, nlines)]
cnt = Counter()
for i in range(len(nouns)):
    for j in range(len(nouns[i])):
        cnt[nouns[i][j]] += 1

print(cnt.most_common(15))
# let's get words! It's a steal!
print(cnt.most_common(15)[0][0])
print(cnt.most_common(15)[1])