def keyword_extract():
    okt = Okt()
    data = request.get_json()
    texts = data
    wordrank_extractor = KRWordRank(
        min_count=4,    # minimum word frequency (for graph construction)
        max_length=12,  # maximum word length
        verbose=True)
    beta = 0.5      # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
    if request.method == 'POST':
        word_list = []
        test = {}
        r_list = []
        for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
            # print('%8s:\t%.4f' % (word, r))
            word_list.append(word)
            r_list.append(r)
            test[word] = r
        # keep only the noun parts of each keyword, dropping empty results
        new_word_list = [' '.join(okt.nouns(word)) for word in test]
        while '' in new_word_list:
            new_word_list.remove('')
        print(new_word_list)
        print(test.keys())
        # print(test)
        # print(word_list)
        return json.dumps(test, ensure_ascii=False)
    return 'wordExtract'

def make_list():
    file_list = search('./Result_free/')  # just change the folder path as needed
    data = file_read(file_list)
    wordrank_extractor = KRWordRank(
        min_count=10,   # minimum word frequency (for graph construction)
        max_length=15,  # maximum word length
        verbose=True)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    text = []
    for content in data:
        text.append(content['text'])
        # text.append(content['text'])  # uncomment if the board posts have titles
        for comment in content['comment_text']:
            text.append(comment)
    keywords, rank, graph = wordrank_extractor.extract(text, beta, max_iter)
    return keywords, rank, graph

def krwordrank_noun(sentence_list=None, min_count=5, max_length=10,
                    beta=0.85, max_iter=10, verbose=False):
    # avoid a mutable default argument
    if sentence_list is None:
        sentence_list = []
    krword_rank_noun = []
    krword_rank_noun_score = {}
    wordrank_extractor = KRWordRank(min_count, max_length, verbose)
    try:
        keywords, rank, graph = wordrank_extractor.extract(
            sentence_list, beta, max_iter)
        for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True):
            # print(r, word)
            word = re.sub(r"\s+", " ", word)
            if len(word) > 1:
                # keep only words that contain no punctuation or symbols
                word_cleansing = re.sub(
                    r'[-=+,#/\?:^$.@*\"※~&%ㆍ!”』\\‘|\(\)\[\]\<\>`\'…》\^▶]',
                    '', word)
                if len(word_cleansing) == len(word):
                    krword_rank_noun.append(word)
                    krword_rank_noun_score[word] = r
        return sorted_dict(krword_rank_noun_score)
    except ValueError:
        # KRWordRank raises ValueError on an empty or too-small corpus
        return sorted_dict({})

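
# A minimal usage sketch for krwordrank_noun above, with hypothetical sample
# data; it assumes `re`, `KRWordRank`, and the project's `sorted_dict` helper
# are in scope. Input is a list of sentence strings, output a {word: score}
# mapping with symbol-containing words filtered out.
sample_sentences = [
    '영화 음악이 정말 좋았다',
    '영화 연기도 정말 좋았다',
    '음악 때문에 영화를 다시 봤다',
]
print(krwordrank_noun(sample_sentences, min_count=1, max_length=10))
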
def mapper(row: pd.Series):
    extractor = KRWordRank(
        min_count=7,    # minimum word occurrence
        max_length=15,  # maximum word length
        verbose=False,
    )
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    sentences = generate_input(row["articles"])
    try:
        score, rank, graph = extractor.extract(sentences, beta, max_iter)
        score = dict(filter(filter_stopwords, score.items()))
    except Exception as e:
        print(e)
        return None
    return {
        "date": os.path.splitext(_f)[0],  # _f: file name from the enclosing scope
        "press": row["press"],
        "category": row["category"],
        "size": len(" ".join(sentences).encode("utf8")),
        "score": score,
        "rank_size": len(rank),
        "graph_size": len(graph),
    }

def wordRank():
    # Retrieve text from Elasticsearch
    results = es.get(index='nkdb', doc_type='nkdb', id='5dc9fc5033ec463330e97e94')
    texts = json.dumps(results['_source'], ensure_ascii=False)
    # split the text into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', texts)
    # normalize the text
    texts = [normalize(text, number=True) for text in sentences]
    wordrank_extractor = KRWordRank(
        min_count=3,    # minimum frequency of word
        max_length=10,  # maximum length of word
        verbose=True)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
    # build a list of {"y": weight, "label": word} entries
    result = []
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
        result.append({"y": r, "label": word})
    return json.dumps(result, ensure_ascii=False)

def __get_keyword(splited_sentence, decay_factor, max_iteration_num):
    # decay_factor: probability that a word survives each iteration
    # (i.e. does not vanish from the graph); usually set to 0.85
    # max_iteration_num: maximum number of iterations
    try:
        pick_keyword = KRWordRank(min_count=4, max_length=10, verbose=True)
        keyword, _, _ = pick_keyword.extract(
            splited_sentence, decay_factor, max_iteration_num)  # extract keywords
    except ValueError:
        keyword = "NULL"
    return keyword

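
# Illustrative call for __get_keyword above (a hypothetical usage sketch):
# when KRWordRank.extract raises ValueError (e.g. the corpus is empty or too
# small to build a graph), the function returns the string "NULL" instead of
# a keyword dict, so callers should check for that sentinel.
result = __get_keyword(['너무 짧은 문장'], decay_factor=0.85, max_iteration_num=10)
if result == "NULL":
    print('no keywords extracted')
else:
    print(sorted(result.items(), key=lambda x: x[1], reverse=True)[:5])
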
def mkKeywords(texts):
    SIZE = 5
    extractor = KRWordRank(min_count=1, max_length=120)
    keywords, rank, graph = extractor.extract(texts, beta=0.85, max_iter=30)
    outputs = []
    for w, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True):
        outputs.append(w)
    # return json.dumps(dict(zip(range(SIZE), outputs[:SIZE])), ensure_ascii=False)
    return dict(zip(range(SIZE), outputs[:SIZE]))

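
# Hypothetical usage sketch for mkKeywords above: the input must be a list of
# sentence strings (KRWordRank's expected corpus format); with min_count=1,
# even words that occur only once enter the graph.
reviews = [
    '연기가 정말 좋았다',
    '음악도 좋았고 연기도 좋았다',
    '스토리는 아쉽지만 연기는 최고였다',
]
top5 = mkKeywords(reviews)  # {0: <best keyword>, 1: ..., up to 4}
print(top5)
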
def test_keyword(test_config):
    data_path = test_config['data_path']
    with open(data_path, encoding='utf-8') as f:
        texts = [line.rsplit('\t')[0].strip() for line in f]
    wordrank_extractor = KRWordRank(min_count=5, max_length=10)
    keywords, rank, graph = wordrank_extractor.extract(texts, beta=0.85, max_iter=10)
    selected_keywords = [
        word for word, r in sorted(
            keywords.items(), key=lambda x: x[1], reverse=True)[:30]
    ]
    assert selected_keywords[:5] == ['영화', '너무', '정말', '음악', '마지막']
    print('\nKR-WordRank 라라랜드 영화 리뷰 30 개 키워드\n{}\n'.format(selected_keywords))

def __init__(self, min_count=2, max_length=10, beta=0.85, max_iter=10,
             verbose=True, num_words=20):
    self.min_count = min_count
    self.max_length = max_length
    self.beta = beta
    self.max_iter = max_iter
    self.verbose = verbose
    self.num_words = num_words
    self.inst = KRWordRank(min_count, max_length, self.verbose)

class KeywordExtractionKorean(BaseKeywordExtraction):
    def __init__(self, min_count=2, max_length=10, beta=0.85, max_iter=10,
                 verbose=True, num_words=20):
        self.min_count = min_count
        self.max_length = max_length
        self.beta = beta
        self.max_iter = max_iter
        self.verbose = verbose
        self.num_words = num_words
        self.inst = KRWordRank(min_count, max_length, self.verbose)

    def __call__(self, *args, **kwargs):
        # print(str(args[0]) + "\n")
        keywords, rank, graph = self.inst.extract(
            args[0], self.beta, self.max_iter, self.num_words)
        return keywords

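
# Hypothetical usage sketch for KeywordExtractionKorean above, assuming
# BaseKeywordExtraction and KRWordRank are importable. The instance is
# callable; the first positional argument is the list of sentences, and
# num_words caps the keywords returned by extract(). Toy sentences are shown
# here; a real corpus needs enough text to satisfy min_count.
extractor = KeywordExtractionKorean(min_count=2, num_words=20)
keywords = extractor(['예시 문장 입니다', '여러 문장의 list of str 입니다'])
print(keywords)  # {word: rank score}
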
def get_keywords(title, text):
    """
    :param title: title of article
    :param text: body of article
    :return: key_words
    """
    texts = [text]
    texts = [normalize(t, english=True, number=True) for t in texts]
    wordrank_extractor = KRWordRank(
        min_count=2,    # minimum word frequency (for graph construction)
        max_length=10,  # maximum word length
        verbose=True)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
    # classify keywords using their rank
    tagger = Komoran()
    stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV')])
    keyword_list = []
    for i in keywords:
        noun = tagger.nouns(i)
        if noun != []:
            keyword_list.append([noun[0], keywords[i]])
    keywords = []
    for i in keyword_list[:5]:
        keywords.append(i[0])
    title_keywords = []
    for j in keywords:
        if j in title:
            title_keywords.append(j)
    # filter against the module-level stopword_list; a comprehension avoids
    # the original bug of removing items from a list while iterating over it
    title_keywords = [w for w in title_keywords if w not in stopword_list]
    return title_keywords

def keyword_extract(lyrics):
    for l in lyrics:
        if len(l['lyric']) < 10:
            print(l['lyric'])
            continue
        # print(l['track_id'])
        wordrank_extractor = KRWordRank(
            min_count=5,    # minimum word frequency (for graph construction)
            max_length=10,  # maximum word length
            verbose=True)
        beta = 0.85  # decaying factor beta of PageRank
        max_iter = 10
        keywords, rank, graph = wordrank_extractor.extract(
            l['lyric'], beta, max_iter)
        l['keywords'] = keywords
    return lyrics

def extract_keyword_textrank(input_list):
    min_count = 5
    max_length = 10
    wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)
    beta = 0.85
    max_iter = 10
    texts = input_list
    keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
    stop_words = {'뉴스', '기자', '기사', '평점', '주연', '방송', '편성표'}
    filtered_words = {
        word: score
        for word, score in sorted(keywords.items(), key=lambda x: -x[1])[:100]
        if word not in stop_words
    }
    related_keyword = list(filtered_words.keys())
    return related_keyword[:15]

def keywords_kor():
    from krwordrank.word import KRWordRank  # uses the krwordrank package
    min_count = 1    # minimum word frequency (for graph construction); originally 5
    max_length = 10  # maximum word length
    wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    # texts = ['예시 문장 입니다', '여러 문장의 list of str 입니다', ... ]
    keywords, rank, graph = wordrank_extractor.extract(
        [globalVariable.fullText_f], beta, max_iter)
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
        globalVariable.keywordArr.append(word)
        # print('%8s:\t%.4f' % (word, r))
    globalVariable.keywordArr = globalVariable.keywordArr[:5]

def kr_wordrank(load_version):
    # flat list structure: one clinical record per element
    with open('data/revision/revision_' + load_version + '.txt', 'r',
              encoding='utf-8', newline='\n') as file:
        list_corpus = [sentence.strip() for sentence in file]
    wordrank_extractor = KRWordRank(
        min_count=5,    # minimum word frequency (for graph construction)
        max_length=10,  # maximum word length
        verbose=True)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    keywords, rank, graph = wordrank_extractor.extract(list_corpus, beta, max_iter)
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:100]:
        print('%8s:\t%.4f' % (word, r))

def make_wordcloud(movie_id_list, comments_list):
    ### build a keyword dict of word scores, then generate and save a word cloud
    for idx, texts in enumerate(comments_list):
        wordrank_extractor = KRWordRank(
            min_count=2,   # minimum word frequency (for graph construction)
            max_length=6,  # maximum word length
            verbose=True)
        beta = 0.85  # decaying factor beta of PageRank
        max_iter = 6
        keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
        keywords = dict(
            sorted(keywords.items(), key=lambda x: x[1],
                   reverse=True)[:min(len(keywords), 610)])
        # remove unwanted words
        key_set = set(keywords.keys())
        for w in exclude_set & key_set:
            keywords.pop(w)
        wordcloud = WordCloud(font_path=font_path, width=900, height=300,
                              background_color="white", mask=mask)
        wordcloud = wordcloud.generate_from_frequencies(keywords)
        wordcloud.recolor(color_func=color_func, random_state=1)
        # plt.figure(figsize=(10, 10))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        # plt.show()
        wordcloud.to_file(
            "movieapp/static/movieapp/img/wordcloud/{}.png".format(
                movie_id_list[idx]))
        print('{}번 완료'.format(movie_id_list[idx]))

def do_wr_keyword(self, video_name, video_description, comments, video_idx):
    min_count = 2    # minimum word frequency (for graph construction)
    max_length = 10  # maximum word length
    wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length,
                                    verbose=False)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    inputs = [video_name, video_description] + comments
    inputs = [v for v in inputs if v]
    if len(inputs) <= 3:
        print("No Korean")
        return []
    try:
        keywords, rank, graph = wordrank_extractor.extract(inputs, beta, max_iter)
    except ValueError:
        return []
    insert_list = []
    print("#### wordrank, 제목 및 설명 포함 키워드 목록 ####")
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True):
        if word in video_name or word in video_description:
            if self.do_sql:
                if r > 1.0:
                    insert_list.append(f"({video_idx}, '{word[:99]}'),")
            else:
                print("%8s:\t%.4f" % (word, r))
    print("#### wordrank, 전체 키워드 목록 ####")
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]:
        if self.do_sql:
            insert_list.append(f"({video_idx}, '{word[:99]}'),")
        else:
            print("%8s:\t%.4f" % (word, r))
    return insert_list

def extract_krwordrank(normalized_lines, noun_count_df):
    import pandas as pd
    print("extract_krwordrank")
    from krwordrank.word import KRWordRank
    wordrank_extractor = KRWordRank(
        min_count=min(noun_count_df["count"]),           # minimum word frequency
        max_length=max(noun_count_df["tag"].str.len()),  # maximum word length
        verbose=True)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    # keywords: the L parts (left substrings) that survive filtering
    # rank: the rank score of each substring in the substring graph
    # graph: the substring graph itself
    keywords, rank, graph = wordrank_extractor.extract(normalized_lines,
                                                       beta, max_iter)
    keyword_df = pd.DataFrame(list(keywords.items()), columns=['word', 'rank'])
    return keyword_df

def summarize(text, num_summaries, summary_ratio=0.2):
    text_split = text.split('. ')
    if len(text_split) < num_summaries:
        return 'SummarizationError: Number of sentences must be bigger than num_summaries'
    wordrank_extractor = KRWordRank(
        min_count=3,    # minimum word frequency (for graph construction)
        max_length=20,  # maximum word length
        verbose=False)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    try:
        keywords, rank, graph = wordrank_extractor.extract(text_split, beta, max_iter,
                                                           num_keywords=100)
        stopwords = set.union(get_stopwords(), set(stopwords_ko))
        vocab_score = make_vocab_score(keywords, stopwords, scaling=lambda x: 1)
        tokenizer = MaxScoreTokenizer(vocab_score)
        # tokenizer_mecab = Mecab()
        # repeat summarization until the summary reaches the target length
        text_summary = ""
        iter_num = 1
        while len(text_summary) < len(text) * summary_ratio:
            if len(text_split) < num_summaries * iter_num:
                break
            sents = keysentence(vocab_score, text_split, tokenizer.tokenize,
                                # tokenizer_mecab.nouns
                                diversity=0.7,
                                topk=num_summaries * iter_num)
            text_summary = '. '.join(sents)
            iter_num += 1
    except ValueError as e:
        return "SummarizationError: " + str(e)
    return text_summary

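
# Hypothetical usage sketch for summarize above. It assumes the helpers it
# calls (get_stopwords, stopwords_ko, make_vocab_score, MaxScoreTokenizer,
# keysentence) are importable as in this module. Sentences are split on '. ',
# so the input should use that delimiter; note that a corpus too small to
# satisfy min_count=3 takes the ValueError path and returns a
# "SummarizationError:" string instead of a summary.
article = '첫 문장입니다. 두 번째 문장입니다. 세 번째 문장입니다. 네 번째 문장입니다'
print(summarize(article, num_summaries=2, summary_ratio=0.3))
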
def get_key_sentences(text, num):
    import re
    print(text)
    sentences = re.split("[.?!]", text)
    sentences = [s.strip() for s in sentences]
    if len(sentences) < num:
        return [
            "KeySentencesError: Number of sentences must be bigger than num"
        ]
    wordrank_extractor = KRWordRank(
        min_count=3,    # minimum word frequency (for graph construction)
        max_length=20,  # maximum word length
        verbose=False)
    beta = 0.85  # decaying factor beta of PageRank
    max_iter = 10
    try:
        keywords, rank, graph = wordrank_extractor.extract(sentences, beta, max_iter,
                                                           num_keywords=100)
        stopwords = get_stopwords()
        vocab_score = make_vocab_score(keywords, stopwords,
                                       scaling=lambda x: 1)
        tokenizer = MaxScoreTokenizer(vocab_score)
        sents = keysentence(
            vocab_score, sentences, tokenizer.tokenize,  # tokenizer_mecab.nouns
            diversity=0.6,
            topk=num)
        return sents
    except ValueError as e:
        return ["KeySentencesError: " + str(e)]

def Keyword(texts):
    SIZE = 5
    extractor = KRWordRank(min_count=1, max_length=120)
    keywords, rank, graph = extractor.extract(texts, beta=0.85, max_iter=30)
    okt = Okt()
    wanted_pos = ['Noun']
    outputs = []
    for w, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True):
        pos = [n[0] for n in okt.pos(w) if n[1] in wanted_pos]
        outputs.extend(pos)
    print(outputs[:5])
    jsonData = json.dumps(dict(zip(range(SIZE), outputs[:SIZE])),
                          ensure_ascii=False)
    # SIZE = 5
    # with open("tagsOfArticle.json", "w") as f:
    #     json.dump(dict(zip(range(SIZE), outputs[:SIZE])), f, ensure_ascii=False)
    return jsonData

# -*- coding: utf-8 -*-
# for testing soynlp
from krwordrank.word import KRWordRank
import json
from docx2python import docx2python
import numpy as np
from krwordrank.word import summarize_with_keywords

min_count = 5    # minimum word frequency (for graph construction)
max_length = 10  # maximum word length
wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)


def json_2_list_Contents(json_path):
    temp_list = []
    with open(json_path, 'r', encoding='UTF8') as data_file:
        json_data = json.load(data_file)
    for w in json_data:
        if isinstance(w['content'][0], list):
            # flatten nested content lists
            for wl in w['content']:
                temp_list.extend(wl)
            # temp = np.array(w['content']).flatten().tolist()
            # temp_list.extend(temp)
        else:
            temp_list.extend(w['content'])
    return temp_list

def get_texts_scores(fname):
    with open(fname, encoding='utf-8') as f:
        docs = [doc.split('.') for doc in f]
    if not docs:
        return []
    # texts = zip(*docs)
    # print(docs)
    return docs.pop()  # note: this returns only the last document's sentences


texts = get_texts_scores(fname)
print(texts)

import sys
sys.path.append('../')
from krwordrank.word import KRWordRank
from krwordrank.hangle import normalize
import krwordrank

wordrank_extractor = KRWordRank(
    min_count=3,   # minimum word frequency (for graph construction)
    max_length=8,  # maximum word length
    verbose=True)
beta = 0.85  # decaying factor beta of PageRank
max_iter = 10
keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

# length of the longest noun, in characters
print("최장 단어 길이 ", max(noun_count_df["tag"].str.len()))
# minimum occurrence count among the nouns
print("최소 출현 빈도 수 ", min(noun_count_df["count"]))

# In[12]:

frequency_noun_list = noun_count_df[noun_count_df["count"] > 0]["tag"].tolist()

# In[13]:

from krwordrank.word import KRWordRank

wordrank_extractor = KRWordRank(
    min_count=min(noun_count_df["count"]),           # minimum word frequency
    max_length=max(noun_count_df["tag"].str.len()),  # maximum word length
    verbose=True)
beta = 0.85  # decaying factor beta of PageRank
max_iter = 10
# keywords: the L parts (left substrings) that survive filtering
# rank: the rank score of each substring in the substring graph
# graph: the substring graph itself
keywords, rank, graph = wordrank_extractor.extract(normalized_lines, beta, max_iter)

# In[14]:

keyword_df = pd.DataFrame(list(keywords.items()), columns=['word', 'rank'])

                      i)  # the number of the board to crawl
    for article_dict in articles:
        texts.append(article_dict['article']['title'])
        texts.append(article_dict['article']['text'])
        for comment in article_dict['comments']:
            comment_text = comment['text']
            if comment_text == "삭제된 댓글입니다.":  # skip deleted comments
                continue
            else:
                texts.append(delete_word(comment_text))
    time.sleep(1)

print(texts)

wordrank_extractor = KRWordRank(
    min_count=3,    # minimum word frequency (for graph construction)
    max_length=10,  # maximum word length
    verbose=True)
beta = 0.85  # decaying factor beta of PageRank
max_iter = 10
# keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
keywords = summarize_with_keywords(texts)
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

krwordrank_cloud = WordCloud(font_path=font_path, width=800, height=800,
                             background_color="white")
popstrings = "내가 그냥 근데 너무"

"가", ] min_count = 5 # 단어의 최소 출현 빈도수 (그래프 생성 시) max_length = 10 # 단어의 최대 길이 beta = 0.85 # PageRank의 decaying factor beta max_iter = 10 verbose = True #키워드 하나씩만 추출. for false_tag in false_tags: doc_id = false_tag[0] #도큐먼트 id 추출 content = false_tag[1] sentences = kkma.sentences(content) #추출된 내용을 문장별로 나눔. nouns = [] for sentence in sentences: if sentence is not '': nouns.append(' '.join([ noun for noun in kkma.nouns(str(sentence)) if noun not in stopwords and len(noun) > 1 ])) wordrank_extractor = KRWordRank(min_count, max_length) keywords, rank, graph = wordrank_extractor.extract(nouns, beta, max_iter, verbose) for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]: print(doc_id) print('%8s:\t%.4f' % (word, r))
import os, sys

cur_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(cur_dir)
os.chdir('..')
os.chdir('./public')

sentence_pattern = re.compile(r'\n+|[.?!]')
df = pd.read_csv('./contents.csv')
data = df[['title', 'body']].agg('\n'.join, axis=1)
split_data = [sentence_pattern.split(row) for row in data]

min_count = 4    # minimum word frequency (for graph construction)
max_length = 10  # maximum word length
wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)
beta = 0.85  # decaying factor beta of PageRank
max_iter = 10
verbose = True

df.tagList = df.tagList.astype(str)
for i, row in enumerate(split_data):
    try:
        keywords, rank, graph = wordrank_extractor.extract(row, beta, max_iter)
        print(f'[success] index: {i}, len: {len(row)}, keywords: {tuple(keywords.keys())}')
        df.at[i, 'tagList'] = ' '.join(keywords.keys())
    except Exception:
        # extract fails (e.g. raises ValueError) when a document yields no keywords
        print(f'[fail] index: {i}, len: {len(row)}')

keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))
'''  # closes a commented-out block opened earlier in the file

'''
from krwordrank.sentence import summarize_with_sentences
# keywords, sents = summarize_with_sentences(texts, num_keywords=100, num_keysents=10)
penalty = lambda x: 0 if (15 <= len(x) <= 80) else 1
keywords, sents = summarize_with_sentences(
    texts,
    penalty=penalty,
    diversity=0.5,
    num_keywords=100,
    num_keysents=2,
    verbose=False
)
print(keywords)
print(sents)
'''

from krwordrank.word import KRWordRank

# texts = []  # Comments about 'La La Land (2016)'
wordrank_extractor = KRWordRank(min_count=5, max_length=10)
keywords, rank, graph = wordrank_extractor.extract(texts, num_keywords=20)
print(keywords)

# La La Land
fname = './data_analysis/fail_all.txt'
# fname = './data_analysis/pass_all.txt'

# texts, scores = get_texts_scores(fname)
texts = get_texts(fname)

from krwordrank.word import KRWordRank
import krwordrank

print(krwordrank.__version__)

# train the KR-WordRank model
wordrank_extractor = KRWordRank(
    min_count=1,    # minimum word frequency (for graph construction)
    max_length=10,  # maximum word length
    verbose=True
)
beta = 0.85  # decaying factor beta of PageRank
max_iter = 10
keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

# check the top 30 keywords with their scores
for word, r in sorted(keywords.items(), key=lambda x: -x[1])[:30]:
    print('%8s:\t%.4f' % (word, r))

# remove stopwords

def insert_keyword():
    mongoDB = myMongoDB("CapstoneTest")
    okt = Okt()
    min_count = 1    # minimum word frequency (for graph construction)
    max_length = 10  # maximum word length
    string_idx = 0
    total_clean_sentence = []
    string_id = []
    stop_words = [
        '이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나',
        '사람', '주', '섯알', '가운데', '보이', '아니', '등', '같', '우리', '때', '년',
        '가', '한', '지', '대하', '오', '말', '일', '김재', '종', '매사', '스스로',
        '하자', '그렇', '위하', '대한', '확', '관련', '이상', '미만', '경우', '텔레',
        '다시', '때문', '대규모', '뭔가', '디섐보', '퍼터', '제대로', '관', '지난',
        '비준', '지난해', '위해', '곳곳', '현재', '당일', '주요', '일대', '기', '날',
        '코로', '물이', '간사', '요즘', '거기', '내', '지금', '정도', '이번', '처음',
        '모두', '통해', '더욱', '앞서', '진짜', '거', '올레', '가가', '해도', '한번',
        '원래', '사실', '옆', '정말', '올해', '스', '민', '초', '최근', '앞', '역시',
        '이후', '군', '먼저', '노', '해당', '최고', '가장', '중', '양', '대해',
        '사이', '얼마', '아주', '대비', '셈', '각국', '실거주', '실수요자', '실',
        '대부분', '섯알', '셀', '내년', '유독', '언제', '문득', '늘', '다른', '동안',
        '덩', '역시', '당시', '최', '변', '살', '이번', '씨', '랄라블', '점차',
        '건수', '번', '쥴', '리', '상대로', '송', '이제', '매년', '곳', '오늘', '듯',
        '아무', '괜', '하나', '차지', '오히려', '순간', '속', '누군가', '밥주',
        '스마', '문하', '정유', '주얼', '좀더', '먼저', '디섐보', '일주', '것처',
        # a comma was missing between the next two entries in the original,
        # which silently concatenated them into one string
        '에브리', '이전',
        '비대', '각종', '임', '누구', '일일', '필', '부', '트럼', '초등학', '이하',
        '에브리'
    ]
    for content in mongoDB.collected.find({}, {"_id": 1, "content": 1}):
        cleaned_sentence = []
        clean_sentence = []
        string_id.append(list(content.values())[0])
        string = list(content.values())[1]
        string = string.replace(u'\xa0', u' ')
        string = string.replace(u'\n', u' ')
        string = string.replace(u'\r', u' ')
        clean_sentence.append(sent_tokenize(string))
        for i in clean_sentence:
            for j in i:
                cleaned_sentence.append(j)
        total_clean_sentence.append(cleaned_sentence)
    for clean_sentence in total_clean_sentence:
        noun_keyword_list = []
        stop_keyword_list = []
        keyword_list = []
        wordrank_extractor = KRWordRank(min_count=min_count, max_length=max_length)
        beta = 0.85
        max_iter = 10
        try:
            keywords, rank, graph = wordrank_extractor.extract(
                clean_sentence, beta, max_iter)
        except ValueError:
            # store the literal placeholder 'keywords' when extraction fails
            mongoDB.collected.update_one({'_id': string_id[string_idx]},
                                         {'$set': {'keyword': 'keywords'}})
            string_idx += 1
            continue
        for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True):
            keyword_list.append(word)
        for i in keyword_list:
            a = okt.pos(i)
            if a[0][1] == 'Noun':
                noun_keyword_list.append(a[0][0])
        for i in noun_keyword_list:
            if i not in stop_words:
                stop_keyword_list.append(i)
        if len(stop_keyword_list) == 0:
            stop_keyword_list.append('')
        s1 = set(stop_keyword_list)
        s1_list = list(s1)
        s2_list = s1_list[:5]
        mongoDB.collected.update_one(
            {'_id': string_id[string_idx]},
            {'$set': {'keyword': s1_list, 'point_keyword': s2_list}})
        string_idx += 1

from krwordrank.word import KRWordRank
from krwordrank.sentence import make_vocab_score
from krwordrank.sentence import MaxScoreTokenizer
from krwordrank.sentence import keysentence

# read the text to analyze
fileName = 'kor_input.txt'
texts = []
with open(fileName, encoding='utf-8-sig') as file:
    for line in file:
        texts.append(line.split(',')[-1].rstrip())  # adjust to your text layout

# train the keyword extractor
wordrank_extractor = KRWordRank(
    min_count=5,    # minimum word frequency
    max_length=10,  # maximum word length
    verbose=True
)
beta = 0.85
max_iter = 10
keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter,
                                                   num_keywords=100)

# write out the top words
with open('kor_word_output.txt', mode='w', encoding='utf-8-sig') as file:
    for word, r in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]:
        file.write('%8s:\t%.4f\n' % (word, r))

stopwords = {}  # empty; no stopwords applied here
vocab_score = make_vocab_score(keywords, stopwords, scaling=lambda x: 1)
tokenizer = MaxScoreTokenizer(vocab_score)  # scores words within each sentence