def movieword(code):
    df1 = movie_start.Getdata([code], 20)
    noun_extractor = LRNounExtractor(verbose=True)
    noun_extractor.train(df1['text'])
    nouns = noun_extractor.extract()
    movie_wordcloud.displayWordCloud(str(code), ' '.join(nouns))
    return "ok"

def noun_extractor_test(corpus_path):
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor
    from soynlp.noun import NewsNounExtractor

    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)

    # LRNounExtractor
    print('LRNounExtractor test\n{}'.format('-' * 40))
    noun_extractor = LRNounExtractor()
    noun_scores = noun_extractor.train_extract(corpus)
    print('{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores)))
    topwords = sorted(
        noun_scores,
        key=lambda x: -noun_scores[x].score * noun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores[word].score))

    # NewsNounExtractor
    print('\nNewsNounExtractor test\n{}'.format('-' * 40))
    newsnoun_extractor = NewsNounExtractor()
    newsnoun_scores = newsnoun_extractor.train_extract(corpus)
    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(newsnoun_scores)))
    topwords = sorted(
        newsnoun_scores,
        key=lambda x: -newsnoun_scores[x].score * newsnoun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, newsnoun_scores[word].score))

    print('noun extractor test has been done\n\n')

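# Hedged usage sketch: 'sample_corpus.txt' is a placeholder path, assumed to be a
# file in soynlp's DoublespaceLineCorpus format (one document per line, sentences
# separated by double spaces); the file name is not part of the original code.
noun_extractor_test('sample_corpus.txt')
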
def movieword(code):
    df1 = movie_start.Getdata([code])
    # TODO: write the noun-only extraction step later
    noun_extractor = LRNounExtractor(verbose=True)
    noun_extractor.train(df1['text'])
    nouns = noun_extractor.extract()
    # join the extracted nouns and render them as a word cloud
    movie_wordcloud.displayWordCloud(str(code), ' '.join(nouns))
    return "ok"

def train():
    normed_path = path['norm']
    noun_src_path = path['noun']['src']
    noun_lrgraph_path = path['noun']['lrgraph']
    noun_trained_path = path['noun']['train']['pkl']
    noun_readable_path = path['noun']['train']['readable']
    noun_result_path = path['noun']['result']

    corpus = DoublespaceLineCorpus(normed_path, iter_sent=True)
    noun_extractor = LRNounExtractor(verbose=False, min_num_of_features=1)
    nouns = noun_extractor.train_extract(corpus, minimum_noun_score=0.5)

    word_freq = noun_extractor._wordset_l_counter
    lrgraph = noun_extractor.lrgraph
    words = noun_extractor.words

    trained_data = {}
    trained_data['lrgraph'] = lrgraph
    trained_data['words'] = words
    trained_data['word_freq'] = word_freq

    with open(noun_src_path, 'wb') as f:
        pickle.dump(trained_data, f)
    with open(noun_lrgraph_path, 'w', encoding='utf8') as f:
        json.dump(lrgraph, f, ensure_ascii=False, indent=4)

    params = {}
    for noun, noun_score in nouns.items():
        params[noun] = {
            'frequency': noun_score.frequency,
            'score': noun_score.score,
            'known_r_ratio': noun_score.known_r_ratio
        }

    with open(noun_trained_path, 'wb') as f:
        pickle.dump(params, f)
    with open(noun_readable_path, 'w', encoding='utf8') as f:
        json.dump(sorted(params.items()), f, ensure_ascii=False, indent=4)
    with open(noun_result_path, 'w', encoding='utf8') as f:
        json.dump(sorted(params), f, ensure_ascii=False, indent=4)

    update_user_dict()
    update(forced=True)

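# Hedged sketch (not part of the original module): one way the noun parameters
# pickled by train() could be reloaded and reused as scores for soynlp's
# LTokenizer. The path argument refers to the same path['noun']['train']['pkl']
# entry used above; the helper name and the score threshold are assumptions.
import pickle
from soynlp.tokenizer import LTokenizer

def load_noun_tokenizer(noun_trained_path, min_score=0.5):
    with open(noun_trained_path, 'rb') as f:
        params = pickle.load(f)
    # keep only nouns whose extraction score passes the threshold
    scores = {noun: p['score'] for noun, p in params.items() if p['score'] >= min_score}
    return LTokenizer(scores=scores)
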
def tag_counting(law_event_type):
    prec = pd.read_csv('law_list_detail.csv', encoding='utf-8')
    noun_extractor = LRNounExtractor(verbose=True)
    noun_extractor.train(prec[prec['law_event_type'] == law_event_type]
                         ['law_content'].astype('str').apply(preprocessing))
    nouns = noun_extractor.extract()
    count = Counter(nouns)
    # print(count)

    tag_count = []
    stopwords = make_stopword()
    # print(stopwords)
    for n, c in count.most_common(200):
        if n not in stopwords:
            dics = {'tag': n, 'count': c[0]}
            tag_count.append(dics)
            if len(tag_count) == 20:
                break
    # print(tag_count)

    for tag in tag_count:
        print("{:<14}".format(tag['tag']), end='\t')
        print("{}".format(tag['count']))

    df = pd.DataFrame.from_dict(tag_count, orient='columns')
    df.set_index(df['tag'], inplace=True)
    # print(df)

    # set the plot style
    plt.style.use('ggplot')
    ax1 = df.plot(kind='bar', figsize=(20, 10), width=0.7, stacked=False, legend=None)
    ax1.set_ylim(0, 60000)
    ax1.set_xlabel('단어', size=20)
    ax1.set_ylabel('빈도수', size=20)
    plt.title('사건 종류별 특정 단어 빈도수(형사)', size=20)
    plt.show()

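# Hedged usage sketch: the event-type value '형사' (criminal cases) is taken from the
# plot title above; law_list_detail.csv, preprocessing() and make_stopword() must be
# available in the surrounding module for this call to run.
tag_counting('형사')
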
def train_extractor(begin_d=None, end_d=None, sections: list = None,
                    base_dir='./out', tokenizer=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d,
                                             end_d=end_d,
                                             sections=sections,
                                             base_dir=base_dir)
    # nouns = get_noun_words(begin_d='20201101', end_d='20201130')
    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)  # list of str like
    noun_score = dict([(key, val.score) for key, val in nouns.items()])

    if tokenizer is None:
        tokenize = lambda x: x.strip().split()
    elif tokenizer == 'max_score_tokenizer':
        tokenize = MaxScoreTokenizer(noun_score)
    elif tokenizer == 'ltokenizer':
        tokenize = LTokenizer(noun_score)
    else:
        raise NotImplementedError

    if sections is not None and len(sections) >= 1:
        min_tf = 10
        min_df = 2
    else:
        min_tf = 20
        min_df = 2

    keyword_extractor = CorpusbasedKeywordExtractor(
        min_tf=min_tf,
        min_df=min_df,
        # tokenize=lambda x: x.strip().split(),
        tokenize=tokenize,
        verbose=True)
    # docs: list of str like
    keyword_extractor.train(sentences)
    return keyword_extractor, nouns, corpus_class

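# Hedged usage sketch for train_extractor(): the date range mirrors the commented
# get_noun_words() call above, the tokenizer choice is illustrative, and make_corpus()
# plus the soykeyword/soynlp imports are assumed to be provided by the surrounding module.
keyword_extractor, nouns, corpus_class = train_extractor(
    begin_d='20201101', end_d='20201130', tokenizer='ltokenizer')

# rank extracted nouns by frequency * score, mirroring the test function earlier
top_nouns = sorted(nouns.items(), key=lambda kv: -kv[1].frequency * kv[1].score)[:20]
for noun, ns in top_nouns:
    print('{}\tfrequency={}\tscore={:.3f}'.format(noun, ns.frequency, ns.score))
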
def get_keyword(characters):
    df = pd.read_csv("./MbtiApp/keyword/roles.csv")
    stopwords = pd.read_csv("./MbtiApp/keyword/stopwords.csv")["stopwords"]
    sentences = df.iloc[:, 2]
    sentences = list(sentences) + list(characters["feature_total"])

    # extract nouns
    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)
    nouns = sorted(nouns, key=lambda x: len(x), reverse=True)

    # remove stopwords
    for sw in stopwords:
        if sw in nouns:
            nouns.remove(sw)

    personal = []
    for i, row in characters.iterrows():
        noun_sen = ""
        for noun in nouns:
            if noun in row["feature_total"]:
                noun_sen = noun_sen + " #" + noun
        personal.append(noun_sen)
    characters["personal"] = personal
    return characters

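# Hedged usage sketch for get_keyword(): the sample rows below are made-up
# placeholders, and the roles.csv / stopwords.csv files referenced inside the
# function must exist under ./MbtiApp/keyword/ for the call to run.
import pandas as pd

sample_characters = pd.DataFrame({
    'name': ['캐릭터A', '캐릭터B'],
    'feature_total': ['차분하고 성실한 성격', '호기심 많고 장난이 심한 성격']
})
tagged = get_keyword(sample_characters)
print(tagged[['name', 'personal']])
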
def __init__(self, pre_trained=True, analyzer='Hannanum'):
    self.pre_trained = pre_trained

    if analyzer == 'Hannanum':
        self.analyzer = tag.Hannanum()
    elif analyzer == 'Kkma':
        self.analyzer = tag.Kkma()
    elif analyzer == 'Komoran':
        self.analyzer = tag.Komoran()
    elif analyzer == 'Mecab':
        self.analyzer = tag.Mecab()
    elif analyzer == 'Okt':
        self.analyzer = tag.Okt()
    else:
        if pre_trained:
            print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

    self.WordExtractor = WordExtractor(min_frequency=0)
    self.noun_extractor = LRNounExtractor(verbose=False)
    self.word_score = {}

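# Hedged illustration (not from the original class) of what the selected KoNLPy
# analyzer is used for: konlpy taggers such as Okt expose nouns() and pos().
# The sample sentence is a placeholder; konlpy and a working Java runtime are assumed.
from konlpy import tag

okt = tag.Okt()
print(okt.nouns('아버지가 방에 들어가신다'))  # e.g. ['아버지', '방']
print(okt.pos('아버지가 방에 들어가신다'))    # (morpheme, POS tag) pairs
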
                          width=width,
                          height=height).generate(data)
    wordcloud.to_file(os.path.join(currdir, "wc" + num + ".png"))
    # plt.figure(figsize=(15, 10))
    # plt.imshow(wordcloud)
    # plt.axis("off")
    # plt.show()


# In[51]:

from soynlp.noun import LRNounExtractor


# In[52]:

# train and extract nouns for each corpus in turn
noun_extractor = LRNounExtractor(verbose=True)

noun_extractor.train(sentences1)
nouns1 = noun_extractor.extract()

noun_extractor.train(sentences2)
nouns2 = noun_extractor.extract()

noun_extractor.train(sentences3)
nouns3 = noun_extractor.extract()

noun_extractor.train(sentences4)
nouns4 = noun_extractor.extract()

noun_extractor.train(sentences5)
nouns5 = noun_extractor.extract()

noun_extractor.train(sentences6)
nouns6 = noun_extractor.extract()

noun_extractor.train(sentences7)
nouns7 = noun_extractor.extract()

noun_extractor.train(sentences8)

plt.axis("off") plt.show() df = pd.read_csv('foo1.csv', engine='python', encoding='utf-8') tokenizer = RegexTokenizer() stopwords_kr = [ '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은', '많이', '정말', '너무', '[', ']', '것으로', '했습니다', '했다' ] sentences = df['본문'].apply(preprocessing) displayWordCloud(' '.join(sentences)) # soynlp로 명사 추출하기 noun_extractor = LRNounExtractor(verbose=True) noun_extractor.train(sentences) nouns = noun_extractor.extract() displayWordCloud(' '.join(nouns)) # 이미지 파일위에 출력하기 img = Image.open('cloud.png') img_array = np.array(img) wordcloud = WordCloud(font_path='/Library/Fonts/NanumBarunGothic.ttf', stopwords=stopwords_kr, background_color='white', mask=img_array, width=800, height=600).generate(' '.join(nouns)) plt.figure(figsize=(15, 10))
law_categoriesMin1 = law_categoriesMin['law_content'].astype('str').apply(
    preprocessing)
# print(law_categoriesMin1.head())
# displayWordCloud(' '.join(law_categoriesMin1))

# law_categoriesSe = prec[prec['law_event_type'] == "세무"]
# law_categoriesSe1 = law_categoriesSe['law_content'].astype('str').apply(preprocessing)
# displayWordCloud(' '.join(law_categoriesSe1))

# law_categoriesH = prec[prec['law_event_type'] == "일반행정"]
# law_categoriesH1 = law_categoriesH['law_content'].astype('str').apply(preprocessing)
# displayWordCloud(' '.join(law_categoriesH1))

# law_categoriesT = prec[prec['law_event_type'] == "특허"]
# law_categoriesT1 = law_categoriesT['law_content'].astype('str').apply(preprocessing)
# # print(law_categoriesT1)
# displayWordCloud(' '.join(law_categoriesT1))

# law_categoriesP = prec[prec['law_event_type'] == "형사"]
# law_categoriesP1 = law_categoriesP['law_content'].astype('str').apply(preprocessing)
# displayWordCloud(' '.join(law_categoriesP1))

noun_extractor = LRNounExtractor(verbose=True)
noun_extractor.train(law_categoriesMin1)

# extract only the nouns
nouns = noun_extractor.extract()
# print(type(nouns))
# print(nouns)
displayWordCloud(' '.join(nouns))
# displayWordCloud(' '.join(law_categoriesGa1))

def detail(m_no, current_movie_title):
    conn = pymysql.connect(host='127.0.0.1',
                           user='******',
                           password='******',
                           db='movie',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        with conn.cursor() as cursor:
            sql = 'select * from current_movie c inner join test t on c.current_movie_title = t.title where current_movie_title = %s;'
            cursor.execute(sql, (current_movie_title,))
            result = cursor.fetchone()  # fetch a single row

            sql = 'select * from current_movie where current_movie_title = %s;'
            cursor.execute(sql, (current_movie_title,))
            result1 = cursor.fetchone()  # fetch a single row

            sql = 'select * from board where m_no= %s;'
            cursor.execute(sql, (m_no,))
            board = cursor.fetchall()
    finally:
        conn.close()

    if result is not None:
        tmrvl = []
        movieName = result['codem']
        for page in range(1, 200):
            url = "https://movie.naver.com/movie/bi/mi/review.nhn?code=" + str(movieName) + "&page=" + str(page)
            response = urllib.request.urlopen(url)
            soup = BeautifulSoup(response, 'html.parser')
            table = soup.select('ul.rvw_list_area li a')
            for result3 in table:
                mrv = str(result3.string)
                tmrv = tuple([mrv])
                tmrvl.append(tmrv)
                # tmrv1 = str(tmrv)
                # f.write(tmrv1)
        df = pd.DataFrame(tmrvl)

        def preprocessing(text):
            # remove newline characters
            text = re.sub('\\\\n', ' ', text)
            return text

        tokenizer = RegexTokenizer()
        stopwords_kr = [
            '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은',
            '많이', '정말', '너무', '[', ']', '것으로', '했습니다', '했다'
        ]
        sentences = df[0].apply(preprocessing)

        # extract nouns with soynlp
        noun_extractor = LRNounExtractor(verbose=True)
        noun_extractor.train(sentences)
        nouns = noun_extractor.extract()

        # draw the word cloud on top of an image mask
        img = Image.open('IT_Bank_Movie/static/img/cloud.png')
        img_array = np.array(img)
        wordcloud = WordCloud(font_path='/Library/Fonts/NanumBarunGothic.ttf',
                              stopwords=stopwords_kr,
                              background_color='white',
                              mask=img_array,
                              width=800,
                              height=600).generate(' '.join(nouns))
        plt.figure(figsize=(15, 10))
        plt.imshow(wordcloud)
        plt.axis("off")
        # plt.show()
        url1 = "IT_Bank_Movie/static/wordcloud/" + current_movie_title + ".png"
        wordcloud.to_file(url1)

    return render_template('movie_detail.html',
                           wordInfo=result,
                           board=board,
                           movieInfo=result1)

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

stopwords_kr = [
    '하지만', '그리고', '그런데', '저는', '제가', '그럼', '이런', '저런', '합니다', '많은', '많이',
    '정말', '너무'
]


def displayWordCloud(data=None, backgroundcolor='white', width=800, height=600):
    wordcloud = WordCloud(font_path=fontpath,
                          stopwords=stopwords_kr,
                          background_color=backgroundcolor,
                          width=width,
                          height=height).generate(data)
    plt.figure(figsize=(15, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


# noun_extractor = LRNounExtractor(verbose=True)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(content)
# nouns = noun_extractor.extract()
print(nouns)
# displayWordCloud(' '.join(nouns))