from konlpy.tag import Okt


def remove_stopwords(tokens):
    # Load the Korean stopword list (one entry per line).
    with open('data/stopwords-ko.txt', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    # Keep only tokens that do not appear in the stopword list.
    return [token for token in tokens if token not in stopwords]
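# Usage sketch for remove_stopwords; the token list is made up, and the
# result assumes the particles '은' and '가' appear in stopwords-ko.txt:
#
#   >>> remove_stopwords(['오늘', '은', '날씨', '가', '좋다'])
#   ['오늘', '날씨', '좋다']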
def tokenize_corpus(corpus_file):
    # Read the raw Korean document and split it into lines.
    raw_document_ko = read_txt(corpus_file)
    lines = raw_document_ko.split('\n')
    # Strip noise and English text, then drop stopwords.
    processed_doc_ko = remove_stopwords(
        remove_extraneous(remove_english(remove_extraneous(lines))))
    # Rejoin into a single string; iterating each line with
    # `for word in line` would yield individual characters, not words.
    doc_ko = ' '.join(str(line) for line in processed_doc_ko)
    # Morpheme-level tokenization with KoNLPy's Okt tagger.
    t = Okt()
    tokens_ko = t.morphs(doc_ko)
    return tokens_ko
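# Usage sketch (the corpus path is hypothetical; read_txt, remove_english,
# and remove_extraneous are assumed to be defined elsewhere in this module):
if __name__ == '__main__':
    tokens_ko = tokenize_corpus('data/corpus-ko.txt')
    print(tokens_ko[:20])  # first 20 morphemes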
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup
from flask import render_template
from konlpy import utils
from konlpy.tag import Komoran


def crawling():
    #os.remove('templates/test.json')
    raw = requests.get(
        "https://search.naver.com/search.naver?where=news&query=%EA%B0%9C%EB%B0%9C%EC%9E%90"
    )
    soup = BeautifulSoup(raw.text, 'html.parser')

    # Article summaries
    contents_lists = soup.select('ul.type01 dl')
    for contents_list in contents_lists:
        # contents_cleansing() appends the cleaned summary text to the
        # global contents_text list (both defined elsewhere in the project).
        contents_cleansing(contents_list)

    # Collect everything into a dict and dump it as JSON
    result = {"contents": contents_text}
    df = pd.DataFrame(result)
    df.to_json('templates/itnews.json', orient='table', force_ascii=False)

    # Read the dump back (UTF-8, as written by to_json) and extract nouns
    news = utils.read_txt('templates/itnews.json', encoding='utf-8')
    komoran = Komoran()
    noun = komoran.nouns(news)
    # Exclude single-character nouns; popping while enumerating skips
    # elements, so filter with a comprehension instead.
    noun = [v for v in noun if len(v) >= 2]

    # Ten most frequent nouns
    count = Counter(noun)
    noun_list = count.most_common(10)
    for v in noun_list:
        print(v)

    new_list = {"rank": noun_list}
    dn = pd.DataFrame(new_list)
    # force_ascii=False keeps the Hangul readable in the output
    dn.to_json('templates/rank.json', orient='table', force_ascii=False)
    return render_template('rank.json')
import json
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup
from konlpy import utils
from konlpy.tag import Komoran


def crawling():
    #os.remove('templates/test.json')
    raw = requests.get(
        "https://search.naver.com/search.naver?where=news&query=%EA%B0%9C%EB%B0%9C%EC%9E%90",
        headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(raw.text, 'html.parser')

    # Article summaries
    contents_lists = soup.select('ul.type01 dl')
    for contents_list in contents_lists:
        contents_cleansing(contents_list)  # appends to the global contents_text

    # Collect everything into a dict and dump it as JSON
    result = {"contents": contents_text}
    df = pd.DataFrame(result)
    df.to_json('templates/itnews.json', orient='table', force_ascii=False)

    # Read the dump back (UTF-8, as written by to_json) and extract nouns
    news = utils.read_txt('templates/itnews.json', encoding='utf-8')
    komoran = Komoran()
    noun = komoran.nouns(news)
    # Exclude single-character nouns (filter instead of popping in place)
    noun = [v for v in noun if len(v) >= 2]

    # Top 10 nouns as a list of {'tag': ..., 'count': ...} records
    count = Counter(noun)
    result = [{'tag': [i], 'count': [v]} for i, v in count.most_common(10)]
    res_dict = {'rank': result}
    api = json.dumps(res_dict, ensure_ascii=False)
    return api
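# Sketch of serving crawling() over HTTP. The app object and the /rank
# route are assumptions for illustration; crawling() already returns a
# JSON string, so only the MIME type needs to be set.
from flask import Flask, Response

app = Flask(__name__)

@app.route('/rank')
def rank():
    return Response(crawling(), mimetype='application/json')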
# -*- coding: utf-8 -*-
from konlpy.tag import Komoran
from collections import Counter
from konlpy import utils
#import json

# Open the crawled JSON.
# The filename is generated by extract as <yyyymmdd>.json.
news = utils.read_txt('templates/2020529.json', encoding='utf-8')

komoran = Komoran()
noun = komoran.nouns(news)

# Exclude single-character nouns.
# ('올해' ("this year") apparently counts as a noun ≒ 금년(今年).)
# Popping while enumerating skips elements, so filter instead.
noun = [v for v in noun if len(v) >= 2]

count = Counter(noun)

# Top 10 nouns by frequency
noun_list = count.most_common(10)
for v in noun_list:
    print(v)

# Save as a txt file
#with open("templates/rank.txt", 'w', encoding='utf-8') as f:
#    for v in noun_list:
#        f.write(" ".join(map(str, v)))
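# Sketch: persist the top-10 list as templates/rank.json in the same
# orient='table' layout the crawling() route writes (pandas assumed):
import pandas as pd

pd.DataFrame({"rank": noun_list}).to_json(
    'templates/rank.json', orient='table', force_ascii=False)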
# def draw_cloud(tags, filename, fontname='Noto Sans CJK', size=(800, 600)):
#     pytagcloud.create_tag_image(tags, filename, fontname=fontname, size=size)
#     webbrowser.open(filename)

import re

from konlpy import utils
from konlpy.tag import Komoran


def cleansing(text):
    # Remove Latin letters.
    cleaned_text = re.sub('[a-zA-Z]', '', text)
    # Remove punctuation, symbols, and filler jamo (ㅋ, ㅠ, ㅜ, ...).
    cleaned_text = re.sub(
        '[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\\=\(\'\"\♥\♡\ㅋ\ㅠ\ㅜ\ㄱ\ㅎ\ㄲ\ㅡ]',
        '', cleaned_text)
    return cleaned_text


text = utils.read_txt(
    "C:\\Users\\Faust\\PycharmProjects\\SWC\\media\\zinuzian\\20191030\\500347562\\500347562.txt",
    encoding=u'utf-8').split("\n")
print(text)

processed = u""
k = Komoran()
for line in text:
    if not line:
        continue
    # Each line is "<timeline> <data>"; keep only the data part.
    timeline, data = line.split(" ", maxsplit=1)
    try:
        # Round-trip through UTF-8 to skip lines with broken encoding.
        data.encode('utf-8').decode('utf-8', 'strict')
        processed += data + u"\n"
    except UnicodeDecodeError:
        pass
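# Quick check of cleansing() on a made-up mixed line: Latin letters,
# punctuation, and the listed jamo are stripped; Hangul and digits remain.
print(cleansing('개발자 News!! ㅋㅋ (2019-10-30)'))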