Example #1
def remove_stopwords(tokens):
    # Load the Korean stopword list (one word per line) from the opened file.
    with open('data/stopwords-ko.txt', 'r', encoding='utf-8') as f:
        file_lines = f.readlines()

    stopwords = [line.strip() for line in file_lines]
    new_tokens = []
    for token in tokens:
        if token not in stopwords:
            new_tokens.append(token)
    return new_tokens
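
A minimal usage sketch for this helper, assuming konlpy is installed and the data/stopwords-ko.txt path from the snippet exists (the sample sentence is made up):

from konlpy.tag import Okt

# Tokenize a short Korean sentence and drop the stopwords.
okt = Okt()
tokens = okt.morphs('오늘은 날씨가 정말 좋다')
print(remove_stopwords(tokens))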
Example #2
def tokenize_corpus(corpus_file):
    # Read the raw Korean corpus and split it into lines.
    raw_document_ko = read_txt(corpus_file)
    lines = raw_document_ko.split('\n')

    # Clean the lines with the project's helper functions, then flatten
    # the result back into a single whitespace-separated string.
    processed_doc_ko = remove_stopwords(
        remove_extraneous(remove_english(remove_extraneous(lines))))
    doc_ko = ' '.join(str(word) for line in processed_doc_ko for word in line)

    # Morpheme-level tokenization with konlpy's Okt analyzer.
    t = Okt()
    tokens_ko = t.morphs(doc_ko)

    return tokens_ko
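
The helpers read_txt, remove_english and remove_extraneous are not defined in this example; read_txt appears to be konlpy.utils.read_txt, while the other two could plausibly look like the hypothetical sketches below (the real project versions may differ):

import re

def remove_english(lines):
    # Hypothetical helper: strip Latin letters from each line.
    return [re.sub(r'[a-zA-Z]', '', line) for line in lines]

def remove_extraneous(lines):
    # Hypothetical helper: drop punctuation/symbols and blank lines.
    cleaned = [re.sub(r'[^\w\s]', '', line) for line in lines]
    return [line for line in cleaned if line.strip()]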
Example #3
def crawling():

    #os.remove('templates/test.json')

    #time.sleep(0)

    raw = requests.get(
        "https://search.naver.com/search.naver?where=news&query=%EA%B0%9C%EB%B0%9C%EC%9E%90"
    )
    soup = BeautifulSoup(raw.text, 'html.parser')

    # Article summary snippets
    contents_lists = soup.select('ul.type01 dl')
    for contents_list in contents_lists:
        #print('==='*40)
        #print(contents_list)
        contents_cleansing(contents_list)  # cleanse the summary text

    # Store all the collected text in a dictionary
    result = {"contents": contents_text}

    #resultJSON = json.dumps(result, ensure_ascii=False)
    df = pd.DataFrame(result)
    df.to_json('templates/itnews.json', orient='table', force_ascii=False)

    # to_json writes UTF-8, so read the file back with that encoding.
    news = utils.read_txt('templates/itnews.json', encoding='utf-8')

    okt = Komoran()
    noun = okt.nouns(news)

    # Drop single-character nouns; filtering into a new list avoids the
    # skipped elements that popping during iteration would cause.
    noun = [v for v in noun if len(v) >= 2]

    count = Counter(noun)

    noun_list = count.most_common(10)
    for v in noun_list:
        print(v)

    new_list = {"rank": noun_list}
    dn = pd.DataFrame(new_list)
    dn.to_json('templates/rank.json', orient='table', force_ascii=True)

    return render_template('rank.json', string=json)
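
Both crawler examples call contents_cleansing() and read a contents_text collection that are defined elsewhere in the project; a minimal hypothetical sketch consistent with how they are used above:

import re

contents_text = []  # hypothetical module-level list of cleaned summaries

def contents_cleansing(contents):
    # Strip HTML tags, collapse whitespace, and collect the cleaned summary.
    cleaned = re.sub(r'<[^>]+>', '', str(contents))
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    contents_text.append(cleaned)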
Example #4
def crawling():
    
    #os.remove('templates/test.json')
    
    raw = requests.get("https://search.naver.com/search.naver?where=news&query=%EA%B0%9C%EB%B0%9C%EC%9E%90",headers={'User-Agent':'Mozilla/5.0'})
    soup = BeautifulSoup(raw.text, 'html.parser')
        
    # Article summary snippets
    contents_lists = soup.select('ul.type01 dl')
    for contents_list in contents_lists:
        #print('==='*40)
        #print(contents_list)
        contents_cleansing(contents_list)  # cleanse the summary text
        
    # Store all the collected text in a dictionary
    result = {"contents": contents_text}
    
    #resultJSON = json.dumps(result, ensure_ascii=False)
    df = pd.DataFrame(result)
    df.to_json('templates/itnews.json', orient='table', force_ascii=False)

    # to_json writes UTF-8, so read the file back with that encoding.
    news = utils.read_txt('templates/itnews.json', encoding='utf-8')
    
    okt = Komoran()
    noun = okt.nouns(news)

    # Drop single-character nouns; filtering into a new list avoids the
    # skipped elements that popping during iteration would cause.
    noun = [v for v in noun if len(v) >= 2]

    count = Counter(noun)
    result = list()
    
    for i, v in count.most_common(10):
        insert_data = {'tag': [i], 'count': [v]}
        result.append(insert_data)
    
    res_dict = {'rank':result}
    api = json.dumps(res_dict, ensure_ascii=False)
    
    return api
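
Example #3 uses Flask's render_template, so this variant is presumably served from a Flask app as well; a minimal hypothetical route wiring (the endpoint name and app object are assumptions):

from flask import Flask

app = Flask(__name__)

@app.route('/rank')  # hypothetical endpoint
def rank():
    # crawling() already returns a JSON string, so serve it directly.
    return app.response_class(crawling(), mimetype='application/json')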
Example #5
# -*- coding: utf-8 -*-
from konlpy.tag import Komoran
from collections import Counter
from konlpy import utils
import csv
#import json

# Open the crawled JSON file
# (extract writes the file as <year><month><day>.json, e.g. 2020529.json)
news = utils.read_txt('templates/2020529.json', encoding='utf-8')

okt = Komoran()
noun = okt.nouns(news)

# Exclude nouns that are only one character long
# ('올해' ("this year") is also a noun, ≒ 금년(今年))
# Filter into a new list; popping while iterating would skip elements.
noun = [v for v in noun if len(v) >= 2]

count = Counter(noun)

# Top 10 most frequent nouns
noun_list = count.most_common(10)
for v in noun_list:
    print(v)
    
# Save to a txt file
#with open("templates/rank.txt",'w',encoding='utf-8') as f:
#    for v in noun_list:
#        f.write(" ".join(map(str,v)))
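
csv is imported above but never used in the visible part of the script; a small sketch of how the ranking could be written out with it (the templates/rank.csv filename is an assumption):

# Write the top-10 noun counts to a CSV file (filename assumed).
with open('templates/rank.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['noun', 'count'])
    writer.writerows(noun_list)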
Example #6

#
# def draw_cloud(tags, filename, fontname='Noto Sans CJK', size=(800, 600)):
#     pytagcloud.create_tag_image(tags, filename, fontname=fontname, size=size)
#     webbrowser.open(filename)

import re

from konlpy import utils
from konlpy.tag import Komoran

def cleansing(text):
    # Strip Latin letters, then punctuation, symbols and stray Hangul jamo.
    cleaned_text = re.sub(r'[a-zA-Z]', '', text)
    cleaned_text = re.sub(
        r'[{}\[\]/?.,;:|)*~`!^\-_+<>@#$%&\\=(\'"♥♡ㅋㅠㅜㄱㅎㄲㅡ]',
        '', cleaned_text)
    return cleaned_text


text = utils.read_txt("C:\\Users\\Faust\\PycharmProjects\\SWC\\media\\zinuzian\\20191030\\500347562\\500347562.txt", encoding=u'utf-8').split("\n")
print(text)
processed = u""
k = Komoran()
i = 0
while text:
    line = text.pop(0)
    i += 1
    if line:
        # Split off the leading token (apparently a timestamp) from the text.
        timeline, data = line.split(" ", maxsplit=1)
        try:
            # Keep only text that survives a strict UTF-8 round trip.
            check = data.encode('utf-8')
            check.decode('utf-8', 'strict')
            processed += data + u"\n"
        except UnicodeDecodeError:
            pass
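
The snippet ends with the Komoran instance k still unused; a hypothetical continuation in the spirit of the earlier examples, counting frequent nouns in the cleaned transcript, might look like this:

from collections import Counter

# Clean the accumulated text, extract nouns, and keep multi-character ones.
nouns = [n for n in k.nouns(cleansing(processed)) if len(n) >= 2]
for noun, freq in Counter(nouns).most_common(10):
    print(noun, freq)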