def get_df_news_word(self, filePath, encoding):
    """Tokenise crawled COVID news articles and return the 30 most common
    nouns as a (word, count) DataFrame.

    Side effect: persists the result to ./csv/result_covid_news_word.csv
    (creating ./csv first if needed).

    :param filePath: CSV produced by the crawler; must contain a
        'contents' column.
    :param encoding: encoding passed through to the CSV loader.
    :return: pandas.DataFrame with columns ['word', 'count'].
    """
    df = handler.load_to_csv(filePath, encoding)
    contents = df['contents'].tolist()
    print('list load end')
    # Domain-specific noise words excluded from the frequency count.
    # BUG FIX: the original list had `'대상' '단계'` without a comma between
    # them, which Python silently concatenates into the single token
    # '대상단계' — so neither '대상' nor '단계' was actually filtered.
    # A set also makes the per-token membership test O(1).
    stop_words = {
        '지난', '진자', '판정', '대통령', '위해', '지역', '사람', '관련',
        '이후', '대해', '개발', '올해', '당국', '경우', '국내', '때문',
        '조사', '최근', '이번', '확인', '증가', '진행', '통해', '신종',
        '지난달', '대상', '단계', '우리', '상황', '현재', '조치',
    }
    okt = Okt()
    total = []
    for content in contents:
        # Rows with empty contents load as NaN (float); str() keeps pos() safe.
        content = str(content)
        for word, tag in okt.pos(content):
            # Keep only multi-character nouns that are not stop words.
            if tag == 'Noun' and len(word) > 1 and word not in stop_words:
                total.append(word)
    count = Counter(total)
    noun_list = count.most_common(30)
    print('noun_list load end')
    if not Checker.check_folder_path('./csv'):
        handler.crete_folder('./csv')
    handler.save_to_csv('./csv/result_covid_news_word.csv', noun_list,
                        ['word', 'count'], 'utf-8-sig')
    colnames = ['word', 'count']
    return pd.DataFrame(noun_list, columns=colnames)
def get_df_news_word(self, filePath, encoding):
    """Extract the 10 most frequent nouns from crawled economy news.

    Loads the crawler CSV, POS-tags every article with Okt, keeps
    multi-character nouns that are not stop words, saves the top-10
    (word, count) pairs to ./csv/result_economy_news_word.csv, and
    returns them as a DataFrame with columns ['word', 'count'].
    """
    frame = handler.load_to_csv(filePath, encoding)
    articles = frame['contents'].tolist()
    print('list load end')
    # Generic/administrative terms that would dominate the count.
    stop_words = ['크게', '여기', '서울', '정부', '위원회', '사업',
                  '한국', '옵티머스', '의원', '금융감독원', '국회', '지난']
    tokenizer = Okt()
    nouns = [
        token
        for article in articles
        for token, pos in tokenizer.pos(str(article))
        if pos == 'Noun' and len(token) > 1 and token not in stop_words
    ]
    noun_list = Counter(nouns).most_common(10)
    print('noun_list load end')
    for value in noun_list:
        print(value)
    colnames = ['word', 'count']
    if not Checker.check_folder_path('./csv'):
        handler.crete_folder('./csv')
    handler.save_to_csv('./csv/result_economy_news_word.csv',
                        noun_list, colnames, 'utf-8-sig')
    return pd.DataFrame(noun_list, columns=colnames)
def get(self):
    """Populate the economy-news tables on first call and return the
    stored word counts as JSON.

    On an empty news table: crawl article URLs, fetch their contents with
    a pool of worker threads, persist both to CSV, and bulk-insert into
    the news DAO.  On an empty word table: derive word frequencies from
    the saved CSV and bulk-insert those.  Always returns every row of the
    word table, JSON-serialised.
    """
    economy_news_count = self.news_dao.count()
    if economy_news_count == 0:
        kdd = EconomyNewsKdd()
        urls = kdd.get_economy_news_urls()
        if not Checker.check_folder_path('./csv'):
            handler.crete_folder('./csv')
        handler.save_to_csv('./csv/result_economy_urls.csv', urls,
                            ['urls'], 'utf-8-sig')
        result_list = []
        thread_count = 6
        thread_list = []
        # Even share per thread (integer division).
        div_count = len(urls) // thread_count
        for idx in range(thread_count):
            start_idx = idx * div_count
            # BUG FIX: the original used start_idx + div_count for every
            # thread, silently dropping the remainder urls whenever
            # len(urls) is not a multiple of thread_count.  The last
            # thread now takes the tail of the list.
            if idx == thread_count - 1:
                end_idx = len(urls)
            else:
                end_idx = start_idx + div_count
            div_url = urls[start_idx:end_idx]
            thread = threading.Thread(
                target=kdd.get_contents_from_economy_urls,
                args=(div_url, result_list))
            thread_list.append(thread)
            thread.start()
        for thread in thread_list:
            thread.join()
        if not Checker.check_folder_path('./csv'):
            handler.crete_folder('./csv')
        handler.save_to_csv('./csv/result_economy_news.csv', result_list,
                            ['time', 'contents'], 'utf-8-sig')
        df = self.df.get_df_news(result_list)
        self.news_dao.save_data_bulk(df)
    economy_word_count = self.word_dao.count()
    if economy_word_count == 0:
        df = self.df.get_df_news_word('./csv/result_economy_news.csv',
                                      'utf-8-sig')
        self.word_dao.save_data_bulk(df)
    result = self.word_dao.find_all()
    return jsonify([item.json for item in result])
def __init__(self):
    """Construct the emotion DFO and its file-handling helper."""
    # Console banner so construction is visible while debugging.
    print('-----------emotionDfo--------------')
    # NOTE(review): `keyword` is not defined in this view — presumably a
    # module-level global set elsewhere; confirm it exists before this
    # class is instantiated, otherwise this raises NameError.
    print(keyword)
    # File I/O helper used by this DFO's load/save routines.
    self.fileReader = FileHandler()
# elif emotion_match != 3: elif emotion_find_key == 0: EmotionDao.bulk() print('ok!') elif emotion_find_key == 1: EmotionDao.find_update(keyword) # EmotionDao.update() print('ok!!') # ================================ kain code ======================================= if status_count == 0: endDate = datetime.date.today().strftime('%Y%m%d') datas = CovidStatusKdd().get_covid19_status(endDate) if len(datas) > 0: if not Checker.check_folder_path('./csv'): handler.crete_folder('./csv') keys = list(datas[0].keys()) handler.save_to_csv('./csv/result_covid19_status.csv', datas, keys, 'utf-8-sig') df = CovidStatusDf(keys).get_dataframe(datas) CovidStatusDao.save_data_bulk(df) # =================================================================================== # EmotionDao.emotion_fi_insert() # EmotionDao.find_insert(EmotionDto, keyword) # session.query(cls).filter(cls.keyword == emotion['keyword'])\ # if emotion_find == 0: # EmotionDao.find_insert() # session.query(emotion).filter(emotion.keyword == keyword).last()\
def __init__(self):
    """Construct the object with its file-handling helper."""
    # File I/O helper used by this object's load/save routines.
    self.fileHandler = FileHandler()
def __init__(self):
    """Construct the exchange DFO and its file-handling helper."""
    # Console banner so construction is visible while debugging.
    print('-----------ExchangeDfo--------------')
    # File I/O helper used by this DFO's load/save routines.
    self.fileHandler = FileHandler()
def __init__(self):
    """Construct the finance DFO and its file-handling helper."""
    # Console banner so construction is visible while debugging.
    print('-----------FinanceDfo--------------')
    # File I/O helper used by this DFO's load/save routines.
    self.fileHandler = FileHandler()
def get(self):
    """Populate the COVID-news tables for the requested keyword and
    return the stored word counts as JSON.

    Reads 'keyword' from the JSON request body.  On an empty news table:
    crawl Naver news URLs for the keyword, fetch contents with a pool of
    worker threads, persist to CSV, and bulk-insert into the news DAO.
    On an empty word table: derive word frequencies from the saved CSV
    and bulk-insert those.  Returns every row of the word table.

    NOTE(review): when 'keyword' is None this falls through and
    implicitly returns None (an empty Flask response) — confirm that is
    intended.
    """
    params = request.get_json()
    keyword = params['keyword']
    if keyword is not None:
        count = self.news_dao.count()
        if count == 0:
            crawler = CovidNewsKDD()
            print('get urls start')
            start_time = time.time()
            urls = crawler.get_naver_news_urls(keyword)
            print(
                f'get urls end. processing time : {time.time() - start_time}s'
            )
            if not Checker.check_folder_path('./csv'):
                handler.crete_folder('./csv')
            handler.save_to_csv('./csv/result_Covid19_urls.csv', urls,
                                ['urls'], 'utf-8-sig')
            print('get contents from urls start')
            start_time = time.time()
            result_list = []
            thread_count = 5
            thread_list = []
            # Even share per thread (integer division).
            div_count = len(urls) // thread_count
            for idx in range(thread_count):
                start_idx = idx * div_count
                # BUG FIX: the original used start_idx + div_count for
                # every thread, silently dropping the remainder urls
                # whenever len(urls) is not a multiple of thread_count.
                # The last thread now takes the tail of the list.
                if idx == thread_count - 1:
                    end_idx = len(urls)
                else:
                    end_idx = start_idx + div_count
                div_url = urls[start_idx:end_idx]
                thread = threading.Thread(
                    target=crawler.get_contents_from_naver_urls,
                    args=(div_url, result_list))
                thread_list.append(thread)
                thread.start()
            for thread in thread_list:
                thread.join()
            print(
                f'get contents from urls end. processing time : {time.time() - start_time}s'
            )
            if not Checker.check_folder_path('./csv'):
                handler.crete_folder('./csv')
            handler.save_to_csv('./csv/result_covid19_news.csv',
                                result_list, ['time', 'contents'],
                                'utf-8-sig')
            df = self.df.get_df_news(result_list)
            self.news_dao.save_data_bulk(df)
        wordcount = self.word_dao.count()
        if wordcount == 0:
            df = self.df.get_df_news_word('./csv/result_covid19_news.csv',
                                          'utf-8-sig')
            self.word_dao.save_data_bulk(df)
        result = self.word_dao.find_all()
        return jsonify([item.json for item in result])