コード例 #1
0
    def get_df_news_word(self, filePath, encoding):
        """Build a word-frequency DataFrame from a crawled covid-news CSV.

        Loads the CSV at *filePath*, extracts all nouns longer than one
        character (stopwords removed), saves the 30 most common
        (word, count) pairs to ./csv/result_covid_news_word.csv, and
        returns them as a DataFrame with columns ['word', 'count'].
        """
        df = handler.load_to_csv(filePath, encoding)
        contents = df['contents'].tolist()

        print('list load end')

        # BUG FIX: the original list was missing a comma after '대상', so
        # implicit string concatenation produced '대상단계' and neither
        # '대상' nor '단계' was ever filtered. A set also gives O(1) lookups.
        stop_words = {
            '지난', '진자', '판정', '대통령', '위해', '지역', '사람', '관련', '이후', '대해', '개발',
            '올해', '당국', '경우', '국내', '때문', '조사', '최근', '이번', '확인', '증가', '진행',
            '통해', '신종', '지난달', '대상',
            '단계', '우리', '상황', '현재', '조치',
        }

        okt = Okt()

        # Flatten every article into its qualifying nouns. okt.pos yields
        # (word, tag) pairs; non-string cells (e.g. NaN) are stringified first.
        total = [
            word
            for content in contents
            for word, tag in okt.pos(str(content))
            if tag == 'Noun' and len(word) > 1 and word not in stop_words
        ]

        noun_list = Counter(total).most_common(30)

        print('noun_list load end')

        if not Checker.check_folder_path('./csv'):
            handler.crete_folder('./csv')

        handler.save_to_csv('./csv/result_covid_news_word.csv', noun_list,
                            ['word', 'count'], 'utf-8-sig')

        colnames = ['word', 'count']

        return pd.DataFrame(noun_list, columns=colnames)
コード例 #2
0
ファイル: economy_df.py プロジェクト: silenc3502/Stock-Proj
    def get_df_news_word(self, filePath, encoding):
        """Load crawled economy-news contents from *filePath*, count nouns
        longer than one character (stopwords excluded), print and persist
        the top 10, and return them as a ['word', 'count'] DataFrame.
        """
        frame = handler.load_to_csv(filePath, encoding)
        articles = frame['contents'].tolist()

        print('list load end')

        excluded = {'크게', '여기', '서울', '정부', '위원회', '사업', '한국', '옵티머스', '의원', '금융감독원', '국회', '지난'}

        okt = Okt()

        # Flatten every article into its qualifying nouns; okt.pos yields
        # (word, tag) pairs, and non-string cells are stringified first.
        nouns = [
            token
            for article in articles
            for token, pos in okt.pos(str(article))
            if pos == 'Noun' and len(token) > 1 and token not in excluded
        ]

        noun_list = Counter(nouns).most_common(10)

        print('noun_list load end')

        for value in noun_list:
            print(value)

        colnames = ['word', 'count']

        if not Checker.check_folder_path('./csv'):
            handler.crete_folder('./csv')

        handler.save_to_csv('./csv/result_economy_news_word.csv', noun_list, colnames, 'utf-8-sig')

        return pd.DataFrame(noun_list, columns=colnames)
コード例 #3
0
    def get(self):
        """Flask-RESTful GET: on first call, crawl economy-news URLs with a
        small thread pool, persist articles to CSV and the DB, then derive
        and persist word frequencies; always returns the stored word
        frequencies as a JSON list.
        """
        economy_news_count = self.news_dao.count()

        if economy_news_count == 0:
            kdd = EconomyNewsKdd()
            urls = kdd.get_economy_news_urls()

            if not Checker.check_folder_path('./csv'):
                handler.crete_folder('./csv')

            handler.save_to_csv('./csv/result_economy_urls.csv', urls, ['urls'], 'utf-8-sig')

            result_list = []
            thread_count = 6
            thread_list = []
            # BUG FIX: integer division silently dropped up to
            # (thread_count - 1) trailing URLs; the last thread now takes
            # the remainder so every URL is crawled.
            div_count = len(urls) // thread_count

            for idx in range(thread_count):
                start_idx = idx * div_count
                end_idx = len(urls) if idx == thread_count - 1 else start_idx + div_count

                # Each worker appends into the shared result_list
                # (list.append is safe under the GIL).
                thread = threading.Thread(target=kdd.get_contents_from_economy_urls,
                                          args=(urls[start_idx:end_idx], result_list))
                thread_list.append(thread)
                thread.start()

            for thread in thread_list:
                thread.join()

            if not Checker.check_folder_path('./csv'):
                handler.crete_folder('./csv')

            handler.save_to_csv('./csv/result_economy_news.csv', result_list, ['time', 'contents'], 'utf-8-sig')
            df = self.df.get_df_news(result_list)
            self.news_dao.save_data_bulk(df)

        economy_word_count = self.word_dao.count()
        if economy_word_count == 0:
            df = self.df.get_df_news_word('./csv/result_economy_news.csv', 'utf-8-sig')
            self.word_dao.save_data_bulk(df)

        result = self.word_dao.find_all()
        return jsonify([item.json for item in result])
コード例 #4
0
 def __init__(self):
     # Debug trace emitted on construction.
     print('-----------emotionDfo--------------')
     # NOTE(review): `keyword` is not defined in this block — presumably a
     # module-level variable that must exist before instantiation; confirm.
     print(keyword)
     # CSV load/save helper used by this DFO.
     self.fileReader = FileHandler()
コード例 #5
0
    # elif emotion_match != 3:
    # NOTE(review): the opening `if` of this chain is outside this view.
    # Presumably emotion_find_key == 0 means no prior record exists for the
    # keyword (bulk insert) and 1 means it does (update in place) — confirm
    # against the code that computes emotion_find_key.
    elif emotion_find_key == 0:
        EmotionDao.bulk()
        print('ok!')
    elif emotion_find_key == 1:
        EmotionDao.find_update(keyword)
        # EmotionDao.update()
        print('ok!!')
    # ================================ kain code =======================================
    # First run only: fetch today's COVID-19 status, cache it to CSV, and
    # bulk-save a DataFrame built from the fetched records.
    if status_count == 0:
        endDate = datetime.date.today().strftime('%Y%m%d')
        datas = CovidStatusKdd().get_covid19_status(endDate)

        if len(datas) > 0:
            if not Checker.check_folder_path('./csv'):
                handler.crete_folder('./csv')

            # CSV column headers come from the keys of the first record;
            # assumes all records share the same keys — TODO confirm.
            keys = list(datas[0].keys())
            handler.save_to_csv('./csv/result_covid19_status.csv', datas, keys,
                                'utf-8-sig')

            df = CovidStatusDf(keys).get_dataframe(datas)
            CovidStatusDao.save_data_bulk(df)
    # ===================================================================================
    # EmotionDao.emotion_fi_insert()
    # EmotionDao.find_insert(EmotionDto, keyword)

    # session.query(cls).filter(cls.keyword == emotion['keyword'])\
    # if emotion_find == 0:
    # EmotionDao.find_insert()
    # session.query(emotion).filter(emotion.keyword == keyword).last()\
コード例 #6
0
 def __init__(self):
     # CSV load/save helper used by this class.
     self.fileHandler = FileHandler()
コード例 #7
0
 def __init__(self):
     # Debug trace emitted on construction.
     print('-----------ExchangeDfo--------------')
     # CSV load/save helper used by this DFO.
     self.fileHandler = FileHandler()
コード例 #8
0
 def __init__(self):
     # Debug trace emitted on construction.
     print('-----------FinanceDfo--------------')
     # CSV load/save helper used by this DFO.
     self.fileHandler = FileHandler()
コード例 #9
0
    def get(self):
        """Flask-RESTful GET: crawl Naver covid-19 news for the requested
        keyword (first call only), persist articles and word frequencies,
        and return the stored word frequencies as a JSON list.
        """
        params = request.get_json()
        # BUG FIX: the original fell off the end of the function (implicitly
        # returning None, which Flask cannot serialize) when no keyword was
        # supplied; return an empty JSON list instead.
        keyword = params.get('keyword') if params else None
        if keyword is None:
            return jsonify([])

        if self.news_dao.count() == 0:
            self._crawl_and_store(keyword)

        if self.word_dao.count() == 0:
            df = self.df.get_df_news_word('./csv/result_covid19_news.csv',
                                          'utf-8-sig')
            self.word_dao.save_data_bulk(df)

        result = self.word_dao.find_all()
        return jsonify([item.json for item in result])

    def _crawl_and_store(self, keyword):
        """Crawl Naver news for *keyword* with a small thread pool, save the
        articles to ./csv/result_covid19_news.csv, and bulk-insert them.
        """
        crawer = CovidNewsKDD()
        print('get urls start')
        start_time = time.time()
        urls = crawer.get_naver_news_urls(keyword)
        print(
            f'get urls end. processing time : {time.time() - start_time}s'
        )

        if not Checker.check_folder_path('./csv'):
            handler.crete_folder('./csv')

        handler.save_to_csv('./csv/result_Covid19_urls.csv', urls,
                            ['urls'], 'utf-8-sig')

        print('get contents from urls start')
        start_time = time.time()

        result_list = []
        thread_count = 5
        thread_list = []
        # BUG FIX: integer division silently dropped up to
        # (thread_count - 1) trailing URLs; the last thread now takes the
        # remainder so every URL is crawled.
        div_count = len(urls) // thread_count

        for idx in range(thread_count):
            start_idx = idx * div_count
            end_idx = len(urls) if idx == thread_count - 1 else start_idx + div_count

            # Each worker appends into the shared result_list
            # (list.append is safe under the GIL).
            thread = threading.Thread(
                target=crawer.get_contents_from_naver_urls,
                args=(urls[start_idx:end_idx], result_list))
            thread_list.append(thread)
            thread.start()

        for thread in thread_list:
            thread.join()

        print(
            f'get contents from urls end. processing time : {time.time() - start_time}s'
        )

        if not Checker.check_folder_path('./csv'):
            handler.crete_folder('./csv')

        handler.save_to_csv('./csv/result_covid19_news.csv',
                            result_list, ['time', 'contents'],
                            'utf-8-sig')

        df = self.df.get_df_news(result_list)
        self.news_dao.save_data_bulk(df)