Exemple #1
0
def process_refine(news_route_list):
    """Refine every news route and collect the non-empty results.

    Each route is passed to ``refining.refin_new_day``; results that are
    ``None`` or have zero length are skipped.

    Args:
        news_route_list: Iterable of news routes, handed to the refiner
            one at a time.

    Returns:
        A list holding the refined data of every route that produced
        something, or ``None`` (after logging a message) when no route
        produced any data.
    """
    news_refine = []
    for news_route in news_route_list:
        refine_data = refining.refin_new_day(news_route)
        # len() is used instead of plain truthiness so that containers with
        # ambiguous truth values are still handled.
        if refine_data is not None and len(refine_data) > 0:
            news_refine.append(refine_data)

    # The accumulator is always a list, so an ``is None`` test could never
    # fire; check for emptiness so the failure path is actually reachable.
    if not news_refine:
        print_log('refine result is None. please check refine process or check news directory.')
        return None

    return news_refine
Exemple #2
0
    def news_process(self):
        """Run the read -> noun extraction -> count -> filter pipeline.

        Settings are loaded from ``../files/settings.csv`` first. Each stage
        is timed with ``time.time()`` and a log line containing the process
        id, ``self.name`` and the elapsed time is printed. The four log lines
        are joined with newlines into ``self.process_log``. Nouns are written
        through ``self.write_nouns`` and the final result through
        ``self.write_result``.
        """
        Settings.settings(r'../files/settings.csv')

        def _report(template, started, finished):
            # Fill the stage template, echo it, and hand it back for the log.
            line = template.format(os.getpid(), self.name,
                                   str(finished - started))
            print(line)
            return line

        # Stage 1: refine the raw news, recode it and load it.
        read_started = time.time()
        refined = refining.refin_new_day(self.route)
        refined_json = json.dumps(refined)
        recode_table = recoding.load_recode_dict(Settings.recoding_dict_route)
        recoded = recoding.replace_all(recode_table, refined_json)
        self.news = knlp.read_news(newsdata=recoded)
        read_finished = time.time()
        read_line = _report('os{} : {} - read_news() - {}',
                            read_started, read_finished)

        # Stage 2: extract nouns and flatten the nested result.
        nouns_started = time.time()
        self.nouns = knlp.get_KoNLP(
            self.news,
            Settings.konlp_class,
            Settings.konlp_function,
            userDict=Settings.user_dict_route,
        )
        self.nouns = list(itertools.chain.from_iterable(self.nouns))
        nouns_finished = time.time()
        nouns_line = _report('os{} : {} - Twitter.nouns() - {}',
                             nouns_started, nouns_finished)
        self.write_nouns(Settings.result_route + '_nouns')

        # Stage 3: count unique nouns into a two-column frame.
        count_started = time.time()
        tally = count.get_unique_count(self.nouns)
        columns = {
            'word': list(tally.keys()),
            'count': list(tally.values()),
        }
        self.counts = pandas.DataFrame(columns)
        # Sorting the column labels in descending order puts 'word' first.
        self.counts = self.counts.sort_index(axis=1, ascending=False)
        count_finished = time.time()
        count_line = _report('os{} : {} - count.get_unique_count() - {}',
                             count_started, count_finished)

        # Stage 4: apply the configured filter to the counts.
        filter_started = time.time()
        self.result = filter.filter_count(self.counts, Settings.filter_route)
        filter_finished = time.time()
        filter_line = _report('os{} : {} - filter.filter_count() - {}',
                              filter_started, filter_finished)

        self.process_log = '\n'.join(
            [read_line, nouns_line, count_line, filter_line])
        self.write_result(Settings.result_route)
Exemple #3
0
    def news_process(self):
        """Run the refine -> recode -> noun extraction -> count -> filter pipeline.

        Each stage is timed with ``time.time()``. Its debug line, built by
        ``self.get_news_debug_string``, is printed and collected. The
        collected lines are concatenated into ``self.process_log``. Nouns are
        written through ``self.write_nouns`` and the filtered result through
        ``self.write_csv``.
        """
        # Multiprocessing does not share memory, so reload Settings here.
        Settings.settings(settings_route)
        collected_logs = []

        def _record(message):
            # Build the debug line, echo it and keep it for the process log.
            line = self.get_news_debug_string(message)
            print(line)
            collected_logs.append(line)

        # Overall start marker.
        pipeline_started = time.time()
        _record('- start news process')

        # Refine the raw news and serialise it to JSON.
        refine_started = time.time()
        refined = refining.refin_new_day(self.route)
        refined_json = json.dumps(refined)
        refine_finished = time.time()
        _record('- refine - {}'.format(str(refine_finished - refine_started)))

        # Recode the refined text and load it as news.
        recoding_started = time.time()
        recode_table = recoding.load_recode_dict(Settings.recoding_dict_route)
        recoded = recoding.replace_all(recode_table, refined_json)
        self.news = knlp.read_news(newsdata=recoded)
        recoding_finished = time.time()
        _record('- recoding - {}'.format(
            str(recoding_finished - recoding_started)))

        # Extract nouns and flatten the nested result.
        konlp_started = time.time()
        self.nouns = knlp.get_KoNLP(
            self.news,
            Settings.konlp_class,
            Settings.konlp_function,
            userDict=Settings.user_dict_route,
        )
        self.nouns = list(itertools.chain.from_iterable(self.nouns))
        konlp_finished = time.time()
        _record('- cTwitter.nouns - {}'.format(
            str(konlp_finished - konlp_started)))

        self.write_nouns(Settings.result_route + '_nouns')

        # Count unique nouns into a two-column frame.
        count_started = time.time()
        tally = count.get_unique_count(self.nouns)
        columns = {
            'word': list(tally.keys()),
            'count': list(tally.values()),
        }
        self.counts = pandas.DataFrame(columns)
        # Sorting the column labels in descending order puts 'word' first.
        self.counts = self.counts.sort_index(axis=1, ascending=False)
        count_finished = time.time()
        _record('- count - {}'.format(str(count_finished - count_started)))

        # Apply the configured filter to the counts.
        filter_started = time.time()
        self.result = filter.filter_count(self.counts, Settings.filter_route)
        filter_finished = time.time()
        _record('- filter - {}'.format(str(filter_finished - filter_started)))

        # Overall end marker with the total elapsed time.
        pipeline_finished = time.time()
        _record('- end news process - {}'.format(
            str(pipeline_finished - pipeline_started)))

        # Persist the log text on the instance.
        self.process_log = ''.join(collected_logs)

        # Persist the filtered result.
        self.write_csv(self.result, Settings.result_route, modifier='result')