Code Example #1
    def crawl_by_date(self, start=None, end=None, sleep_time=.87, send=False):
        format_ct = "%Y-%m-%d" + self.root_num
        for day_page, date in self.pages(cal_days(start, end, format_in="%Y%m%d", format_out=format_ct)):
            date = trans_date_format(date, format_ct, "%Y%m%d")
            file_path = self.file_root + self.news_name + '/' + date + '/'
            check_folder(file_path)

            meta_path = file_path + date + '.json'
            meta_old = check_meta(meta_path)

            # find all pages by following the "next" links
            page = day_page
            pages = []
            while page:
                page = self.next_page(page)
                if page:
                    pages.append(page)

            for category, article in self.articles(pages, meta_old):
                art = self.parse_article(category, article)
                try:
                    file_name = '%s_' % art['Date'] + str(art['Title'])
                except Exception as e:
                    self.log.exception(e)
                    file_name = 'UnknownFileName_%d' % time.time()
                self.save_article(file_name, art, meta_old, meta_path, send=send)

                time.sleep(sleep_time)
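
The date helpers above are defined elsewhere in the project. A minimal sketch of what cal_days and trans_date_format likely do, inferred only from the call sites (assumptions, not the actual NewsCrawlers source; the real code presumably also handles the start=None / end=None defaults):

from datetime import datetime, timedelta

def cal_days(start, end, format_in="%Y%m%d", format_out="%Y%m%d"):
    # yield every date string from start to end (inclusive),
    # rendered in format_out; start/end are strings in format_in
    day = datetime.strptime(start, format_in)
    last = datetime.strptime(end, format_in)
    while day <= last:
        yield day.strftime(format_out)
        day += timedelta(days=1)

def trans_date_format(date_str, format_in, format_out):
    # re-render a date string from one strftime format to another
    return datetime.strptime(date_str, format_in).strftime(format_out)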
Code Example #2
File: AppleCrawler.py  Project: DSLabXXX/NewsCrawlers
    def crawl_by_date(self, start=None, end=None, sleep_time=.17, send=False):
        for day_page, date in self.pages(
                cal_days(start, end, format_in="%Y%m%d", format_out="%Y%m%d")):
            file_path = self.file_root + self.news_name + '/' + date + '/'
            check_folder(file_path)

            meta_path = file_path + date + '.json'
            meta_old = check_meta(meta_path)

            for category, big_category, article in self.articles(
                    day_page, meta_old):
                art = self.parse_article(category, big_category, article,
                                         date)
                try:
                    file_name = '%s_' % art['Date'] + str(art['Title'])
                except Exception as e:
                    self.log.exception(e)
                    file_name = 'UnknownFileName_%d' % time.time()
                self.save_article(file_name,
                                  art,
                                  meta_old,
                                  meta_path,
                                  send=send)

                time.sleep(sleep_time)
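
check_folder and check_meta recur in every example on this page but are defined elsewhere in the project. A plausible sketch based solely on how they are used (an assumption, not the project's actual implementation):

import json
import os

def check_folder(path):
    # create the folder (and any missing parents) if it does not exist
    if not os.path.isdir(path):
        os.makedirs(path)

def check_meta(meta_path):
    # load the per-day metadata JSON; return an empty dict if absent
    if os.path.exists(meta_path):
        with open(meta_path) as rf:
            return json.load(rf)
    return {}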
Code Example #3
    def crawl_by_date(self, start=None, end=None, sleep_time=.87, send=False):

        for day_page, date in self.pages(cal_days(start, end)):
            file_path = self.file_root + self.news_name + '/' + date + '/'
            check_folder(file_path)

            meta_path = file_path + date + '.json'
            meta_old = check_meta(meta_path)

            for page in self.classes(day_page):
                # find all pages by following the "next" links
                next_page = page
                pages = [page]
                while next_page:
                    next_page = self.next_page(next_page)
                    if next_page:
                        pages.append(next_page)

                for category, article in self.articles(pages, meta_old):
                    art = self.parse_article(category, article)
                    try:
                        file_name = '%s_' % art['Date'] + str(art['Title'])
                    except Exception as e:
                        self.log.exception(e)
                        file_name = 'UnknownFileName_%d' % time.time()
                    self.save_article(file_name,
                                      art,
                                      meta_old,
                                      meta_path,
                                      send=send)

                    time.sleep(sleep_time)
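
Examples #1 and #3 share the same follow-the-"next"-link pagination idiom. Factored into a standalone helper for illustration (next_page stands in for the crawler's own method; this is a sketch, not project code):

def collect_pages(first_page, next_page):
    # return first_page plus every page reachable by repeatedly
    # calling next_page() until it returns a falsy value
    pages = [first_page]
    page = next_page(first_page)
    while page:
        pages.append(page)
        page = next_page(page)
    return pages

Note that example #3 seeds its list with the first page, while example #1 starts from an empty list and so never processes day_page itself; whether that is intentional depends on whether the day page is an index page or a content page.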
Code Example #4
File: Crawler.py  Project: DSLabXXX/NewsCrawlers
    def save_article(self, filename, data, meta_old, meta_path, send):
        # save a single article under the given file name
        try:
            # check folder
            file_path = os.path.join(self.file_root, self.news_name, data['Date'][0:8],
                                     data['BigCategory'], data['Category'])
            check_folder(file_path)

            # Apple Daily article URLs were changed but the old URLs still work,
            # so if the file already exists, skip it and return immediately
            filename_path = os.path.join(file_path, filename + '.json')
            if os.path.exists(filename_path):
                return 0

            with open(filename_path, 'w') as op:
                json.dump(data, op, indent=4, ensure_ascii=False)

            # if saving succeeded, push to Kafka
            # if send:
            #     send_json_kafka(json.dumps(data))

            # if nothing failed, write back to the metadata index
            meta_old.update({
                data['URL']: {'Title': data['Title'],
                              'Category': data['Category'],
                              'BigCategory': data['BigCategory']}
            })

            with open(meta_path, 'w') as wf:
                json.dump(meta_old, wf, indent=4, ensure_ascii=False)
            self.log.info('Finished crawling %s > %s > %s > %s' %
                          (data.get('Date'), data.get('BigCategory'),
                           data.get('Category'), data.get('Title')))

        except Exception as e:
            self.log.exception(e)
            self.log.error(u'Error during Check Folder or Save File\nfilename:{0}'.format(filename))
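
For reference, the per-day meta index that save_article maintains is a flat JSON object keyed by article URL. The shape below is illustrative (placeholder URL and values, matching the fields written above):

meta_old = {
    "https://example.com/article/123": {
        "Title": "...",
        "Category": "...",
        "BigCategory": "..."
    }
}

Since check_meta reloads this index on the next run, self.articles presumably uses meta_old to skip URLs that were already crawled.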
Code Example #5
File: Crawler.py  Project: DSLabXXX/NewsCrawlers
    def set_log_conf(self):
        # configure logging
        self.log.setLevel(logging.DEBUG)

        # the log file records DEBUG and above
        log_path = os.path.join(self.log_root, self.news_name)
        check_folder(log_path)
        log_name = time.strftime('%Y%m%d%H%M') + '_' + self.news_name + '.log'
        file_hdlr = logging.FileHandler(os.path.join(log_path, log_name))
        file_hdlr.setLevel(logging.DEBUG)

        # the console hides DEBUG (shows INFO and above)
        console_hdlr = logging.StreamHandler()
        console_hdlr.setLevel(logging.INFO)

        formatter = logging.Formatter('%(levelname)-8s - %(asctime)s - %(name)-12s - %(message)s')
        file_hdlr.setFormatter(formatter)
        console_hdlr.setFormatter(formatter)

        self.log.addHandler(file_hdlr)
        self.log.addHandler(console_hdlr)
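
A minimal usage sketch of the logging setup (the instantiation and the earlier creation of self.log via logging.getLogger are assumptions, not shown on this page):

crawler = Crawler()  # hypothetical instantiation
crawler.set_log_conf()
crawler.log.debug('reaches the log file only')
crawler.log.info('reaches both the log file and the console')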
Code Example #6
    def set_log_conf(self):
        # configure logging
        self.log.setLevel(logging.DEBUG)
        log_path = self.file_root + 'log/'
        check_folder(log_path)
        # the log file records DEBUG and above
        file_hdlr = logging.FileHandler(log_path +
                                        time.strftime('%Y%m%d%H%M') +
                                        '_PttGossiping.log')
        file_hdlr.setLevel(logging.DEBUG)

        # the console hides DEBUG (shows INFO and above)
        console_hdlr = logging.StreamHandler()
        console_hdlr.setLevel(logging.INFO)

        formatter = logging.Formatter(
            '%(levelname)-8s - %(asctime)s - %(name)-12s - %(message)s')
        file_hdlr.setFormatter(formatter)
        console_hdlr.setFormatter(formatter)

        self.log.addHandler(file_hdlr)
        self.log.addHandler(console_hdlr)