def crawl_by_date(self, start=None, end=None, sleep_time=.87, send=False):
    """Crawl every article published in a date range, one day at a time.

    For each day the method prepares the day's output folder, loads the
    existing metadata file, collects the follow-up pages reachable from the
    day's landing page, then parses and saves every article found on them.

    Args:
        start: First day of the range, forwarded to ``cal_days`` with
            ``format_in="%Y%m%d"`` (presumably a ``YYYYMMDD`` string --
            confirm against ``cal_days``).
        end: Last day of the range, forwarded the same way as ``start``.
        sleep_time: Seconds to pause after each saved article.
        send: Forwarded unchanged to ``save_article``.
    """
    # The site addresses days as "%Y-%m-%d" followed by ``self.root_num``.
    site_format = "%Y-%m-%d" + self.root_num
    day_keys = cal_days(start, end, format_in="%Y%m%d", format_out=site_format)

    for day_page, date in self.pages(day_keys):
        # Storage paths use the compact YYYYMMDD form.
        date = trans_date_format(date, site_format, "%Y%m%d")

        day_dir = self.file_root + self.news_name + '/' + date + '/'
        check_folder(day_dir)
        meta_path = day_dir + date + '.json'
        meta_old = check_meta(meta_path)

        # Find all pages: follow the "next page" chain until it runs out.
        # Only the pages returned by next_page are collected; day_page itself
        # is not added to the list.
        pages = []
        cursor = day_page
        while cursor:
            cursor = self.next_page(cursor)
            if not cursor:
                break
            pages.append(cursor)

        for category, article in self.articles(pages, meta_old):
            art = self.parse_article(category, article)
            try:
                date_prefix = '%s_' % art['Date']
                file_name = date_prefix + str(art['Title'])
            except Exception as e:
                # Parsing gave nothing usable for a name: log it and fall
                # back to a timestamp-based file name.
                self.log.exception(e)
                file_name = 'UnkownFileName_%d' % time.time()

            self.save_article(file_name, art, meta_old, meta_path, send=send)
            time.sleep(sleep_time)
def crawl_by_date(self, start=None, end=None, sleep_time=.17, send=False):
    """Crawl every article published in a date range, one day at a time.

    For each day the method prepares the day's output folder, loads the
    existing metadata file, then parses and saves every article that
    ``self.articles`` yields for the day's page.

    Args:
        start: First day of the range, forwarded to ``cal_days`` with
            ``format_in="%Y%m%d"`` (presumably a ``YYYYMMDD`` string --
            confirm against ``cal_days``).
        end: Last day of the range, forwarded the same way as ``start``.
        sleep_time: Seconds to pause after each saved article.
        send: Forwarded unchanged to ``save_article``.
    """
    day_keys = cal_days(start, end, format_in="%Y%m%d", format_out="%Y%m%d")

    for day_page, date in self.pages(day_keys):
        day_dir = self.file_root + self.news_name + '/' + date + '/'
        check_folder(day_dir)
        meta_path = day_dir + date + '.json'
        meta_old = check_meta(meta_path)

        for category, big_category, article in self.articles(day_page, meta_old):
            art = self.parse_article(category, big_category, article, date)
            try:
                date_prefix = '%s_' % art['Date']
                file_name = date_prefix + str(art['Title'])
            except Exception as e:
                # Parsing gave nothing usable for a name: log it and fall
                # back to a timestamp-based file name.
                self.log.exception(e)
                file_name = 'UnkownFileName_%d' % time.time()

            self.save_article(file_name, art, meta_old, meta_path, send=send)
            time.sleep(sleep_time)
def crawl_by_date(self, start=None, end=None, sleep_time=.87, send=False):
    """Crawl every article published in a date range, one day at a time.

    For each day the method prepares the day's output folder and loads the
    existing metadata file. It then walks every page returned by
    ``self.classes`` for that day, follows each one's "next page" chain, and
    parses and saves every article found on those pages.

    Args:
        start: First day of the range, forwarded to ``cal_days``.
        end: Last day of the range, forwarded to ``cal_days``.
        sleep_time: Seconds to pause after each saved article.
        send: Forwarded unchanged to ``save_article``.
    """
    for day_page, date in self.pages(cal_days(start, end)):
        day_dir = self.file_root + self.news_name + '/' + date + '/'
        check_folder(day_dir)
        meta_path = day_dir + date + '.json'
        meta_old = check_meta(meta_path)

        for page in self.classes(day_page):
            # Find all pages: start with the page itself, then follow the
            # "next page" chain until it runs out.
            pages = [page]
            cursor = page
            while cursor:
                cursor = self.next_page(cursor)
                if not cursor:
                    break
                pages.append(cursor)

            for category, article in self.articles(pages, meta_old):
                art = self.parse_article(category, article)
                try:
                    date_prefix = '%s_' % art['Date']
                    file_name = date_prefix + str(art['Title'])
                except Exception as e:
                    # Parsing gave nothing usable for a name: log it and
                    # fall back to a timestamp-based file name.
                    self.log.exception(e)
                    file_name = 'UnkownFileName_%d' % time.time()

                self.save_article(file_name, art, meta_old, meta_path, send=send)
                time.sleep(sleep_time)
def save_article(self, filename, data, meta_old, meta_path, send):
    """Save a single article as JSON under the given file name.

    The article is written to
    ``<file_root>/<news_name>/<first 8 chars of Date>/<BigCategory>/<Category>/<filename>.json``.
    After a successful write, ``meta_old`` is updated in place with an entry
    keyed by the article URL and the whole mapping is rewritten to
    ``meta_path``.

    Any exception raised along the way is logged and swallowed, so a single
    bad article does not abort the crawl.

    Args:
        filename: Base file name (without the ``.json`` extension).
        data: Article mapping; reads ``Date``, ``BigCategory``, ``Category``,
            ``URL`` and ``Title``.
        meta_old: Metadata mapping for the day; mutated in place.
        meta_path: Path of the metadata JSON file to rewrite.
        send: Currently unused -- the Kafka forwarding step is disabled.

    Returns:
        ``0`` if the article file already exists (nothing is written),
        otherwise ``None``.
    """
    try:
        # Make sure the destination folder exists.
        target_dir = os.path.join(
            self.file_root,
            self.news_name,
            data['Date'][0:8],
            data['BigCategory'],
            data['Category'],
        )
        check_folder(target_dir)

        # Apple Daily article URLs may have been changed while the old URL
        # still works, so the same article can show up twice; if the file is
        # already there, skip saving and leave right away.
        article_path = os.path.join(target_dir, filename + '.json')
        if os.path.exists(article_path):
            return 0

        # NOTE(review): files are opened with the platform default encoding
        # while ensure_ascii=False emits raw non-ASCII text -- confirm the
        # reader side before pinning an explicit encoding.
        with open(article_path, 'w') as article_file:
            json.dump(data, article_file, indent=4, ensure_ascii=False)

        # (Forwarding the saved article to Kafka when ``send`` is set used to
        # happen here; that step is disabled.)

        # Everything succeeded so far: record the article in the metadata
        # and write the metadata back to disk.
        meta_entry = {
            'Title': data['Title'],
            'Category': data['Category'],
            'BigCategory': data['BigCategory'],
        }
        meta_old[data['URL']] = meta_entry
        with open(meta_path, 'w') as meta_file:
            json.dump(meta_old, meta_file, indent=4, ensure_ascii=False)

        summary = (
            data.get('Date'),
            data.get('BigCategory'),
            data.get('Category'),
            data.get('Title'),
        )
        self.log.info('已完成爬取 %s > %s > %s > %s' % summary)
    except Exception as e:
        self.log.exception(e)
        self.log.error(u'在 Check Folder or Save File 時出現錯誤\nfilename:{0}'.format(filename))
def set_log_conf(self):
    """Configure ``self.log`` with a file handler and a console handler.

    The logger level is set to DEBUG. A timestamped log file named
    ``<YYYYmmddHHMM>_<news_name>.log`` is created under
    ``<log_root>/<news_name>/`` and receives DEBUG and above; the console
    only receives INFO and above. Both handlers share one format.

    The log file is opened as UTF-8 instead of the platform default
    encoding, so non-ASCII log text is written the same way on every
    platform.

    Note: each call creates and attaches two new handlers, so calling this
    more than once on the same logger attaches additional handlers.
    """
    self.log.setLevel(logging.DEBUG)

    log_path = os.path.join(self.log_root, self.news_name)
    check_folder(log_path)
    log_name = time.strftime('%Y%m%d%H%M') + '_' + self.news_name + '.log'

    # The log file records DEBUG messages.
    file_hdlr = logging.FileHandler(os.path.join(log_path, log_name),
                                    encoding='utf-8')
    file_hdlr.setLevel(logging.DEBUG)

    # The command line does not show DEBUG messages.
    console_hdlr = logging.StreamHandler()
    console_hdlr.setLevel(logging.INFO)

    formatter = logging.Formatter('%(levelname)-8s - %(asctime)s - %(name)-12s - %(message)s')
    for hdlr in (file_hdlr, console_hdlr):
        hdlr.setFormatter(formatter)
    for hdlr in (file_hdlr, console_hdlr):
        self.log.addHandler(hdlr)
def set_log_conf(self):
    """Configure ``self.log`` with a file handler and a console handler.

    The logger level is set to DEBUG. A timestamped log file named
    ``<YYYYmmddHHMM>_PttGossiping.log`` is created under
    ``<file_root>log/`` and receives DEBUG and above; the console only
    receives INFO and above. Both handlers share one format.

    The log file is opened as UTF-8 instead of the platform default
    encoding, so non-ASCII log text is written the same way on every
    platform.

    Note: each call creates and attaches two new handlers, so calling this
    more than once on the same logger attaches additional handlers.
    """
    self.log.setLevel(logging.DEBUG)

    log_path = self.file_root + 'log/'
    check_folder(log_path)
    log_file = log_path + time.strftime('%Y%m%d%H%M') + '_PttGossiping.log'

    # The log file records DEBUG messages.
    file_hdlr = logging.FileHandler(log_file, encoding='utf-8')
    file_hdlr.setLevel(logging.DEBUG)

    # The command line does not show DEBUG messages.
    console_hdlr = logging.StreamHandler()
    console_hdlr.setLevel(logging.INFO)

    formatter = logging.Formatter(
        '%(levelname)-8s - %(asctime)s - %(name)-12s - %(message)s')
    for hdlr in (file_hdlr, console_hdlr):
        hdlr.setFormatter(formatter)
    for hdlr in (file_hdlr, console_hdlr):
        self.log.addHandler(hdlr)