def crawl():
    Crawler.initialize_workbook()
    JiaoWuChu().crawl()
    # XueShengChu().crawl()
    # JiuYeChuangYe_TongZhi().crawl()
    # JiuYeChuangYe_JiuYe().crawl()
    # JiuYeChuangYe_ShiXi().crawl()
    # JiuYeChuangYe_ZhuanChangZhaoPinHui().crawl()
    # DangWei().crawl()
    Crawler.save_workbook()
def convert_date(date_str):
    try:
        date_str = Crawler.replace_white_space(date_str)
        time_format = "%Y-%m-%d%H:%M"
        date = datetime.datetime.strptime(date_str, time_format)
        return date
    except BaseException as e:
        print("Convert time error in TaiMeiTi. ErrMsg: %s" % str(e))
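# A minimal usage sketch (the sample timestamp string below is an assumption, not taken
# from TaiMeiTi): Crawler.replace_white_space strips all whitespace first, which is why
# the format string "%Y-%m-%d%H:%M" has no separator between the date and the time.
import datetime

date_str = "2018-12-03 10:30".replace(" ", "")  # -> "2018-12-0310:30"
print(datetime.datetime.strptime(date_str, "%Y-%m-%d%H:%M"))  # 2018-12-03 10:30:00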
def website_crawler():
    Crawler.initialize_workbook()
    chan_pin_jing_li.crawl()
    chuang_ye_bang.crawl()
    do_news.crawl()
    # gui_gu_mi_tan.crawl()
    hu_lian_wang_de_yi_xie_shi.crawl()
    hu_xiu.crawl()
    hua_er_jie_jian_wen.crawl()
    ji_ke_wang.crawl()
    jie_mian.crawl()
    jing_li_ren_fen_xiang.crawl()
    ju_shuo_she.crawl()
    # ke_ji_lie.crawl()
    kr.crawl()
    lie_yun_wang.crawl()
    mi_ke_wang.crawl()
    pin_wan.crawl()
    pin_tu.crawl()
    tai_mei_ti.crawl()
    xiao_bai_chuang_ye.crawl()
    she_hui_website.crawl()
    Crawler.save_workbook()
    # Crawler.is_article_dir_exists = 0  # reset the flag to 0 so the directories are recreated on the next run
    # send_mail(Crawler.write_file_path)

    # Pack the article directory and ship it to the result server.
    article_path_dir, article_target_dir = get_dir(Crawler.write_article_path)
    os.chdir(article_path_dir)
    os.system("tar -czvf result2.tar.gz %s" % article_target_dir)
    src = article_path_dir + "/result2.tar.gz"
    target = "/home/jfqiao/result/"
    transfer_file(src, target, host, user, password)
    os.system("rm -rf %s/result2.tar.gz" % article_path_dir)

    # Do the same for the downloaded images.
    image_path_dir, image_target_dir = get_dir(Crawler.write_image_path)
    os.chdir(image_path_dir)
    os.system("tar -czvf result3.tar.gz %s" % image_target_dir)
    src = image_path_dir + "/result3.tar.gz"
    transfer_file(src, target, host, user, password)
    os.system("rm -rf %s/result3.tar.gz" % image_path_dir)
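# transfer_file and get_dir are imported from elsewhere in the repository and are not shown
# in this section. A minimal sketch of what transfer_file could look like, assuming an SFTP
# upload via paramiko; only the signature is taken from the call above, the body is an assumption.
import os
import paramiko

def transfer_file(src, target, host, user, password):
    # Upload a local file into the remote target directory over SFTP (assumed behavior).
    transport = paramiko.Transport((host, 22))
    try:
        transport.connect(username=user, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        sftp.put(src, os.path.join(target, os.path.basename(src)))
    finally:
        transport.close()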
def crawl(self):
    try:
        page = 1
        while not YeJieZiXun.update_stop:
            resp = requests.get(url=self.page_url % page)
            if resp.status_code != 200:
                break
            bs_obj = BeautifulSoup(resp.content, "html.parser")
            articles_list = bs_obj.find("div", id="content").findAll(
                "div", attrs={"id": re.compile(r"post-\d+")})
            if len(articles_list) == 0:
                break
            for i in range(1, len(articles_list)):
                try:
                    article = articles_list[i]
                    href = article.find("h2").find("a")
                    title = href.get_text()
                    url = href.get("href")
                    select_result = self.select_url(url)
                    if select_result:  # check whether the link is already in the database
                        YeJieZiXun.update_stop = 1  # if so, we can stop right away
                        break
                    image_url = article.find("img").get("src")
                    rel_date = article.find("div", class_="entry-meta").get_text()
                    pos = rel_date.find(" ")
                    pos = rel_date.find(" ", pos + 1)
                    rel_date = Crawler.replace_white_space(rel_date[:pos])
                    # Publication time: articles from the past week use a relative time in days,
                    # today's articles use a relative time in hours or minutes,
                    # older articles use an absolute yyyy-mm-dd date.
                    date = self.convert_date(rel_date)
                    if date < self.target_date:  # keep only articles within the target time window
                        YeJieZiXun.update_stop = 1  # stop once an article predates the given time
                        break
                    date_str = date.strftime(Crawler.time_format)
                    self.get_article_content(url)
                    self.crawl_image_and_save(image_url)
                    self.write_data_to_sheet(title, url, image_url, date_str,
                                             date_str, self.label, self.origin)
                    self.insert_url(url)
                    print(url)
                except BaseException as e:
                    print("MiKeWang crawl error. ErrMsg: %s" % str(e))
            page += 1
    except BaseException as e:
        print("MiKeWang crawl error. ErrMsg: %s" % str(e))
    finally:
        YeJieZiXun.update_stop = 0  # reset to the starting state so later modules can crawl
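# The convert_date used by this crawler is not shown in this section. A minimal sketch of how
# the three cases described above (days ago / hours or minutes ago / absolute yyyy-mm-dd)
# could be parsed; the relative-time suffixes "天前", "小时前" and "分钟前" are assumptions.
import datetime
import re

def convert_relative_date(date_str):
    now = datetime.datetime.now()
    for pattern, unit in ((r"(\d+)天前", "days"),
                          (r"(\d+)小时前", "hours"),
                          (r"(\d+)分钟前", "minutes")):
        match = re.match(pattern, date_str)
        if match:
            return now - datetime.timedelta(**{unit: int(match.group(1))})
    # Anything else is treated as an absolute date.
    return datetime.datetime.strptime(date_str, "%Y-%m-%d")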
def convert_date(date_str): """ 发布的时间采用的是英文月份。 :param date_str: :return: """ try: if "Today" in date_str: date = datetime.datetime.now() else: date_str = "2018-" + Crawler.replace_white_space(date_str) time_format = "%Y-%B%d" date = datetime.datetime.strptime(date_str, time_format) return date except BaseException as e: print("Convert time error in VOX. ErrMsg: %s" % str(e))
from website_crawler import jie_mian
from website_crawler import jing_li_ren_fen_xiang
from website_crawler import ju_shuo_she
from website_crawler import ke_ji_lie
from website_crawler import kr
from website_crawler import lie_yun_wang
from website_crawler import mi_ke_wang
from website_crawler import pin_tu
from website_crawler import pin_wan
from website_crawler import tai_mei_ti
from website_crawler import xiao_bai_chuang_ye
from website_crawler.crawler import Crawler

if __name__ == "__main__":
    Crawler.initialize_workbook()
    chan_pin_jing_li.crawl()
    chuang_ye_bang.crawl()
    do_news.crawl()
    # gui_gu_mi_tan.crawl()
    hu_lian_wang_de_yi_xie_shi.crawl()
    hu_xiu.crawl()
    hua_er_jie_jian_wen.crawl()
    ji_ke_wang.crawl()
    jie_mian.crawl()
    jing_li_ren_fen_xiang.crawl()
    ju_shuo_she.crawl()
    # ke_ji_lie.crawl()
    kr.crawl()
    lie_yun_wang.crawl()
    mi_ke_wang.crawl()