def convert_date(date_str):
    """
    Parse a timestamp of the form ``%Y-%m-%d%H:%M`` into a ``datetime``.

    The text is first passed through ``Crawler.replace_white_space``; the
    format has no separator between the day and the hour, so that helper is
    presumably expected to strip the space between them (confirm against
    ``Crawler``).

    :param date_str: raw timestamp text.
    :return: the parsed ``datetime.datetime``, or ``None`` when the text
        cannot be converted (the error is printed, not raised).
    """
    try:
        cleaned = Crawler.replace_white_space(date_str)
        return datetime.datetime.strptime(cleaned, "%Y-%m-%d%H:%M")
    except Exception as e:
        # Best-effort conversion: report and signal failure with None.
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        print("Convert time error in TaiMeiTi. ErrMsg: %s" % str(e))
        return None
def crawl(self):
    """
    Walk the paginated listing at ``self.page_url`` and store new articles.

    Pages are fetched one after another, starting at page 1, until one of
    the following happens:

    * ``YeJieZiXun.update_stop`` becomes truthy (an already-stored URL was
      met, or an article older than ``self.target_date`` was met);
    * a page answers with a non-200 status;
    * a page contains no ``div`` whose id matches ``post-<digits>``;
    * an unexpected error escapes the per-article handling.

    For every accepted article the content and image are fetched, a row is
    written to the sheet and the URL is recorded. Errors are printed rather
    than raised, and ``YeJieZiXun.update_stop`` is always reset to 0 on exit.

    :return: None
    """
    try:
        page = 1
        # Loop-invariant, so compiled once. Raw string: "\d" in a normal
        # string literal is an invalid escape sequence.
        post_id_pattern = re.compile(r"post-\d+")
        while not YeJieZiXun.update_stop:
            resp = requests.get(url=self.page_url % page)
            if resp.status_code != 200:
                break
            bs_obj = BeautifulSoup(resp.content, "html.parser")
            articles_list = bs_obj.find("div", id="content").findAll(
                "div", attrs={"id": post_id_pattern})
            if not articles_list:
                break
            # NOTE(review): the first matched element is skipped, exactly as
            # the original index loop (starting at 1) did - confirm that this
            # is intentional and not an off-by-one.
            for article in articles_list[1:]:
                try:
                    href = article.find("h2").find("a")
                    title = href.get_text()
                    url = href.get("href")
                    # Check whether the database already holds this link.
                    select_result = self.select_url(url)
                    if select_result:
                        # Already stored, so crawling can stop right away.
                        YeJieZiXun.update_stop = 1
                        break
                    image_url = article.find("img").get("src")
                    rel_date = article.find(
                        "div", class_="entry-meta").get_text()
                    # Keep the text up to the second space.
                    # NOTE(review): when there is no second space, find()
                    # returns -1 and the slice drops the last character.
                    pos = rel_date.find(" ")
                    pos = rel_date.find(" ", pos + 1)
                    rel_date = Crawler.replace_white_space(rel_date[:pos])
                    # Original note: the publish time is relative (days)
                    # within the last week, relative (hours|minutes) for
                    # today's articles, and absolute (yyyy-mm-dd) otherwise.
                    # Presumably convert_date copes with these forms.
                    date = self.convert_date(rel_date)
                    if date is None:
                        # Unparseable date: skip this article explicitly
                        # instead of relying on the comparison below raising.
                        continue
                    # Compare the publish time so that only articles inside
                    # the wanted period are kept.
                    if date < self.target_date:
                        # Published before the given time: stop the crawler.
                        YeJieZiXun.update_stop = 1
                        break
                    date_str = date.strftime(Crawler.time_format)
                    self.get_article_content(url)
                    self.crawl_image_and_save(image_url)
                    self.write_data_to_sheet(title, url, image_url, date_str,
                                             date_str, self.label,
                                             self.origin)
                    self.insert_url(url)
                    print(url)
                except Exception as e:
                    # One broken article must not abort the whole page.
                    print("MiKeWang crawl error. ErrMsg: %s" % str(e))
            page += 1
    except Exception as e:
        print("MiKeWang crawl error. ErrMsg: %s" % str(e))
    finally:
        # Reset to the initial state so that later crawls of other modules
        # start cleanly.
        YeJieZiXun.update_stop = 0
def convert_date(date_str, year=2018):
    """
    Convert a publish date written with an English month name.

    Text containing ``"Today"`` maps to the current local time. Any other
    text is cleaned with ``Crawler.replace_white_space``, prefixed with
    ``year`` and parsed with the format ``%Y-%B%d`` (for example
    ``"2018-October5"``).

    :param date_str: raw date text, which carries no year of its own.
    :param year: year to assume for the date; defaults to 2018, the value
        that used to be hard-coded.
    :return: a ``datetime.datetime``, or ``None`` when the text cannot be
        converted (the error is printed, not raised).
    """
    try:
        if "Today" in date_str:
            return datetime.datetime.now()
        cleaned = Crawler.replace_white_space(date_str)
        return datetime.datetime.strptime("%d-" % year + cleaned, "%Y-%B%d")
    except Exception as e:
        # Best-effort conversion: report and signal failure with None.
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        print("Convert time error in VOX. ErrMsg: %s" % str(e))
        return None