def main(self):
    for url in self.url_list:
        new_url = url.format(page='')
        try:
            content = self.get_content(new_url)
        except Exception as e:
            logger.debug(traceback.format_exc())
            continue
        # Parse the max page number out of the inline JavaScript
        # ("var maxPage = N;") in the page source.
        content_list = content.split('\n')
        max_page = 0
        for c in content_list:
            if 'var maxPage = ' in c:
                start_index = c.find('=') + 1
                # Drop the trailing ';' before converting; the unsuffixed
                # first page is crawled separately below, hence the -1.
                max_page = int(c[start_index:-1].strip()) - 1
                break
        try:
            # Crawl the first (unnumbered) page with the formatted URL rather
            # than the raw template, which still contains a literal '{page}'.
            self.detail_spider(new_url)
        except Exception as e:
            logger.debug(traceback.format_exc())
            continue
        # Walk the numbered pages backwards until the stop flag is raised.
        while self.flag != 1 and max_page != 0:
            max_page_str = '_' + str(max_page)
            print url.format(page=max_page_str)
            try:
                self.detail_spider(url.format(page=max_page_str))
            except Exception as e:
                logger.debug(traceback.format_exc())
            # Decrement unconditionally so a page that keeps failing cannot
            # stall the loop forever.
            max_page -= 1
        self.flag = 0
    print self.article_data_list
    insert_news_to_mysql(self.article_data_list)
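# A regex is less brittle than slicing up to the trailing ';' when pulling
# `maxPage` out of the inline JavaScript. A minimal sketch of that approach;
# the helper name is illustrative, not part of the spider:
import re

def parse_max_page(content):
    # Matches e.g. "var maxPage = 12;" anywhere in the page source and
    # returns 12; falls back to 0 when the variable is absent.
    match = re.search(r'var maxPage\s*=\s*(\d+)', content)
    return int(match.group(1)) if match else 0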
def pic_main(self):
    for url in self.pic_url_list:
        page = 1
        while self.flag != 1:
            news_url = url.format(page=page)
            try:
                self.pic_detail_spider(news_url)
            except Exception as e:
                logger.debug(traceback.format_exc())
                print traceback.format_exc()
            # Advance even after a failure; a `continue` here would skip the
            # increment and retry the same page forever.
            page += 1
        self.flag = 0
    insert_news_to_mysql(self.article_data_list)
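# pic_main() above relies on pic_detail_spider() setting self.flag to 1 once
# it reaches articles older than self.start_timestamp; that is what ends the
# while loop. A minimal sketch of that contract; parse_items() and the item
# keys are assumptions for illustration, not the real parser:
def pic_detail_spider(self, news_url):
    content = self.get_content(news_url)
    for item in self.parse_items(content):  # hypothetical parsing helper
        if item['pub_timestamp'] < self.start_timestamp:
            # Everything from here on is too old; signal pic_main() to stop.
            self.flag = 1
            return
        self.article_data_list.append(item)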
def main(self):
    for url in self.url_list:
        page = 1
        while self.flag != 1:
            try:
                self.sina(url.format(page=page))
            except Exception as e:
                logger.debug(traceback.format_exc())
                print traceback.format_exc()
            page += 1
            # Only the first page is crawled for now.
            if page >= 2:
                break
        self.flag = 0
    insert_news_to_mysql(self.article_data_list)
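# The `if page >= 2: break` hard-codes a one-page crawl. If deeper paging is
# wanted later, lifting the cap into a parameter keeps it in one place. A
# sketch of the same loop under that change; max_pages is an assumption, not
# existing code:
def main(self, max_pages=1):
    for url in self.url_list:
        for page in range(1, max_pages + 1):
            if self.flag == 1:
                break
            try:
                self.sina(url.format(page=page))
            except Exception:
                logger.debug(traceback.format_exc())
        self.flag = 0
    insert_news_to_mysql(self.article_data_list)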
def pic_main(self):
    for url in self.pic_url_list:
        try:
            content = self.get_content(url)
        except Exception as e:
            logger.debug(traceback.format_exc())
            continue
        soup = BeautifulSoup(content)
        for data in soup.select("#item-list a"):
            tmp_dict = dict()
            news_url = data['href']
            try:
                news_body = self.get_content(news_url)
            except Exception as e:
                logger.debug(traceback.format_exc())
                continue
            news_soup = BeautifulSoup(news_body)
            title = get_tag_html(news_soup, '#contentE h2')
            pub_time = get_tag_html(news_soup, '[class~=timt]')
            pub_time = pub_time.replace(u'日期:', '').strip()
            pub_timestamp = string_transform_timestamp(pub_time + ' 00:00:00')
            # Stop once articles fall outside the crawl window.
            if pub_timestamp < self.start_timestamp:
                self.flag = 1
                break
            tmp_dict['title'] = title
            # Extract the article body.
            tmp_dict['article'] = get_tag_html(news_soup, '[class~=explain]')
            # Extract the images.
            img_list = list()
            for img in news_soup.select("#picPlayerTab img"):
                img_title = img['alt']
                img_url = img['src'].replace('st', '')
                # Re-upload each image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
            tmp_dict['img_list'] = img_list
            tmp_dict['source'] = news_url
            self.article_data_list.append(tmp_dict)
    insert_news_to_mysql(self.article_data_list)
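# upload_img_to_oss2() re-hosts a remote image and returns a
# (status, msg, new_url) tuple, which the loop above unpacks. A minimal
# sketch against the Aliyun oss2 SDK, assuming the credentials, endpoint,
# bucket name, and key scheme below are placeholders; the real helper lives
# elsewhere in the codebase:
import os
import oss2
import requests

def upload_img_to_oss2(img_url, bucket=None):
    try:
        resp = requests.get(img_url, timeout=10)
        resp.raise_for_status()
        if bucket is None:
            auth = oss2.Auth('<access-key-id>', '<access-key-secret>')
            bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou.aliyuncs.com',
                                 '<bucket-name>')
        key = 'news_img/' + os.path.basename(img_url)
        bucket.put_object(key, resp.content)
        new_url = 'http://<bucket-name>.oss-cn-hangzhou.aliyuncs.com/' + key
        return True, 'ok', new_url
    except Exception as e:
        # Report failure and hand back the original URL unchanged.
        return False, str(e), img_url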