def pic_detail_spider(self, url):
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = list()
    now_year = str(datetime.now().year)
    for data in soup.select(".picList div"):
        # Listing dates wrapped in brackets omit the year; strip the wrapper
        # and prepend the current year before parsing.
        if '(' in data.span.string:
            date_time = now_year + '-' + data.span.string[2:-1].replace('/', '-') + ':00'
        else:
            date_time = data.span.string + ':00'
        pub_timestamp = string_transform_timestamp(date_time)
        # Stop once articles fall outside the crawl window.
        if pub_timestamp < self.start_timestamp:
            self.flag = 1
            break
        news_detail_list.append(data.p.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception as e:
            print traceback.format_exc()
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        title = get_tag_html(news_soup, 'h1')
        tmp_dict['title'] = title
        # Extract the article content from the embedded JS data lines.
        content_list = news_body.split('\n')
        artile_list = list()
        img_list = list()
        img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
        artile = ''
        for em in content_list:
            if '{title:' in em:
                em = em.replace("{title:'", "")
                em = em.replace("',", "")
                artile_list.append(em.strip())
            if 'big_img: ' in em:
                em = em.replace("big_img: '", "")
                em = em.replace("',", "")
                img_title = ''
                # Upload the image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(em.strip())
                if status:
                    artile += img_tag.format(img_url=img_url, img_title=img_title)
                    img_list.append([img_title, img_url])
        for a_content in set(artile_list):
            artile += a_content
        tmp_dict['artile'] = artile
        tmp_dict['img_list'] = img_list
        tmp_dict['source'] = news
        tmp_dict['pic_mode'] = 1
        self.article_data_list.append(tmp_dict)
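# string_transform_timestamp (imported from the project's utility module) is
# assumed to convert a "YYYY-MM-DD HH:MM:SS" string into a Unix timestamp so it
# can be compared with self.start_timestamp. A minimal standalone sketch of that
# assumption; it is not the project's actual implementation:
import time

def string_transform_timestamp(date_string, fmt='%Y-%m-%d %H:%M:%S'):
    # Parse the formatted string and return seconds since the epoch as a float.
    return time.mktime(time.strptime(date_string, fmt))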
def sina(self, url):
    """
    Crawl Sina Weibo data.
    :param url: URL to crawl data from
    """
    print url
    content = self.get_content(url)
    # Strip the JSONP callback wrapper so the payload can be parsed.
    content = content.replace('try{feedCardJsonpCallback(', '')
    content = content.replace(');}catch(e){};', '')
    content_dict = eval(content)
    data_list = content_dict['result']['data']
    for data in data_list:
        tmp_dict = dict()
        url = data['url'].replace('\\', '')
        ctime = float(data['ctime'])
        # Stop once articles fall outside the crawl window.
        if ctime < self.start_timestamp:
            self.flag = 1
            break
        tmp_dict['ctime'] = ctime
        tmp_dict['source'] = url
        try:
            data_content = self.get_content(url)
        except Exception as e:
            print traceback.format_exc()
            logger.debug(traceback.format_exc())
            continue
        soup = BeautifulSoup(data_content)
        title = get_tag_html(soup, '#main_title')
        tmp_dict['title'] = title.replace('\\', '')
        digest = get_tag_html(soup, '.ellipsis')
        tmp_dict['digest'] = digest
        img_list = list()
        # Extract images.
        img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
        artile = u''
        for img in soup.select("[class~=content] img"):
            img_title = img['alt']
            img_url = img['src']
            # Upload the image to Aliyun OSS.
            status, msg, img_url = upload_img_to_oss2(img_url)
            if status:
                img_list.append([img_title, img_url])
                artile += img_tag.format(img_url=img_url, img_title=img_title)
        # Extract the article content.
        for a in soup.select("[class~=content] p"):
            for string in a.strings:
                artile += u'<p>' + string.strip() + u'</p>'
        # Strip the Sina Entertainment byline and Weibo link markers.
        artile = artile.replace(u'新浪娱乐讯 ', '')
        artile = artile.replace(u'<p>[微博]</p>', '')
        tmp_dict['artile'] = artile
        tmp_dict['img_list'] = img_list
        tmp_dict['pic_mode'] = 0
        self.article_data_list.append(tmp_dict)
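# get_tag_html, used throughout these spiders, is assumed to pull the content of
# the first element matching a CSS selector and return it as a string (empty when
# nothing matches). A sketch under that assumption; whether the real helper
# returns plain text or inner HTML is not visible from this section:
def get_tag_html(soup, selector):
    # Return the stripped text of the first matching element, or u'' if none.
    tags = soup.select(selector)
    if not tags:
        return u''
    return tags[0].get_text().strip()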
def detail_spider(self, url):
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = list()
    for data in soup.select(".box_txt"):
        pub_timestamp = string_transform_timestamp(data.span.string + ':00')
        # Stop once articles fall outside the crawl window.
        if pub_timestamp < self.start_timestamp:
            self.flag = 1
            break
        news_detail_list.append(data.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception as e:
            print traceback.format_exc()
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        title = get_tag_html(news_soup, 'h1')
        tmp_dict['title'] = title
        # Extract the article content.
        artile = ''
        # Extract images.
        img_list = list()
        img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
        for data in news_soup.select("#main_content"):
            img_title = data.span.string if data.span.string else ''
            try:
                img_url = data.p.img['src']
            except Exception as e:
                print traceback.format_exc()
                logger.debug(traceback.format_exc())
                continue
            # Upload the image to Aliyun OSS.
            status, msg, img_url = upload_img_to_oss2(img_url)
            if status:
                img_list.append([img_title, img_url])
                artile += img_tag.format(img_url=img_url, img_title=img_title)
        for a in news_soup.select("#main_content p"):
            for string in a.strings:
                artile += u'<p>' + string.strip() + u'</p>'
        tmp_dict['artile'] = artile
        tmp_dict['img_list'] = img_list
        tmp_dict['source'] = news
        tmp_dict['pic_mode'] = 1
        self.article_data_list.append(tmp_dict)
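# upload_img_to_oss2 is assumed to download an image and re-upload it to Aliyun
# OSS, returning (status, message, new_url) as the callers above expect. A sketch
# under that assumption; the endpoint, bucket name and credentials below are
# placeholders, not values from the project:
import os
import requests
import oss2

_auth = oss2.Auth('<access-key-id>', '<access-key-secret>')
_bucket = oss2.Bucket(_auth, 'http://oss-cn-hangzhou.aliyuncs.com', '<bucket-name>')

def upload_img_to_oss2(img_url):
    try:
        # Download the original image, then push the bytes to OSS under its file name.
        resp = requests.get(img_url, timeout=10)
        resp.raise_for_status()
        key = os.path.basename(img_url.split('?')[0])
        _bucket.put_object(key, resp.content)
        new_url = 'http://<bucket-name>.oss-cn-hangzhou.aliyuncs.com/' + key
        return True, 'ok', new_url
    except Exception as e:
        return False, str(e), img_url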
def pic_main(self):
    for url in self.pic_url_list:
        try:
            content = self.get_content(url)
        except Exception as e:
            logger.debug(traceback.format_exc())
            continue
        soup = BeautifulSoup(content)
        for data in soup.select("#item-list a"):
            tmp_dict = dict()
            news_url = data['href']
            try:
                news_body = self.get_content(news_url)
            except Exception as e:
                logger.debug(traceback.format_exc())
                continue
            news_soup = BeautifulSoup(news_body)
            title = get_tag_html(news_soup, '#contentE h2')
            pub_time = get_tag_html(news_soup, '[class~=timt]')
            # Strip the "日期:" (date) label before parsing.
            pub_time = pub_time.replace(u'日期:', '').strip()
            pub_timestamp = string_transform_timestamp(pub_time + ' 00:00:00')
            # Stop once articles fall outside the crawl window.
            if pub_timestamp < self.start_timestamp:
                self.flag = 1
                break
            tmp_dict['title'] = title
            # Extract the article content.
            tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
            # Extract images.
            img_list = list()
            for img in news_soup.select("#picPlayerTab img"):
                img_title = img['alt']
                img_url = img['src'].replace('st', '')
                # Upload the image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
            tmp_dict['img_list'] = img_list
            tmp_dict['source'] = news_url
            self.article_data_list.append(tmp_dict)
    insert_news_to_mysql(self.article_data_list)
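# insert_news_to_mysql is assumed to persist the accumulated article dicts. A
# heavily hedged sketch: the table name, column names and connection settings
# below are guesses based on the dict keys built above, and whether the project
# uses MySQLdb or pymysql is unknown.
import json
import pymysql

def insert_news_to_mysql(article_data_list):
    conn = pymysql.connect(host='127.0.0.1', user='root', password='',
                           db='spider', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            sql = ('INSERT INTO news (title, artile, img_list, source, pic_mode) '
                   'VALUES (%s, %s, %s, %s, %s)')
            for item in article_data_list:
                cursor.execute(sql, (item.get('title'), item.get('artile'),
                                     json.dumps(item.get('img_list', [])),
                                     item.get('source'), item.get('pic_mode', 0)))
        conn.commit()
    finally:
        conn.close()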
def detail_spider(self, url):
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = list()
    now_year = str(datetime.now().year)
    for data in soup.select("[class~=f14list] li"):
        if data.span:
            # Listing dates omit the year; strip the wrapper characters and
            # prepend the current year before parsing.
            date_time = now_year + '-' + data.span.string[2:-1].replace('/', '-') + ':00'
            date_timestamp = string_transform_timestamp(date_time)
            # Stop once articles fall outside the crawl window.
            if date_timestamp < self.start_timestamp:
                self.flag = 1
                break
        news_detail_list.append(data.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception as e:
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        if 'pic' not in news:
            # Regular text article page.
            print news
            title = get_tag_html(news_soup, 'h1')
            tmp_dict['title'] = title
            # Extract images.
            img_list = list()
            img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
            artile = ''
            for img in news_soup.select("#contentText img"):
                img_title = img['alt']
                img_url = img['src']
                # Upload the image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
                    artile += img_tag.format(img_url=img_url, img_title=img_title)
            # Extract the article content.
            for a in news_soup.select("#contentText p"):
                for string in a.strings:
                    if '_tvId' not in string:
                        artile += u'<p>' + string.strip() + u'</p>'
            # Strip the Sohu Entertainment byline.
            artile = artile.replace(u'搜狐娱乐讯 ', '')
            tmp_dict['artile'] = artile
            tmp_dict['img_list'] = img_list
            tmp_dict['pic_mode'] = 0
        else:
            # Picture gallery page.
            title = get_tag_html(news_soup, '[class~=ttl]')
            tmp_dict['title'] = title
            # Extract the article content.
            tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
            # Extract images.
            img_list = list()
            for img in news_soup.select("#picPlayerTab img"):
                img_title = img.get('alt') if img.get('alt') else ''
                img_url = img['src'].replace('st', '')
                # Upload the image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
            tmp_dict['img_list'] = img_list
            tmp_dict['pic_mode'] = 1
        tmp_dict['source'] = news
        self.article_data_list.append(tmp_dict)
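# self.get_content is the spider's fetch method; it is assumed to return the
# response body for a URL and to raise on failure so callers can log and skip
# the item. A standalone sketch of that assumption using requests; the headers
# and timeout are illustrative, not taken from the project:
import requests

def get_content(url):
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    resp.raise_for_status()
    return resp.text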