class SinaSpider(scrapy.Spider):
    """Crawls the Sina roll-news list pages and the linked article pages.

    Detail pages come in (at least) two layouts, handled by parseDetail and
    parseDetail2. Each parsed article is returned as a ContentItem, and a
    JSON snapshot of the main fields is written to html/<md5-of-url>.json.
    """
    name = 'sina'
    # Base interval: scrapy sleeps a random time in
    # 0.5*download_delay .. 1.5*download_delay between requests.
    download_delay = 20
    # Let these statuses reach our callbacks instead of scrapy's error path,
    # so redirected / erroring pages are not silently dropped.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        # BUGFIX: the caller-supplied name used to be discarded (super() was
        # always called with name=None); forward it instead.
        super(SinaSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.request_stop = False   # True while requests must pause (ban / network issue)
        self.request_stop_time = 0  # timestamp of the last stop signal
        self.logDao = LogDao(self.logger, 'sina_detail')

    def start_requests(self):
        # while True:  (kept from a previous looping design)
        # Network check.
        if not NetworkUtil.checkNetWork():
            # re-check every 20s
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
            # continue
        # Server check.
        if not NetworkUtil.checkService():
            # re-check every 20s
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
            # continue
        if self.request_stop:
            # Redialing takes effect after an unpredictable delay, so wait a
            # while before retrying.
            timeSpace = time.time() - self.request_stop_time
            if timeSpace / 60 <= 2:
                # Less than 2 minutes since the stop signal: do not request.
                # continue
                pass
            else:
                self.request_stop = False

        # Start crawling the list pages.
        url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=30&spec=&type=&ch=05&k' \
              '=&offset_page=0&offset_num=0&num=60&asc=&page='
        for page in range(0, 11):
            if self.request_stop:
                # The IP got banned or the network failed: stop all requests
                # and wait for an IP change.
                self.logDao.warn(u'出现被绊或者出现网络异常,退出循环')
                break
            r = random.uniform(0, 1)
            newUrl = url + str(page)
            newUrl += ('&r=' + str(r))
            self.logDao.info(u"开始抓取列表:" + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={'url': newUrl},
                                 callback=self.parseList)
            # Idle 2 seconds between list requests.
            TimerUtil.sleep(2)

        if self.request_stop:
            # Signal that a redial is needed; a retry happens ~2 minutes later.
            self.logDao.warn(u'发送重新拨号信号,请等待2分钟会尝试重新抓取')
            self.request_stop_time = time.time()
        else:
            # A full round was issued; idle 10 minutes so that still-pending
            # requests can finish.
            self.logDao.info(u'请求了一轮了,但是可能还有没有请求完成,睡一会10分钟')
            TimerUtil.sleep(10 * 60)

    # TODO: a banned/blocked response has not been observed yet.
    def parseList(self, response):
        """Parse a JSONP roll-news list and schedule one detail request per article."""
        url = response.meta['url']
        data = response.body.decode('gbk')
        # BUGFIX: str.lstrip() strips a *character set*, not a prefix -- it
        # only worked here by accident. Remove the literal JSONP prefix.
        prefix = 'var jsonData = '
        if data.startswith(prefix):
            data = data[len(prefix):]
        data = data.rstrip(';')
        # Tolerant JSON parse (the payload is not strict JSON).
        data = demjson.decode(data) or {}
        items = data.get('list') or []  # renamed: "list" shadowed the builtin
        self.logDao.info(u"解析列表:" + url)
        for item in items:
            contentItem = ContentItem()
            channel = item['channel'] or {}
            # BUGFIX: channel may be the {} fallback above; use .get() instead
            # of channel['title'] to avoid a KeyError.
            channel_name = channel.get('title')
            contentItem['channel_name'] = channel_name
            contentItem['title'] = item['title']
            contentItem['source_url'] = item['url']
            # Two known detail layouts: the /zl/ column uses a different page.
            if 'http://tech.sina.com.cn/zl/' in item['url']:
                callback = self.parseDetail2
            else:
                callback = self.parseDetail
            self.logDao.info(u"开始抓取文章:" + item['url'])
            yield scrapy.Request(url=item['url'],
                                 meta={
                                     'contentItem': contentItem,
                                     'source_url': item['url']
                                 },
                                 callback=callback)

    def parseDetail2(self, response):
        """Parse a /zl/ style article page into the ContentItem from the meta."""
        source_url = response.meta['source_url']
        contentItem = response.meta['contentItem']
        selector = Selector(text=response.body)
        pl_main_content = selector.xpath(
            '//*[@id="J_Article_Wrap"]').extract_first()
        title = selector.xpath(
            '//*[@id="artibodyTitle"]/text()').extract_first() or ''
        post_date = selector.xpath(
            '//*[@id="pub_date"]/text()').extract_first() or ''
        post_date = post_date.replace('\r\n', '').strip(' ')
        post_user = selector.xpath(
            '//*[@id="media_name"]/a[1]/text()').extract_first()
        tags = selector.xpath(
            '//p[@class="art_keywords"]/a/text()').extract() or []
        tags = ','.join(tags)

        # Snapshot of the main fields, stored under the url's md5.
        main = {
            'title': title,
            'post_date': post_date,
            'post_user': post_user,
            'pl_main_content': pl_main_content,
            'tags': tags
        }
        m2 = hashlib.md5()
        m2.update(source_url.encode('utf8'))
        urlHash = m2.hexdigest()
        self.saveFile(urlHash, json.dumps(main, encoding="utf8", ensure_ascii=False))

        contentChilds = selector.xpath('//*[@id="artibody"]/child::*').extract()
        contents = []
        image_urls = []
        for child in contentChilds:
            image_url = ''
            content = ''
            node_type = ''  # renamed: "type" shadowed the builtin
            image_hash = ''
            curSelector = Selector(text=child)
            # Sample pages that exercise these branches:
            # http://tech.sina.com.cn/d/2017-06-28/doc-ifyhmtrw4294617.shtml
            # http://tech.sina.com.cn/zl/post/detail/i/2017-06-28/pid_8511506.htm
            if 'img_wrapper' in child or 'img' in child:
                # Image block (some pages have no img_wrapper, only img):
                # record caption + url, hash the url for a local file name.
                node_type = 'img'
                image_url = curSelector.xpath('//img/@src').extract_first()
                content = curSelector.xpath('//span/text()').extract_first()
                # image_url = image_url[0] if image_url and len(image_url) else ''
                m2 = hashlib.md5()
                # BUGFIX: extract_first() may return None; hash '' instead of
                # crashing with a TypeError.
                m2.update(image_url or '')
                image_hash = m2.hexdigest()
                image_urls.append({'url': image_url, 'hash': image_hash})
            elif 'strong' in child:
                # Heading block.
                node_type = 'title'
                content = curSelector.xpath('//strong/text()').extract_first()
            elif 'gb2312, simkai;' in child:
                # Short-description block.
                node_type = 'shortInfo'
                content = curSelector.xpath('//span/text()').extract_first()
            elif '"pictext" align="center"' in child:
                # Centered caption block.
                node_type = 'centerContent'
                content = curSelector.xpath('//p/text()').extract_first()
            else:
                # Plain paragraph.
                node_type = 'normalContent'
                content = curSelector.xpath('//p').xpath('string(.)').extract()
            contents.append({
                'type': node_type,
                'image_url': image_url,
                'content': content,
                'image_hash': image_hash
            })

        contentItem['title'] = title
        contentItem['post_date'] = post_date
        contentItem['post_user'] = post_user
        contentItem['image_urls'] = image_urls
        contentItem['page_content'] = contents
        contentItem['tags'] = tags
        return contentItem

    def parseDetail(self, response):
        """Parse a regular article page into the ContentItem from the meta."""
        source_url = response.meta['source_url']
        contentItem = response.meta['contentItem']
        selector = Selector(text=response.body)
        pl_main_content = selector.xpath(
            '//*[@id="pl_main_content"]').extract_first()
        title = selector.xpath(
            '//*[@id="main_title"]/text()').extract_first() or ''
        post_date = selector.xpath(
            '//*[@id="page-tools"]/span/span[1]/text()').extract_first() or ''
        post_user = selector.xpath(
            '//*[@id="page-tools"]/span/span[2]/text() | //*[@id="page-tools"]/span/span[2]/a/text()'
        ).extract()
        tags = selector.xpath(
            '//p[@class="art_keywords"]/a/text()').extract() or []
        tags = ','.join(tags)
        if len(post_user):
            post_user = ''.join(post_user)
        else:
            post_user = ''

        # Snapshot of the main fields, stored under the url's md5.
        main = {
            'title': title,
            'post_date': post_date,
            'post_user': post_user,
            'pl_main_content': pl_main_content,
            'tags': tags
        }
        m2 = hashlib.md5()
        m2.update(source_url.encode('utf8'))
        urlHash = m2.hexdigest()
        self.saveFile(urlHash, json.dumps(main, encoding="utf8", ensure_ascii=False))

        contentChilds = selector.xpath('//*[@id="artibody"]/child::*').extract()
        contents = []
        image_urls = []
        for child in contentChilds:
            image_url = ''
            content = ''
            node_type = ''  # renamed: "type" shadowed the builtin
            image_hash = ''
            curSelector = Selector(text=child)
            # Sample pages that exercise these branches:
            # http://tech.sina.com.cn/d/2017-06-28/doc-ifyhmtrw4294617.shtml
            # http://tech.sina.com.cn/zl/post/detail/i/2017-06-28/pid_8511506.htm
            if 'img_wrapper' in child or 'img' in child:
                # Image block (some pages have no img_wrapper, only img).
                node_type = 'img'
                image_url = curSelector.xpath('//img/@src').extract_first()
                content = curSelector.xpath('//span/text()').extract_first()
                # image_url = image_url[0] if image_url and len(image_url) else ''
                m2 = hashlib.md5()
                # BUGFIX: extract_first() may return None; hash '' instead of
                # crashing with a TypeError.
                m2.update(image_url or '')
                image_hash = m2.hexdigest()
                image_urls.append({'url': image_url, 'hash': image_hash})
            elif 'strong' in child:
                # Heading block.
                node_type = 'title'
                content = curSelector.xpath('//strong/text()').extract_first()
            elif 'font-family: KaiTi_GB2312, KaiTi;' in child:
                # Short-description block.
                node_type = 'shortInfo'
                content = curSelector.xpath('//span/text()').extract_first()
            elif '"pictext" align="center"' in child:
                # Centered caption block.
                node_type = 'centerContent'
                content = curSelector.xpath('//p/text()').extract_first()
            else:
                # Plain paragraph.
                node_type = 'normalContent'
                content = curSelector.xpath('//p').xpath('string(.)').extract()
            contents.append({
                'type': node_type,
                'image_url': image_url,
                'content': content,
                'image_hash': image_hash
            })

        contentItem['title'] = title
        contentItem['post_date'] = post_date
        contentItem['post_user'] = post_user
        contentItem['image_urls'] = image_urls
        contentItem['page_content'] = contents
        contentItem['tags'] = tags
        return contentItem

    def saveFile(self, title, content):
        """Write `content` (unicode) as utf8 to html/<title>.json."""
        filename = 'html/%s.json' % title
        with open(filename, 'wb') as f:
            f.write(content.encode('utf8'))
        self.log('Saved file %s' % filename)
class WXSourceSpider(scrapy.Spider):
    """Looks up WeChat public accounts on sogou's WeChat search and updates
    each source row (name / profile url / status) through WxSourceDao."""
    name = 'wx_source'
    # Base interval: scrapy sleeps a random time in
    # 0.5*download_delay .. 1.5*download_delay between requests.
    download_delay = 20
    # Let these statuses reach our callbacks instead of scrapy's error path.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        # BUGFIX: forward the caller-supplied name (was hard-coded to None).
        super(WXSourceSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.wxSourceDao = WxSourceDao()
        self.currIp = ''
        self.logDao = LogDao(self.logger, 'weixin_source_catch')
        self.dataMonitor = DataMonitorDao()

    def close(spider, reason):
        # Clear the "running" marker and refresh the monitoring total.
        spider.saveStatus('stop')
        spider.dataMonitor.updateTotal('weixin_source_total')

    def start_requests(self):
        # Skip this run if a previous crawl is still marked as running.
        status = self.getStatus()
        if status == 'running':
            return
        self.saveStatus('running')
        # Block until the network is reachable (re-check every 20s).
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
        # Block until our server is reachable (re-check every 20s).
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        # Fetch the enabled sources (random order): rows that failed to
        # update, or whose last update is old enough.
        sources = self.wxSourceDao.queryEnable(isRandom=True)
        for source in sources:
            # Mark the row "updating"; parseList later sets it to
            # updateFail / last / none depending on the result.
            (wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable,
             update_time) = source
            self.wxSourceDao.updateStatus(wx_account, 'updating')
            # Query sogou's WeChat account search for this account id.
            url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query='
            newUrl = url + wx_account
            self.logDao.warn(u'进行抓取:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'weixin_source',
                                     'url': newUrl,
                                     'wx_account': wx_account,
                                     'source': source
                                 },
                                 callback=self.parseList,
                                 dont_filter=True)

    def parseList(self, response):
        """Parse a sogou search result page and update the matching account."""
        source = response.meta['source']
        wx_account = response.meta['wx_account']
        url = response.meta['url']
        body = EncodeUtil.toUnicode(response.body)
        # A 302 means sogou blocked us (needs router restart / cookie reset):
        # flag the row, redial for a new IP and idle 30s.
        if response.status == 302:
            self.logDao.warn(u'您的访问过于频繁,重新拨号')
            self.wxSourceDao.updateStatus(wx_account, 'updateFail')
            NetworkUtil.getNewIp()
            TimerUtil.sleep(30)
        else:
            self.logDao.info(u'开始解析:' + wx_account)
            selector = Selector(text=body)
            results = selector.xpath('//*[@id="main"]/div[4]/ul/li')
            self.logDao.info(u'列表长度:' + str(len(results)))
            hasCatch = False
            for result in results:
                # NOTE(review): these '//' xpaths search the whole document,
                # not just `result`, so every iteration may read the same
                # first match. './/' was probably intended -- confirm against
                # a live result page before changing.
                wx_name = result.xpath(
                    '//*[@id="sogou_vr_11002301_box_0"]/div/div[2]/p[1]/a/text()'
                ).extract_first()
                wx_account_ = result.xpath(
                    '//p[@class="info"]/label/text()').extract_first()
                wx_url = result.xpath(
                    '//p[@class="tit"]/a/@href').extract_first()
                if wx_account_ == wx_account:
                    self.logDao.info(u'成功抓取:' + wx_account_)
                    self.wxSourceDao.updateSource(wx_account, wx_name, wx_url,
                                                  'last')
                    hasCatch = True
                    break
            if not hasCatch:
                # BUGFIX: this used to log wx_account_, which is None (or
                # unbound when the result list is empty) exactly in this
                # branch; log the account we searched for instead.
                self.logDao.info(u'没有抓到:' + wx_account)
                self.wxSourceDao.updateStatus(wx_account, 'none')

    def getStatus(self):
        """Read the crawl status from catchStatus.json."""
        # BUGFIX: the with-statement already closes the file; the old
        # try/finally re-close raised NameError whenever open() itself failed
        # (load_f was never bound).
        with open("catchStatus.json", 'r') as load_f:
            aa = json.load(load_f)
            return aa.get('status')

    def saveStatus(self, status):
        """Persist the crawl status to catchStatus.json."""
        # Same cleanup as getStatus: `with` handles closing.
        with open("catchStatus.json", "w") as f:
            json.dump({'status': status}, f)
class DetailSpider(scrapy.Spider):
    """Crawls the qq.com tech scroll-news list and article pages, emitting
    ContentItem objects (plain text, re-wrapped html, inlined css, images)."""
    name = 'demoName_detail'
    # Base interval: scrapy sleeps a random time in
    # 0.5*download_delay .. 1.5*download_delay between requests.
    download_delay = 2.5
    # Let these statuses reach our callbacks instead of scrapy's error path.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        # BUGFIX: forward the caller-supplied name (was hard-coded to None).
        super(DetailSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'demoName_list_detail')
        self.checkDao = CheckDao()
        # css cache: style-url md5 -> downloaded stylesheet text
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        self.isRunningStop = False

    def close(spider, reason):
        # If this run bailed out because another crawl was still running, do
        # not clear that other crawl's "running" marker.
        if not spider.isRunningStop:
            spider.saveStatus('stop')
            # spider.dataMonitor.updateTotal('demoName_total')
            pass

    def start_requests(self):
        # Skip this run if a previous crawl is still marked as running.
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')
        # Block until the network is reachable (re-check every 20s).
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
        # Block until our server is reachable (re-check every 20s).
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        # Fetch the scroll-news list page.
        newUrl = 'http://tech.qq.com/l/scroll.htm'
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'demoName_page_list',
                                 'url': newUrl
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)

    # TODO: a banned/blocked response has not been observed yet.
    def parseArticleList(self, response):
        """Parse the scroll list and schedule one request per new article."""
        body = EncodeUtil.toUnicode(response.body)
        if False:  # placeholder for a future ban check
            self.logDao.info(u'访问过多被禁止')
        else:
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=body)
            articles = selector.xpath('//div[@class="mod newslist"]//li')
            for article in articles:
                source_url = article.xpath('a/@href').extract_first('')
                title = article.xpath('a/text()').extract_first('')
                post_date = article.xpath('span/text()').extract_first('')
                # The list only shows month-day; prepend the current year.
                post_date = time.strftime('%Y', time.localtime(
                    time.time())) + u'年' + post_date
                if not source_url:
                    continue
                # Skip articles that are already stored.
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + post_date +
                                     ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' +
                                 source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'demoName_detail',
                                         "title": title,
                                         'post_date': post_date,
                                         "source_url": source_url
                                     },
                                     callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse one qq.com article page into a ContentItem."""
        body = EncodeUtil.toUnicode(response.body)
        if False:  # placeholder for a future ban check
            self.logDao.info(u'访问过多被禁止')
        else:
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)
            selector = Selector(text=body)
            # Collect the page's stylesheets; each is downloaded once and
            # cached in self.css keyed by the url's md5.
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')
            # Neutralize urls inside the css.
            styles = CssUtil.clearUrl(styles)

            category = selector.xpath(
                '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
            ).extract_first('')
            post_user = selector.xpath(
                '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
            ).extract_first('')
            src_ref = selector.xpath(
                '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
            ).extract_first('')
            a_time = selector.xpath(
                '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
            ).extract_first('')
            # Prefer the on-page timestamp; fall back to the list page's date.
            if a_time:
                post_date = a_time
            else:
                post_date = (post_date or '')
            post_date = post_date.replace(u'年', '-').replace(
                u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                # Keep the raw string when it does not match the format.
                pass

            content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Drop unwanted child tags.
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(@class="rv-root-v2 rv-js-root"))]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Plain-text version of the article.
            content_txt = []
            for item in content_items:
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Re-wrap the kept children in a fresh container tag.
            outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)
            content_html = outHtml.replace('${++content++}', content_items)
            selector = Selector(text=content_html)

            # Collect absolute image urls; protocol-relative src values are
            # also rewritten to http inside the html.
            image_urls = []
            imgs = selector.xpath('descendant::img')
            for img in imgs:
                # NOTE(review): pages may carry the url in @data-src as well,
                # but only @src is read here -- confirm whether lazy-loaded
                # images need handling.
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Content fingerprint used for change detection.
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip img alt/title texts out of the html.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 3
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '腾讯科技'
            contentItem['src_ref'] = src_ref
            return contentItem

    def saveFile(self, title, content):
        # TODO: saving raw pages for reuse did not pay off; disabled for now.
        return
        # filename = 'html/%s.html' % title
        # with open(filename, 'wb') as f:
        #     f.write(content.encode("utf8"))
        # self.log('Saved file %s' % filename)

    def getStatus(self):
        """Read the crawl status from catchStatus.json."""
        # BUGFIX/cleanup: the with-statement already closes the file; the old
        # loadF=None + try/finally double-close was redundant.
        with open("catchStatus.json", 'r') as loadF:
            aa = json.load(loadF)
            return aa.get('status')

    def saveStatus(self, status):
        """Persist the crawl status to catchStatus.json."""
        # Same cleanup as getStatus: `with` handles closing.
        with open("catchStatus.json", "w") as loadF:
            json.dump({'status': status}, loadF)
class SinaSpider(scrapy.Spider):
    """Crawls the Sina tech roll/feed lists and article pages ('sina2'
    variant), emitting ContentItem objects with inlined css and image urls."""
    name = 'sina2'
    # Base interval: scrapy sleeps a random time in
    # 0.5*download_delay .. 1.5*download_delay between requests.
    download_delay = 2.5
    # Let these statuses reach our callbacks instead of scrapy's error path.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]
    # Observed retry logs (kept for reference): "TCP connection timed out:
    # 10060" and "User timeout caused connection failure: ... took longer
    # than 180.0 seconds" on tech.sina.com.cn detail pages.

    def __init__(self, name=None, **kwargs):
        # BUGFIX: forward the caller-supplied name (was hard-coded to None).
        super(SinaSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.logDao = LogDao(self.logger, 'sina_detail')
        self.checkDao = CheckDao()
        # css cache: style-url md5 -> downloaded stylesheet text
        self.css = {
            'hash': 'style'
        }
        self.dataMonitor = DataMonitorDao()
        self.logger.info(u'重走init')

    def close(spider, reason):
        # Refresh the monitoring total when the spider shuts down.
        spider.dataMonitor.updateTotal('sina_total')

    def start_requests(self):
        # while True:
        # Block until the network is reachable (re-check every 20s).
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
            # continue
        # Block until our server is reachable (re-check every 20s).
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
            # continue
        # Main roll-news list (JSONP endpoint).
        url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1'
        # url = 'http://tech.sina.com.cn/t/2017-07-24/doc-ifyihrit1274195.shtml'
        r = random.uniform(0, 1)  # cache-buster
        newUrl = url + ('&r=' + str(r))
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={'request_type': 'sina_list', 'url': newUrl},
                             callback=self.parseList)
        # Gap-filling second list (feed API).
        url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254'
        r = random.uniform(0, 1)
        newUrl = url + ('&r=' + str(r))
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={'request_type': 'sina_list', 'url': newUrl},
                             callback=self.parseList2)

    # TODO: a banned/blocked response has not been observed yet.
    def parseList2(self, response):
        """Parse the feed-API list and schedule detail requests."""
        data = EncodeUtil.toUnicode(response.body)
        if False:  # placeholder for a future ban check
            self.logDao.info(u'访问过多被禁止')
        else:
            url = response.meta['url']
            self.logDao.info(u'开始解析列表' + url)
            # Tolerant JSON parse.
            data = demjson.decode(data) or {}
            result = data.get('result', {})
            items = result.get('data', [])  # renamed: "list" shadowed the builtin
            for item in items:
                channel_name = u'科技'
                title = item.get('title', '')
                source_url = item.get('url', '')
                callback = self.parseDetail2
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在:' + title + source_url)
                    continue
                self.logDao.info(u"开始抓取文章:" + source_url)
                # item['url'] = "http://tech.sina.com.cn/d/f/2017-07-31/doc-ifyinvwu3872514.shtml"
                yield scrapy.Request(url=item['url'],
                                     meta={'request_type': 'sina_detail',
                                           'category': channel_name,
                                           'title': title,
                                           'source_url': source_url},
                                     callback=callback)

    # TODO: a banned/blocked response has not been observed yet.
    def parseList(self, response):
        """Parse the JSONP roll-news list and schedule detail requests."""
        data = EncodeUtil.toUnicode(response.body)
        if False:  # placeholder for a future ban check
            self.logDao.info(u'访问过多被禁止')
        else:
            url = response.meta['url']
            self.logDao.info(u'开始解析列表' + url)
            # BUGFIX: str.lstrip() strips a *character set*, not a prefix --
            # it only worked here by accident. Remove the literal prefix.
            prefix = 'var jsonData = '
            if data.startswith(prefix):
                data = data[len(prefix):]
            data = data.rstrip(';')
            # Tolerant JSON parse.
            data = demjson.decode(data) or {}
            items = data.get('list', [])  # renamed: "list" shadowed the builtin
            for item in items:
                channel = item.get('channel', {})
                channel_name = channel.get('title', '')
                title = item.get('title', '')
                source_url = item.get('url', '')
                callback = self.parseDetail2
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在:' + title + source_url)
                    continue
                self.logDao.info(u"开始抓取文章:" + item['url'])
                # item['url'] = "http://tech.sina.com.cn/d/f/2017-07-31/doc-ifyinvwu3872514.shtml"
                yield scrapy.Request(url=item['url'],
                                     meta={'request_type': 'sina_detail',
                                           'category': channel_name,
                                           'title': title,
                                           'source_url': source_url},
                                     callback=callback)

    def parseDetail2(self, response):
        """Parse one Sina article page into a ContentItem."""
        body = EncodeUtil.toUnicode(response.body)
        if False:  # placeholder for a future ban check
            self.logDao.info(u'访问过多被禁止')
        else:
            category = response.meta['category']
            title = response.meta['title']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + category + ':' + source_url)
            selector = Selector(text=body)
            # Collect the page's stylesheets (downloaded once, cached by hash).
            # NOTE(review): unlike the qq spider, protocol-relative '//' style
            # urls are NOT prefixed with 'http:' here -- confirm whether Sina
            # serves any.
            styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
            styles = CssUtil.clearUrl(styles)
            # Hidden-overflow rules would clip the re-rendered article.
            styles = styles.replace('overflow-x:hidden', '').replace('overflow:hidden', '')

            post_date = selector.xpath('//*[@id="pub_date"]/text() | //*[@class="titer"]/text()').extract_first('')
            post_date = post_date.replace('\r\n', '').strip(' ').replace(u'年', '-').replace(u'月', '-').replace(u'日', ' ')
            try:
                post_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                # Keep the raw string when it does not match the format.
                pass

            src_ref = selector.xpath(
                '//*[@id="media_name"]/a[1]/text() | //*[@class="source"]/a/text() | //*[@class="source"]/text()').extract_first(
                '')
            post_user = selector.xpath('//*[@id="author_ename"]/a/text()').extract_first('')
            tags = selector.xpath('//p[@class="art_keywords"]/a/text()').extract() or []
            tags = ','.join(tags)

            content_html = selector.xpath('//*[@id="artibody"][1]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Drop unwanted child tags. 2017-07-24 19:23
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath('*[not(boolean(@class="entHdPic")) and not(name(.)="script")]')
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Plain-text version; short "来源:" lines are treated as the real
            # source attribution.
            content_txt = []
            for item in content_items:
                # TODO: later, also classify heading paragraphs here.
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                if u'来源:' in allTxt and len(allTxt) < 25:
                    # Genuine source line: promote the old src_ref to
                    # post_user when that is still empty.
                    if not post_user:
                        post_user = src_ref
                    src_ref = allTxt.replace(u'来源:', '').strip(u' ')
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Re-wrap the kept children in a fresh container tag.
            outHtml = """<div class="content_wrappr_left"><div class="content"><div class="BSHARE_POP blkContainerSblkCon clearfix blkContainerSblkCon_16" id="artibody">${++content++}</div></div></div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)
            content_html = outHtml.replace('${++content++}', content_items)
            selector = Selector(text=content_html)

            # Collect absolute image urls.
            image_urls = []
            imgs = selector.xpath('descendant::img')
            for img in imgs:
                image_url = img.xpath('@src').extract_first()
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Content fingerprint used for change detection.
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip img alt/title texts out of the html.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 2
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '新浪科技'
            contentItem['src_ref'] = src_ref
            return contentItem

    def saveFile(self, title, content):
        """Write the raw page (unicode) as utf8 to html/<title>.html."""
        filename = 'html/%s.html' % title
        with open(filename, 'wb') as f:
            f.write(content.encode("utf8"))
        self.log('Saved file %s' % filename)
class DetailSpider(scrapy.Spider):
    """Crawls jiemian.com channel lists (JSONP ajax endpoint) and article
    pages, emitting ContentItem objects with inlined css and image urls."""
    name = 'jiemian_detail'
    # Base interval: scrapy sleeps a random time in
    # 0.5*download_delay .. 1.5*download_delay between requests.
    download_delay = 2.5
    # Let these statuses reach our callbacks instead of scrapy's error path.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        # BUGFIX: forward the caller-supplied name (was hard-coded to None).
        super(DetailSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'jiemian_list_detail')
        self.checkDao = CheckDao()
        # css cache: style-url md5 -> downloaded stylesheet text
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        self.isRunningStop = False

    def close(spider, reason):
        # If this run bailed out because another crawl was still running, do
        # not clear that other crawl's "running" marker.
        if not spider.isRunningStop:
            spider.saveStatus('stop')
            # spider.dataMonitor.updateTotal('jiemian_total')
            pass

    def start_requests(self):
        # Skip this run if a previous crawl is still marked as running.
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')
        # Block until the network is reachable (re-check every 20s).
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
        # Block until our server is reachable (re-check every 20s).
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        # Channel ids: 必读 玩物 产品榜 快报 游戏要闻 单品 盘点 花边要闻 游戏快报
        cids = [{
            'src_channel': u'界面科技', 'sub_channel': u'必读', 'num': '6'
        }, {
            'src_channel': u'界面科技', 'sub_channel': u'玩物', 'num': '66'
        }, {
            'src_channel': u'界面科技', 'sub_channel': u'产品榜', 'num': '73'
        }, {
            'src_channel': u'界面科技', 'sub_channel': u'快报', 'num': '84'
        }, {
            'src_channel': u'界面游戏', 'sub_channel': u'游戏要闻', 'num': '100'
        }, {
            'src_channel': u'界面游戏', 'sub_channel': u'单品', 'num': '119'
        }, {
            'src_channel': u'界面游戏', 'sub_channel': u'盘点', 'num': '120'
        }, {
            'src_channel': u'界面游戏', 'sub_channel': u'花边要闻', 'num': '121'
        }, {
            'src_channel': u'界面游戏', 'sub_channel': u'游戏快报', 'num': '122'
        }]
        # Ajax list endpoint; one request per channel per page.
        url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
        for cid in cids:
            for page in range(1, 2):
                cidNum = cid.get('num')
                src_channel = cid.get('src_channel')
                sub_channel = cid.get('sub_channel')
                newUrl = url + str(page) + ('&cid=' + cidNum)
                self.logDao.warn(u'进行抓取列表:' + newUrl)
                yield scrapy.Request(url=newUrl,
                                     meta={
                                         'request_type': 'jiemian_page_list',
                                         'url': newUrl,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel
                                     },
                                     callback=self.parseArticleList,
                                     dont_filter=True)

    # TODO: a banned/blocked response has not been observed yet.
    def parseArticleList(self, response):
        """Parse one JSONP list response and schedule new-article requests."""
        body = EncodeUtil.toUnicode(response.body)
        if False:  # placeholder for a future ban check
            self.logDao.info(u'访问过多被禁止')
        else:
            # Strip the JSONP parens and parse; 'rst' holds an html fragment.
            jsonStr = demjson.decode(body.lstrip('(').rstrip(')')) or {}
            rst = jsonStr.get('rst', '')
            if not rst:
                self.logDao.info(u'不存在内容')
                return
            self.logDao.info(u'开始解析列表')
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            selector = Selector(text=rst)
            articles = selector.xpath(
                '//div[boolean(contains(@class,"news-view"))]')
            for article in articles:
                source_url = article.xpath(
                    './/div[@class="news-header"]//a/@href').extract_first('')
                title = article.xpath(
                    './/div[@class="news-header"]//a/@title | .//div[@class="news-header"]//a/text()'
                ).extract_first('')
                post_date = article.xpath(
                    './/div[@class="news-footer"]//span[@class="date"]/text()'
                ).extract_first('')
                tags = article.xpath(
                    './/div[@class="news-tag"]/a/text()').extract()
                if not source_url:
                    self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' +
                                     post_date)
                    continue
                # Skip articles that are already stored.
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url +
                                     ':' + post_date)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + source_url + ':' +
                                 post_date)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'jiemian_detail',
                                         "title": title,
                                         'post_date': post_date,
                                         'sub_channel': sub_channel,
                                         'src_channel': src_channel,
                                         'tags': tags,
                                         'source_url': source_url
                                     },
                                     callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse one jiemian.com article page into a ContentItem."""
        body = EncodeUtil.toUnicode(response.body)
        if False:  # placeholder for a future ban check
            self.logDao.info(u'访问过多被禁止')
        else:
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            sub_channel = response.meta['sub_channel']
            src_channel = response.meta['src_channel']
            tags = response.meta['tags']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)
            selector = Selector(text=body)
            # Collect the page's stylesheets (downloaded once, cached by hash).
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')
            # Neutralize urls and the page background inside the css.
            styles = CssUtil.clearUrl(styles)
            styles = CssUtil.clearBackgroundColor(styles, ['#f5f5f5'])

            post_user = selector.xpath(
                '//div[@class="article-info"]//span[@class="author"]//text()'
            ).extract_first('')
            src_ref = src_channel  # default; overridden by the in-article source line
            post_date = selector.xpath(
                '//div[@class="article-info"]//span[@class="date"]//text()'
            ).extract_first('')
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y/%m/%d %H:%M"))
            except Exception:
                # Keep the raw string when it does not match the format.
                pass
            tags_ = selector.xpath(
                '//div[@class="article-info"]//*[@class="tags"]//text()'
            ).extract()
            tags = tags + tags_
            tags = ','.join(tags)

            # Page structure:
            #   article-main
            #     article-img
            #     article-content
            #       p ...
            #       article-source p: source line
            #       p: app-download teaser -- not scraped
            article_img = selector.xpath(
                '//div[@class="article-main"]/div[@class="article-img"]'
            ).extract_first('')
            article_content = selector.xpath(
                '//div[@class="article-main"]/div[@class="article-content"]'
            ).extract_first('')
            if not article_content:
                self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' +
                                 post_date)
                return
            contentSelector = Selector(text=article_content)
            # Keep only real content children: drop scripts/iframes/styles,
            # app-download links, the share widget and the source block.
            content_items = contentSelector.xpath(
                '//div[@class="article-content"]/*[not(name(.)="script") and not('
                'name(.)="iframe") and not(name(.)="style") and not(boolean( '
                'contains(a//@href,"?m=app"))) and not(boolean(@class="share-view" '
                'or @class="article-source"))]')

            # Use the in-article source line when present.
            contentSource = contentSelector.xpath(
                '//div[@class="article-content"]/div[@class="article-source"]/p/text()'
            ).extract_first('')
            if contentSource:
                contentSource = contentSource.replace(u'来源:', u'')
                src_ref = contentSource

            # Plain-text version of the article.
            content_txt = []
            for item in content_items:
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Re-wrap image header + kept children in a fresh container.
            outHtml = u"""<div class="article-main">${++articleImg++}<div class="article-content" style="font-family: 'Microsoft YaHei', 黑体;">${++content++}</div></div> """
            content_items = content_items.extract()
            content_items = ''.join(content_items)
            content_html = outHtml.replace('${++articleImg++}',
                                           article_img).replace(
                                               '${++content++}', content_items)
            selector = Selector(text=content_html)

            # Collect absolute image urls; protocol-relative src values are
            # also rewritten to http inside the html.
            image_urls = []
            imgs = selector.xpath('descendant::img')
            for img in imgs:
                # NOTE(review): pages may carry the url in @data-src as well,
                # but only @src is read here -- confirm whether lazy-loaded
                # images need handling.
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Content fingerprint used for change detection.
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip img alt/title texts out of the html.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = post_user
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 5
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem

    def saveFile(self, title, content):
        # TODO: saving raw pages for reuse did not pay off; disabled for now.
        return
        # filename = 'html/%s.html' % title
        # with open(filename, 'wb') as f:
        #     f.write(content.encode("utf8"))
        # self.log('Saved file %s' % filename)

    def getStatus(self):
        """Read the crawl status from catchStatus.json."""
        # BUGFIX/cleanup: the with-statement already closes the file; the old
        # loadF=None + try/finally double-close was redundant.
        with open("catchStatus.json", 'r') as loadF:
            aa = json.load(loadF)
            return aa.get('status')

    def saveStatus(self, status):
        """Persist the crawl status to catchStatus.json."""
        # Same cleanup as getStatus: `with` handles closing.
        with open("catchStatus.json", "w") as loadF:
            json.dump({'status': status}, loadF)
class DetailSpider(scrapy.Spider):
    """Spider for ifeng.com (凤凰网) game/tech channels: fetches list pages,
    then each article detail page, and yields a ContentItem per article.

    Relies on project-local helpers (LogDao, CheckDao, CssUtil, EncryptUtil,
    NetworkUtil, TimerUtil, EncodeUtil, DataMonitorDao, ContentItem).
    """

    name = 'fenghuang_detail'
    # Base interval; the effective delay is a random value in
    # 0.5 * download_delay .. 1.5 * download_delay
    download_delay = 2.5
    # Handle redirects and other error status codes so pages that would
    # otherwise be dropped can still be processed
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        super(DetailSpider, self).__init__(name=None, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'fenghuang_list_detail')
        self.checkDao = CheckDao()
        # Cache of downloaded CSS, keyed by md5 of the stylesheet URL
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        self.isRunningStop = False

    def close(spider, reason):
        # Scrapy close hook: first positional arg is the spider instance.
        if not spider.isRunningStop:
            # If another crawl was still running when this spider started,
            # the status must not be reset to 'stop' here
            spider.saveStatus('stop')
        # spider.dataMonitor.updateTotal('fenghuang_total')
        pass

    def start_requests(self):
        """Seed the crawl: one list-page request per channel, unless a crawl
        is already marked as running in catchStatus.json."""
        # If a crawl is already in progress, do not issue any requests
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')
        # Check network availability, retrying every 20s
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
        # Check backend service availability, retrying every 20s
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        src_channel = u'凤凰财经'
        sub_channel = u'电子竞技'
        url = 'http://games.ifeng.com/listpage/17886/1/list.shtml'
        styleUrlDefault = [
            'http://p2.ifengimg.com/a/2016/0523/esports.css',
            'http://y1.ifengimg.com/package/t_20130820__15953/css/pl_detail_v8.css'
        ]
        newUrl = url
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'fenghuang_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel,
                                 'styleUrlDefault': styleUrlDefault
                             },
                             callback=self.parseArticleList3,
                             dont_filter=True)
        sub_channel = u'产品资讯'
        url = 'http://games.ifeng.com/listpage/27456/1/list.shtml'
        styleUrlDefault = [
            'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
        ]
        newUrl = url
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'fenghuang_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel,
                                 'styleUrlDefault': styleUrlDefault
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)
        sub_channel = u'热点资讯'
        url = 'http://games.ifeng.com/listpage/27455/1/list.shtml'
        styleUrlDefault = [
            'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
        ]
        newUrl = url
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'fenghuang_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel,
                                 'styleUrlDefault': styleUrlDefault
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)
        src_channel = u'凤凰科技'
        sub_channel = u'资讯'
        url = 'http://tech.ifeng.com/listpage/800/0/1/rtlist.shtml'
        styleUrlDefault = [
            'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
        ]
        newUrl = url
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'fenghuang_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel,
                                 'styleUrlDefault': styleUrlDefault
                             },
                             callback=self.parseArticleList2,
                             dont_filter=True)

    def parseArticleList2(self, response):
        """Parse the tech.ifeng.com 'rtlist' layout and yield detail requests."""
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): dead branch — 'if False' never runs; presumably a
        # placeholder for ban detection
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            styleUrlDefault = response.meta['styleUrlDefault']
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=body)
            articles = selector.xpath('//div[@class="zheng_list pl10 box"]')
            for article in articles:
                source_url = article.xpath('./h1/a/@href').extract_first(
                    '').lstrip('%20').lstrip(' ')
                title = article.xpath('./h1/a/text()').extract_first('')
                if not source_url:
                    continue
                # Skip articles already stored
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'fenghuang_detail',
                                         "title": title,
                                         "source_url": source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel,
                                         'styleUrlDefault': styleUrlDefault
                                     },
                                     callback=self.parseArticle)

    def parseArticleList3(self, response):
        """Parse the games.ifeng.com 'box_list' layout and yield detail requests."""
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): dead branch, see parseArticleList2
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            styleUrlDefault = response.meta['styleUrlDefault']
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=body)
            articles = selector.xpath(
                '//div[boolean(contains(@class, "box_list"))]')
            for article in articles:
                source_url = article.xpath('./h2/a/@href').extract_first(
                    '').lstrip('%20').lstrip(' ')
                title = article.xpath('./h2/a/text()').extract_first('')
                if not source_url:
                    continue
                # Skip articles already stored
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'fenghuang_detail',
                                         "title": title,
                                         "source_url": source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel,
                                         'styleUrlDefault': styleUrlDefault
                                     },
                                     callback=self.parseArticle)

    # TODO: no banned-response case has been observed yet
    def parseArticleList(self, response):
        """Parse the 'newsList' layout and yield detail requests."""
        body = EncodeUtil.toUnicode(response.body)
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            styleUrlDefault = response.meta['styleUrlDefault']
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=body)
            articles = selector.xpath('//div[@class="newsList"]//li')
            for article in articles:
                source_url = article.xpath('./a/@href').extract_first(
                    '').lstrip('%20').lstrip(' ')
                title = article.xpath('./a/text()').extract_first('')
                # NOTE(review): unlike parseArticleList2/3 there is no empty
                # source_url guard here — verify this is intentional
                # Skip articles already stored
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'fenghuang_detail',
                                         "title": title,
                                         "source_url": source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel,
                                         'styleUrlDefault': styleUrlDefault
                                     },
                                     callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse one detail page: collect styles, metadata, cleaned content
        HTML/text and image URLs, and return a populated ContentItem."""
        body = EncodeUtil.toUnicode(response.body)
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            styleUrlDefault = response.meta['styleUrlDefault']
            title = response.meta['title']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + source_url)
            selector = Selector(text=body)
            # Collect stylesheet URLs (page links plus per-channel defaults)
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleUrls = styleUrls + styleUrlDefault
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # md5 of the URL is the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')
            # Strip URLs / background colors from the inlined styles
            styles = CssUtil.clearUrl(styles)
            styles = CssUtil.clearBackgroundColor(styles, ['#eaeaea'])
            tags = selector.xpath(
                '//meta[@name="keywords"]/@content').extract_first('')
            category = selector.xpath(
                '//meta[boolean(contains(@name, "og:category"))]/@content'
            ).extract_first('')
            if category:
                sub_channel = sub_channel + ',' + category
            src_ref = selector.xpath(
                '//span[@class="ss03"]//text()').extract_first('')
            if not src_ref.replace('\n', '').replace(' ', ''):
                # Fallback source-attribution location
                src_ref = selector.xpath(
                    '//div[@id="artical_sth"]/p/text()').extract()
                src_ref = ''.join(src_ref).replace('\n', '').replace(
                    u'来源:', '').replace(' ', '')
            post_date = selector.xpath(
                '//meta[@name="og:time"]/@content').extract_first('')
            # Normalize Chinese date separators before parsing
            post_date = post_date.replace(u'年', '-').replace(
                u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
            except Exception as e:
                try:
                    # Retry without seconds
                    post_date = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.strptime(post_date, "%Y-%m-%d %H:%M"))
                except Exception as e:
                    # NOTE(review): e.message is Python-2-only
                    self.logDao.warn(e.message)
                    pass
                pass
            content_html = selector.xpath('//div[@id="main_content"]')
            logoHtml = selector.xpath(
                '//span[@class="ifengLogo"]').extract_first('')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Drop unwanted embedded tags.
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe")]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Build the plain-text version of the article
            content_txt = []
            for item in content_items:
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)
            # Re-wrap the cleaned fragments in the site's content container
            outHtml = """<div id="artical_real" class="js_img_share_area"><div id="main_content" class="js_selection_area" bosszone="content">${++content++}</div></div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)
            content_html = outHtml.replace('${++content++}', content_items)
            content_html = content_html.replace(logoHtml, '')
            selector = Selector(text=content_html)
            # Extract every image URL, normalizing protocol-relative ones
            image_urls = []
            imgs = selector.xpath('descendant::img')
            for img in imgs:
                # Images may live in @src or @data-src
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)
            # Stable hash code for deduplication
            hash_code = self.checkDao.getHashCode(source_url)
            # Strip img alt/title text out of the stored HTML
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')
            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = ''
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 8
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem

    def saveFile(self, title, content):
        # TODO: temporarily disabled — caching pages for reuse did not pay off
        return
        # filename = 'html/%s.html' % title
        # with open(filename, 'wb') as f:
        #     f.write(content.encode("utf8"))
        # self.log('Saved file %s' % filename)

    def getStatus(self):
        """Read the crawl status flag from catchStatus.json."""
        loadF = None
        try:
            with open("catchStatus.json", 'r') as loadF:
                aa = json.load(loadF)
                return aa.get('status')
        finally:
            # NOTE(review): redundant — the with-statement already closed loadF
            if loadF:
                loadF.close()

    def saveStatus(self, status):
        """Persist the crawl status flag to catchStatus.json."""
        loadF = None
        try:
            with open("catchStatus.json", "w") as loadF:
                json.dump({'status': status}, loadF)
        finally:
            # NOTE(review): redundant — the with-statement already closed loadF
            if loadF:
                loadF.close()
class TXDetailSpider(scrapy.Spider):
    """Spider for jiemian.com article lists (JSONP API) and detail pages;
    yields one ContentItem per article."""

    name = 'jiemian_detail'
    # Base interval; effective delay is random in 0.5..1.5 x download_delay
    download_delay = 2.5
    # Handle redirects and other error status codes
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        super(TXDetailSpider, self).__init__(name=None, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'jiemian_list_detail')
        self.checkDao = CheckDao()
        # Cache of downloaded CSS keyed by md5 of the stylesheet URL
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        self.logger.info(u'重走init')

    def close(spider, reason):
        # Scrapy close hook; totals update currently disabled
        # spider.dataMonitor.updateTotal('jiemian_total')
        pass

    def start_requests(self):
        """Seed one JSONP list request per category id."""
        # Check network availability, retrying every 20s
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
        # Check backend service availability, retrying every 20s
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        # Categories: must-read, gadgets, product charts, bulletins, game news,
        # single products, roundups, gossip, game bulletins
        cids = ['6', '66', '73', '84', '100', '119', '120', '121', '122']
        url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
        for cid in cids:
            for page in range(1, 2):
                newUrl = url + str(page) + ('&cid=' + str(cid))
                self.logDao.warn(u'进行抓取列表:' + newUrl)
                yield scrapy.Request(url=newUrl,
                                     meta={
                                         'request_type': 'jiemian_page_list',
                                         'url': newUrl
                                     },
                                     callback=self.parseArticleList,
                                     dont_filter=True)

    # TODO: no banned-response case has been observed yet
    def parseArticleList(self, response):
        """Decode the JSONP list payload and yield a detail request per article."""
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): dead branch — placeholder for ban detection
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            # Strip the JSONP parentheses, then decode
            jsonStr = demjson.decode(body.lstrip('(').rstrip(')')) or {}
            rst = jsonStr.get('rst', '')
            if not rst:
                self.logDao.info(u'不存在内容')
                return
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=rst)
            articles = selector.xpath('//div[@class="news-img"]/a')
            for article in articles:
                source_url = article.xpath('@href').extract_first('')
                title = article.xpath('@title').extract_first('')
                # Skip articles already stored
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                    continue
                if not source_url:
                    self.logDao.info(u'文章不存在' + title + ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'jiemian_detail',
                                         "title": title,
                                         "source_url": source_url
                                     },
                                     callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse one detail page into a ContentItem."""
        body = EncodeUtil.toUnicode(response.body)
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            title = response.meta['title']
            # NOTE(review): 'post_date' is never put into meta by
            # parseArticleList above — this lookup looks like a KeyError
            # waiting to happen; verify against the actual request chain
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)
            selector = Selector(text=body)
            # Collect and cache page stylesheets
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                # md5 of the URL is the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')
            # Strip URLs embedded in the styles
            styles = CssUtil.clearUrl(styles)
            category = selector.xpath(
                '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
            ).extract_first('')
            post_user = selector.xpath(
                '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
            ).extract_first('')
            src_ref = selector.xpath(
                '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
            ).extract_first('')
            a_time = selector.xpath(
                '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
            ).extract_first('')
            if a_time:
                post_date = a_time
            else:
                post_date = (post_date or '')
            # Normalize Chinese date separators before parsing
            post_date = post_date.replace(u'年', '-').replace(
                u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                pass
            content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Drop unwanted embedded tags.
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(@class="rv-root-v2 rv-js-root"))]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Build the plain-text version of the article
            content_txt = []
            for item in content_items:
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)
            # Re-wrap cleaned fragments in the site's content container
            outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)
            content_html = outHtml.replace('${++content++}', content_items)
            selector = Selector(text=content_html)
            # Extract every absolute image URL
            image_urls = []
            imgs = selector.xpath('descendant::img')
            for img in imgs:
                # Images may live in @src or @data-src
                image_url = img.xpath('@src').extract_first()
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)
            # Stable hash code for deduplication
            hash_code = self.checkDao.getHashCode(source_url)
            # Strip img alt/title text out of the stored HTML
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')
            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 3
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '腾讯科技'
            contentItem['src_ref'] = src_ref
            return contentItem

    def saveFile(self, title, content):
        """Write the raw page body to html/<title>.html."""
        filename = 'html/%s.html' % title
        with open(filename, 'wb') as f:
            f.write(content.encode("utf8"))
        self.log('Saved file %s' % filename)
class WYDetailSpider(scrapy.Spider):
    """Spider for 163.com (网易) tech news: reads the news_json.js feed and
    crawls each article's detail page into a ContentItem."""

    name = 'wangyi_detail'
    # Base interval; effective delay is random in 0.5..1.5 x download_delay
    download_delay = 2.5
    # Handle redirects and other error status codes
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        super(WYDetailSpider, self).__init__(name=None, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'wangyi_list_detail')
        self.checkDao = CheckDao()
        # Cache of downloaded CSS keyed by md5 of the stylesheet URL
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        self.logger.info(u'重走init')

    def close(spider, reason):
        # Scrapy close hook: refresh the aggregate total
        spider.dataMonitor.updateTotal('wangyi_total')

    def start_requests(self):
        """Seed the single feed request (cache-busted with a random suffix)."""
        # Check network availability, retrying every 20s
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
        # Check backend service availability, retrying every 20s
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        # Fetch the JSON feed page
        newUrl = 'http://tech.163.com/special/00094IHV/news_json.js?' + str(
            random.uniform(0, 1))
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'wangyi_page_list',
                                 'url': newUrl
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)

    # TODO: no banned-response case has been observed yet
    def parseArticleList(self, response):
        """Decode the 'var data=...' feed and yield one detail request per article."""
        url = response.meta['url']
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): dead branch — placeholder for ban detection
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            self.logDao.info(u'开始解析列表')
            # Strip the JS variable wrapper, then decode
            body = body.lstrip('var data=').rstrip(';')
            jsonStr = demjson.decode(body) or {}
            articles = jsonStr.get('news') or []
            categoryList = jsonStr.get('category') or []
            # 'news' is a list of lists: one inner list per category column
            for article_ins in articles:
                for article in article_ins:
                    source_url = article.get('l', '')
                    title = article.get('t', '')
                    timeStr = article.get('p', '')
                    # Skip articles already stored
                    if self.checkDao.checkExist(source_url):
                        self.logDao.info(u'文章已经存在' + title + timeStr +
                                         source_url)
                        continue
                    # 'c' indexes into the feed's category list
                    categoryIndex = article.get('c')
                    category = ''
                    if 0 <= categoryIndex < len(categoryList):
                        category = categoryList[categoryIndex].get('n')
                    post_date = article.get('p')
                    self.logDao.info(u'抓取文章' + title + ':' + post_date +
                                     ':' + source_url)
                    yield scrapy.Request(url=source_url,
                                         meta={
                                             'request_type': 'wangyi_detail',
                                             "title": title,
                                             'category': category,
                                             'post_date': post_date,
                                             "source_url": source_url
                                         },
                                         callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse one detail page into a ContentItem."""
        body = EncodeUtil.toUnicode(response.body)
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            category = response.meta['category']
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)
            selector = Selector(text=body)
            # Collect and cache page stylesheets
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                # md5 of the URL is the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')
            styles = CssUtil.clearUrl(styles)
            src_ref = selector.xpath(
                '//*[@id="ne_article_source"]/text()').extract_first()
            content_html = selector.xpath('//*[@id="endText"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Drop ads / source footers / scripts.
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(boolean(@class="gg200x300" or @class="ep-source cDGray")) and not(name(.)="script")]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Build the plain-text version of the article
            content_txt = []
            for item in content_items:
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)
            # Re-wrap cleaned fragments in the site's content container
            outHtml = """<div class="post_text" id="endText" style="border-top:1px solid #ddd;" jcid="5611">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)
            content_html = outHtml.replace('${++content++}', content_items)
            selector = Selector(text=content_html)
            # Extract every absolute image URL
            image_urls = []
            imgs = selector.xpath('descendant::img')
            for img in imgs:
                # Images may live in @src or @data-src
                image_url = img.xpath('@src').extract_first()
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)
            # Stable hash code for deduplication
            hash_code = self.checkDao.getHashCode(source_url)
            # Strip img alt/title text out of the stored HTML
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')
            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = ''
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 4
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '网易科技'
            contentItem['src_ref'] = src_ref
            return contentItem

    def saveFile(self, title, content):
        """Write the raw page body to html/<title>.html."""
        filename = 'html/%s.html' % title
        with open(filename, 'wb') as f:
            f.write(content.encode("utf8"))
        self.log('Saved file %s' % filename)
class WXDetailSpider(scrapy.Spider):
    """Spider for WeChat public accounts (via sogou-style profile pages):
    walks each configured account's article list, fetches details, and yields
    ContentItems. Tracks banned accounts so they are retried first next run."""

    name = 'wx_detail'
    # Base interval; effective delay is random in 0.5..1.5 x download_delay
    download_delay = 15
    # Handle redirects and other error status codes
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        super(WXDetailSpider, self).__init__(name=None, **kwargs)
        self.count = 0
        self.wxSourceDao = WxSourceDao()
        self.logDao = LogDao(self.logger, 'weixin_list_detail')
        self.checkDao = CheckDao()
        self.dataMonitor = DataMonitorDao()
        self.wxSources = []
        # Accounts banned during this run; they are crawled first next time
        self.brokenAccounts = []

    def close(spider, reason):
        # Persist the banned-account list for the next run
        spider.saveBrokenAccounts(spider.brokenAccounts)
        # Reset the cached run status
        spider.saveStatus('stop')
        spider.dataMonitor.updateTotal('weixin_total')
        for source in spider.wxSources:
            (id, wx_name, wx_account, wx_url, wx_avatar, update_status,
             is_enable, update_time) = source
            spider.dataMonitor.updateTotal('weixin_account_total',
                                           account=wx_account)

    def start_requests(self):
        """Seed one list request per WeChat account, banned accounts first."""
        # Do not crawl between midnight and 6am
        hour = datetime.datetime.now().hour
        if 0 <= hour <= 6:
            self.logDao.info(u'这个时间不爬。0-6点')
            return
        # If a crawl is already in progress, do not issue any requests
        status = self.getStatus()
        if status == 'running':
            return
        self.saveStatus('running')
        # Check network availability, retrying every 20s
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
        # Check backend service availability, retrying every 20s
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        # Load enabled account sources (randomized order)
        sources = self.wxSourceDao.queryWxUrl(isRandom=True)
        # Prioritize accounts that were banned last run
        update_time, brokenAccounts = self.getBrokenAccounts()
        firstGroup = []
        secondGroup = []
        for source in sources:
            (id, wx_name, wx_account, wx_url, wx_avatar, update_status,
             is_enable, update_time) = source
            if wx_account in brokenAccounts:
                firstGroup.append(source)
            else:
                secondGroup.append(source)
        sources = firstGroup + secondGroup
        self.wxSources = sources
        for source in sources:
            (id, wx_name, wx_account, wx_url, wx_avatar, update_status,
             is_enable, update_time) = source
            # Fetch the account's profile/list page
            newUrl = wx_url
            self.logDao.warn(u'进行抓取:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'weixin_page_list',
                                     'source_url': newUrl,
                                     'wx_account': wx_account,
                                     'source': source,
                                     'wx_account_id': id
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)

    def parseArticleList(self, response):
        """Extract the embedded 'var msgList' JSON from the profile page and
        yield a detail request per unseen article. On captcha/302, record the
        ban and trigger redial."""
        body = EncodeUtil.toUnicode(response.body)
        selector = Selector(text=body)
        source_url = response.meta['source_url']
        wx_account = response.meta['wx_account']
        title = selector.xpath('//title/text()').extract_first('').strip(u' ')
        # A captcha title or a 302 means we were rate-limited/banned
        isN = u"请输入验证码" == title
        if isN or response.status == 302:
            self.logDao.info(u'访问过多被禁止,重新拨号')
            # Remember this account for priority retry next run
            self.brokenAccounts.append(wx_account)
            # Acquire a new IP, then idle while the dial takes effect
            NetworkUtil.getNewIp()
            TimerUtil.sleep(80)
            NetworkUtil.openWebbrowser(source_url)
        else:
            source = response.meta['source']
            wx_account_id = response.meta['wx_account_id']
            self.logDao.info(u'开始解析列表:' + wx_account)
            # The article list lives in an inline script variable
            articleJS = selector.xpath('//script/text()').extract()
            for js in articleJS:
                if 'var msgList = ' in js:
                    # NOTE(review): non-raw pattern string; works because \s
                    # is not a recognized string escape, but r'' is preferred
                    p8 = re.compile('var\s*msgList\s*=.*;')
                    matchList = p8.findall(js)
                    for match in matchList:
                        match = match.lstrip('var msgList = ').rstrip(';')
                        # Decode the (lenient) JSON payload
                        articles = demjson.decode(match) or {}
                        articles = articles['list'] or []
                        self.logDao.info(u'匹配到文章列表' + wx_account)
                        for article in articles:
                            app_msg_ext_info = article.get(
                                'app_msg_ext_info') or {}
                            desc = app_msg_ext_info.get('digest') or ''
                            title = app_msg_ext_info.get('title') or ''
                            # Skip articles already stored
                            if self.checkDao.checkExist(title, wx_account, 1):
                                self.logDao.info(u'已经存在' + wx_account +
                                                 ':' + title)
                                continue
                            detailUrl = app_msg_ext_info['content_url'] or ''
                            detailUrl = "http://mp.weixin.qq.com" + detailUrl
                            # Unescape '&amp;' in the relative URL
                            detailUrl = detailUrl.replace("amp;", "")
                            self.logDao.info(u'抓取' + wx_account + ':' +
                                             title + ':' + detailUrl)
                            if not detailUrl:
                                continue
                            yield scrapy.Request(url=detailUrl,
                                                 meta={
                                                     'request_type':
                                                     'weixin_detail',
                                                     'wx_account': wx_account,
                                                     "source": source,
                                                     "title": title,
                                                     'wx_account_id':
                                                     wx_account_id,
                                                     "source_url": detailUrl
                                                 },
                                                 callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse one mp.weixin.qq.com article page into a ContentItem.
        On captcha/302, record the ban and trigger redial."""
        body = EncodeUtil.toUnicode(response.body)
        selector = Selector(text=body)
        title = selector.xpath('//title/text()').extract_first('').strip(u' ')
        source_url = response.meta['source_url']
        wx_account = response.meta['wx_account']
        # A captcha title or a 302 means we were rate-limited/banned
        isN = u"请输入验证码" == title
        if isN or response.status == 302:
            self.logDao.info(u'访问过多被禁止,重新拨号')
            # Remember this account for priority retry next run
            self.brokenAccounts.append(wx_account)
            # Acquire a new IP, then idle while the dial takes effect
            NetworkUtil.getNewIp()
            TimerUtil.sleep(80)
            NetworkUtil.openWebbrowser(source_url)
        else:
            title = response.meta['title']
            source_url = response.meta['source_url']
            wx_account_id = response.meta['wx_account_id']
            self.logDao.info(u'开始解析文章' + wx_account + ':' + title + ':' +
                             source_url)
            self.logDao.info(u'开始解析文章:' + source_url)
            # Publication date (date-only; normalized to full timestamp)
            post_date = selector.xpath(
                '//*[@id="post-date"]/text()').extract_first('')
            try:
                post_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.strptime(post_date, "%Y-%m-%d"))
            except Exception:
                pass
            # Inline <style> blocks are the article's styling
            styles = selector.xpath('//style/text()').extract()
            styles = CssUtil.compressCss(styles).replace('\'', '"').replace(
                '\\', '\\\\')
            styles = CssUtil.clearUrl(styles)
            styles = CssUtil.clearBackgroundColor(styles, ['#f3f3f3'])
            post_user = selector.xpath(
                '//*[@id="post-user"]/text()').extract_first('')
            content_html = selector.xpath('//*[@id="js_content"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Keep all direct children of the content node
            content_items = content_html.xpath('*')
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # content_items_new = []
            # for item in content_items:
            #     itemStr = item.extract()
            #     if u'订阅微信' in itemStr:
            #         continue
            #     content_items_new.append(item)
            # content_items = content_items_new
            # Build the plain-text version of the article
            content_txt = []
            for item in content_items:
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)
            # Re-wrap cleaned fragments in WeChat's content container
            outHtml = """<div class="rich_media_content " id="js_content">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)
            content_html = outHtml.replace('${++content++}', content_items)
            selector = Selector(text=content_html)
            # Extract every absolute image URL (src or lazy data-src)
            image_urls = []
            imgs = selector.xpath('descendant::img')
            for img in imgs:
                image_url = img.xpath('@src | @data-src').extract_first('')
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
            self.logDao.info(wx_account + u'得到文章:' + title + ":" +
                             post_date + ':' + post_user)
            self.logDao.info(u'得到文章:' + source_url)
            # Dedup hash keyed on (title, account, type)
            hash_code = self.checkDao.getHashCode(title, wx_account, 1)
            self.saveFile(hash_code, body)
            # Strip img alt/title text out of the stored HTML
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')
            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = ''
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 1
            contentItem['src_account_id'] = wx_account_id
            contentItem['src_channel'] = '微信公众号'
            contentItem['src_ref'] = ''
            contentItem['wx_account'] = wx_account
            return contentItem

    def saveFile(self, title, content):
        # TODO: temporarily disabled — caching pages for reuse did not pay off
        return
        # filename = 'html/%s.html' % title
        # with open(filename, 'wb') as f:
        #     f.write(content.encode("utf8"))
        # self.log('Saved file %s' % filename)

    def getBrokenAccounts(self):
        """Return (update_time, accounts) persisted in brokenAccount.json."""
        loadF = None
        try:
            with open("brokenAccount.json", 'r') as loadF:
                aa = json.load(loadF)
                return aa.get('update_time', ''), aa.get('accounts', [])
        finally:
            # NOTE(review): redundant — the with-statement already closed loadF
            if loadF:
                loadF.close()

    def saveBrokenAccounts(self, accounts):
        """Persist the banned-account list with a timestamp."""
        loadF = None
        try:
            with open("brokenAccount.json", "w") as loadF:
                update_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
                json.dump({
                    'update_time': update_time,
                    'accounts': accounts
                }, loadF)
        finally:
            # NOTE(review): redundant — the with-statement already closed loadF
            if loadF:
                loadF.close()

    def getStatus(self):
        """Read the crawl status flag from catchStatus.json."""
        loadF = None
        try:
            with open("catchStatus.json", 'r') as loadF:
                aa = json.load(loadF)
                return aa.get('status')
        finally:
            # NOTE(review): redundant — the with-statement already closed loadF
            if loadF:
                loadF.close()

    def saveStatus(self, status):
        """Persist the crawl status flag to catchStatus.json."""
        loadF = None
        try:
            with open("catchStatus.json", "w") as loadF:
                json.dump({'status': status}, loadF)
        finally:
            # NOTE(review): redundant — the with-statement already closed loadF
            if loadF:
                loadF.close()
class DetailSpider(scrapy.Spider):
    """Sohu Tech detail spider.

    Pulls article listings from Sohu's public feed API, then fetches and
    parses each article page into a ContentItem. A file-based status flag
    (catchStatus.json) is used as a crude single-instance lock so two runs
    do not overlap.
    """
    name = 'sohu_detail'
    # Base delay; the effective per-request delay is a random value in
    # [0.5 * download_delay, 1.5 * download_delay].
    download_delay = 2.5
    # Handle redirects and these error codes ourselves instead of letting
    # scrapy drop the responses.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        super(DetailSpider, self).__init__(name=None, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'sohu_list_detail')
        self.checkDao = CheckDao()
        # Cache of downloaded CSS, keyed by md5(style URL).
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        # True when start_requests bailed out because another run was active.
        self.isRunningStop = False

    def close(spider, reason):
        # NOTE(review): scrapy invokes Spider.close(spider, reason) on
        # shutdown, hence the unusual first-parameter name — confirm this
        # matches the scrapy version in use before renaming.
        if not spider.isRunningStop:
            # If this run aborted because a previous crawl was still marked
            # running, do NOT flip the shared status to 'stop' — that would
            # clobber the other run's lock. Otherwise, release it.
            spider.saveStatus('stop')
            # spider.dataMonitor.updateTotal('sohu_total')
            pass

    def start_requests(self):
        """Yield listing-page requests for pages 1..3 of the Sohu tech feed."""
        # If a crawl is already marked as running, skip this run entirely.
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')
        # Block until the network is reachable, probing every 20s.
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
        # Block until the backend service is reachable, probing every 20s.
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        src_channel = u'搜狐科技'
        sub_channel = u'科技'
        for page in range(1, 4):
            # Request one listing page of the JSON feed.
            url = 'http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=30&size=20&callback=&_=1502075449669&page='
            newUrl = url + str(page)
            self.logDao.warn(u'进行抓取列表:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'sohu_page_list',
                                     'url': newUrl,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)
        # TODO: have not yet observed a ban/block case to handle here.

    def parseArticleList(self, response):
        """Parse a feed listing response and yield a detail request per new article."""
        body = EncodeUtil.toUnicode(response.body)
        # Placeholder for a future "banned for excessive access" check.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            # Strip the JSONP wrapper and parse the JSON payload.
            # NOTE(review): lstrip('/**/') strips any run of '/' and '*'
            # characters, not the literal prefix string — it happens to work
            # for '/**/(' payloads but is fragile; verify the feed format.
            articles = demjson.decode(
                body.lstrip('/**/').lstrip('(').rstrip(';').rstrip(')')) or []
            if not articles:
                self.logDao.info(u'不存在内容')
                return
            self.logDao.info(u'开始解析列表')
            for article in articles:
                # NOTE(review): `id` shadows the builtin; local to this loop.
                id = article.get('id', '')
                authorId = article.get('authorId', '')
                if not id or not authorId:
                    continue
                # Article page URL is composed from id + authorId.
                pageId = str(id) + '_' + str(authorId)
                source_url = 'http://www.sohu.com/a/' + pageId + '?loc=1&focus_pic=0'
                title = article.get('title', '')
                post_user = article.get('authorName', '')
                tags = article.get('tags', [])
                tagsStr = []
                for tag in tags:
                    tagsStr.append(tag.get('name', ''))
                # publicTime is in milliseconds; default to "now" if absent.
                publicTime = article.get('publicTime', time.time() * 1000)
                post_date = time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime(publicTime / 1000))
                # Skip articles that were already captured.
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + post_date + ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' + source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'sohu_detail',
                                         'title': title,
                                         'post_date': post_date,
                                         'post_user': post_user,
                                         'tags': tagsStr,
                                         'source_url': source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel
                                     },
                                     callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse one article page into a ContentItem (text, HTML, images, styles)."""
        body = EncodeUtil.toUnicode(response.body)
        # Placeholder for a future "banned for excessive access" check.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            title = response.meta['title']
            post_user = response.meta['post_user']
            tags = response.meta['tags']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)
            selector = Selector(text=body)
            # Collect the page's stylesheets (downloading each at most once
            # via the md5-keyed self.css cache).
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # md5 of the URL is the cache key.
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and remember it.
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            # Compress, then escape quotes/backslashes for safe embedding.
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')
            # Rewrite/strip URLs inside the CSS.
            styles = CssUtil.clearUrl(styles)
            content_html = selector.xpath('//*[@class="article"]')
            # The "back to sohu.com" element, removed from the output below.
            backHtml = selector.xpath('//*[@id="backsohucom"]').extract_first(
                '')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Drop unwanted children, e.g. u'<p data-role="editor-name">责任编辑:<span></span></p>'.
            # Fuller example filter: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(@data-role="editor-name"))]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Extract the plain text, one entry per child node.
            content_txt = []
            for item in content_items:
                # All descendant text of this node.
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)
            # Re-wrap the content in a clean container template.
            outHtml = """<div class="article-page"><article class="article">${++content++}</article></div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)
            content_html = outHtml.replace('${++content++}', content_items)
            content_html = content_html.replace(backHtml, '')
            selector = Selector(text=content_html)
            # Collect every image URL, normalizing protocol-relative ones.
            image_urls = []
            imgs = selector.xpath('descendant::img')
            for img in imgs:
                # The URL may be in @src (data-src is not read here, unlike
                # the WeChat spider — confirm that is intentional).
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)
            # Hash code for dedup, keyed on the source URL.
            hash_code = self.checkDao.getHashCode(source_url)
            # Strip img alt/title attribute text out of the HTML body.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Remove tooltip text coming from //img/@alt | //img/@title.
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')
            # Assemble the scrapy item handed to the pipeline.
            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = post_user
            contentItem['tags'] = ','.join(tags)
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 7
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = '搜狐科技'
            return contentItem

    def saveFile(self, title, content):
        """Persist a fetched page to disk for reuse.

        Currently disabled: caching pages locally was judged not worth it
        (see TODO), so this returns immediately without writing anything.
        """
        # TODO: not saving for now — local reuse of saved pages did not pay off.
        return
        # filename = 'html/%s.html' % title
        # with open(filename, 'wb') as f:
        #     f.write(content.encode("utf8"))
        # self.log('Saved file %s' % filename)

    def getStatus(self):
        """Return the crawl status string stored in catchStatus.json (or None)."""
        loadF = None
        try:
            with open("catchStatus.json", 'r') as loadF:
                aa = json.load(loadF)
                return aa.get('status')
        finally:
            # NOTE(review): redundant — the `with` block already closed the
            # file; closing again is a harmless no-op in CPython.
            if loadF:
                loadF.close()

    def saveStatus(self, status):
        """Persist the crawl status string to catchStatus.json."""
        loadF = None
        try:
            with open("catchStatus.json", "w") as loadF:
                json.dump({'status': status}, loadF)
        finally:
            if loadF:
                loadF.close()