def start_requests(self):
    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    # Channel cids: 必读 玩物 产品榜 快报 游戏要闻 单品 盘点 花边要闻 游戏快报
    cids = ['6', '66', '73', '84', '100', '119', '120', '121', '122']

    url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
    for cid in cids:
        for page in range(1, 2):
            newUrl = url + str(page) + ('&cid=' + str(cid))
            self.logDao.warn(u'进行抓取列表:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'jiemian_page_list',
                                     'url': newUrl
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)
def start_requests(self):
    # Skip this run if a crawl is already in progress
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    src_channel = u'第一财经'
    sub_channel = u'新闻'

    # Request the list page
    newUrl = 'http://www.yicai.com/news/'
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'diyicaijing_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)
def upload(self, path):
    """
    cos_path: /news/jiemian/image/
    :param path: local path under self.local_path, e.g. 'full/<md5>.jpg'
    :return: the COS source_url on success, '' on failure
    """
    counter = 0
    url = ''
    # Retry the upload up to 10 times
    while counter != 10:
        try:
            # The local path already carries the hash; strip the 'full/' prefix for the remote name
            uploadName = path.replace('full/', '')
            request = UploadFileRequest(u"crawler", self.cos_path + uploadName, self.local_path + path,
                                        insert_only=0)
            upload_file_ret = self.cos_client.upload_file(request)
            if upload_file_ret['code'] == 0:
                data = upload_file_ret['data'] or {}
                url = data['source_url']
                print u'上传成功', url
            else:
                print u'上传图片失败', upload_file_ret
            break
        except Exception as e:
            counter += 1
            TimerUtil.sleep(10)
    return url
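# Usage sketch (an assumption, not taken from the source): the image pipeline further
# below stores downloads under 'full/<md5>.jpg', so a matching upload call could look
# like this. The method name and the spider wiring are hypothetical.
def uploadDownloadedImage(self, image_url):
    urlHash = EncryptUtil.md5(image_url)
    path = 'full/' + str(urlHash) + '.jpg'
    # upload() retries up to 10 times and returns the COS source_url ('' on failure)
    cos_url = self.upload(path)
    return cos_url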
def start_requests(self):
    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    for page in range(1, 2):
        r = random.uniform(0, 1)
        url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&callback=&_=1501148356254&page='
        newUrl = url + str(page) + '&r=' + str(r)
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'sina_list',
                                 'url': newUrl
                             },
                             callback=self.parseList2)
def start_requests(self):
    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    # Crawl the main roll list
    url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1'
    r = random.uniform(0, 1)
    newUrl = url + ('&r=' + str(r))
    self.logDao.info(u"开始抓取列表:" + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={'request_type': 'sina_list', 'url': newUrl},
                         callback=self.parseList)

    # Backfill anything the roll list missed
    url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254'
    r = random.uniform(0, 1)
    newUrl = url + ('&r=' + str(r))
    self.logDao.info(u"开始抓取列表:" + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={'request_type': 'sina_list', 'url': newUrl},
                         callback=self.parseList2)
def start_requests(self):
    # Skip this run if a crawl is already in progress
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    # Request the list page
    newUrl = 'http://tech.qq.com/l/scroll.htm'
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'demoName_page_list',
                             'url': newUrl
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)
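# The network/server wait loops repeated in each start_requests above and below could be
# pulled into a single helper. A minimal sketch reusing the same utilities from the source;
# the method name waitUntilReachable is hypothetical, not part of the project:
def waitUntilReachable(self):
    # block until both the network and the backend service respond, checking every 20s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')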
def parseList(self, response):
    source = response.meta['source']
    wx_account = response.meta['wx_account']
    url = response.meta['url']
    body = EncodeUtil.toUnicode(response.body)
    # A 302 means the crawler is blocked: the router needs redialing and cookies need clearing
    if response.status == 302:
        # Mark the source as failed
        self.logDao.warn(u'您的访问过于频繁,重新拨号')
        self.wxSourceDao.updateStatus(wx_account, 'updateFail')
        # Acquire a new IP, then idle 30s
        NetworkUtil.getNewIp()
        TimerUtil.sleep(30)
    else:
        self.logDao.info(u'开始解析:' + wx_account)
        # Parse the search result list
        selector = Selector(text=body)
        results = selector.xpath('//*[@id="main"]/div[4]/ul/li')
        self.logDao.info(u'列表长度:' + str(len(results)))
        hasCatch = False
        for result in results:
            # Relative xpaths so each iteration reads its own <li>
            wx_name = result.xpath('.//p[@class="tit"]/a/text()').extract_first()
            wx_account_ = result.xpath('.//p[@class="info"]/label/text()').extract_first()
            wx_url = result.xpath('.//p[@class="tit"]/a/@href').extract_first()
            if wx_account_ == wx_account:
                self.logDao.info(u'成功抓取:' + wx_account_)
                self.wxSourceDao.updateSource(wx_account, wx_name, wx_url, 'last')
                hasCatch = True
                break
        if not hasCatch:
            self.logDao.info(u'没有抓到:' + wx_account)
            self.wxSourceDao.updateStatus(wx_account, 'none')
def start_requests(self):
    # TODO: the while-loops below may cause some accounts to be missed
    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    # Fetch sources that are enabled and either failed their last update
    # or have not been updated for more than 40 minutes
    sources = self.wxSourceDao.queryEnable_special(isRandom=True, wx_accounts=['CINNO_CreateMore'])

    for source in sources:
        # Mark the row as "updating"; it is set to "updateFail" on failure or ban,
        # and to success once the update completes
        (wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source
        self.wxSourceDao.updateStatus(wx_account, 'updating')
        # Request the Sogou Weixin search page for this account
        url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query='
        newUrl = url + wx_account
        self.logDao.warn(u'进行抓取:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={'request_type': 'weixin_source',
                                   'url': newUrl,
                                   'wx_account': wx_account,
                                   'source': source},
                             callback=self.parseList,
                             dont_filter=True)
def parseArticleList(self, response):
    body = EncodeUtil.toUnicode(response.body)
    selector = Selector(text=body)
    source_url = response.meta['source_url']
    print source_url
    title = selector.xpath('//title/text()').extract_first('').strip(u' ')
    # Detect the anti-crawler captcha page
    isN = u"请输入验证码" == title
    if isN or response.status == 302:
        self.logDao.info(u'访问过多被禁止,重新拨号')
        # Acquire a new IP, then idle for a while
        NetworkUtil.getNewIp()
        TimerUtil.sleep(50)
        NetworkUtil.openWebbrowser(source_url)
    else:
        source = response.meta['source']
        wx_account = response.meta['wx_account']
        wx_account_id = response.meta['wx_account_id']
        self.logDao.info(u'开始解析列表:' + wx_account)
        # The article list is embedded in a script tag as `var msgList = {...};`
        articleJS = selector.xpath('//script/text()').extract()
        for js in articleJS:
            if 'var msgList = ' in js:
                p8 = re.compile(r'var\s*msgList\s*=.*;')
                matchList = p8.findall(js)
                for match in matchList:
                    match = match.lstrip('var msgList = ').rstrip(';')
                    # Decode the (non-strict) JSON payload
                    articles = demjson.decode(match) or {}
                    articles = articles['list'] or []
                    self.logDao.info(u'匹配到文章列表' + wx_account)
                    for article in articles:
                        app_msg_ext_info = article.get('app_msg_ext_info') or {}
                        desc = app_msg_ext_info.get('digest') or ''
                        title = app_msg_ext_info.get('title') or ''
                        # Skip articles that were already captured
                        if self.checkDao.checkExist(title, wx_account, 1):
                            self.logDao.info(u'已经存在' + wx_account + ':' + title)
                            continue
                        detailUrl = app_msg_ext_info['content_url'] or ''
                        if not detailUrl:
                            continue
                        detailUrl = "http://mp.weixin.qq.com" + detailUrl
                        detailUrl = detailUrl.replace("amp;", "")
                        self.logDao.info(u'抓取' + wx_account + ':' + title + ':' + detailUrl)
                        yield scrapy.Request(url=detailUrl,
                                             meta={
                                                 'request_type': 'weixin_detail',
                                                 'wx_account': wx_account,
                                                 "source": source,
                                                 "title": title,
                                                 'wx_account_id': wx_account_id,
                                                 "source_url": detailUrl
                                             },
                                             callback=self.parseArticle)
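# Minimal sketch of the msgList extraction above, run on a hand-made sample script body.
# The sample JS string is illustrative only; real pages embed a much larger object.
import re
import demjson

sample_js = u'var msgList = {"list":[{"app_msg_ext_info":{"title":"demo","digest":"d","content_url":"/s?__biz=x&amp;mid=1"}}]};'
match = re.search(r'var\s*msgList\s*=\s*(.*);', sample_js)
if match:
    articles = demjson.decode(match.group(1)) or {}
    for article in articles.get('list') or []:
        info = article.get('app_msg_ext_info') or {}
        detailUrl = "http://mp.weixin.qq.com" + info.get('content_url', '')
        detailUrl = detailUrl.replace("amp;", "")  # unescape &amp; in the embedded url
        print info.get('title'), detailUrl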
def start_requests(self):
    # Do not crawl between 0:00 and 6:00
    hour = datetime.datetime.now().hour
    if 0 <= hour <= 6:
        self.logDao.info(u'这个时间不爬。0-6点')
        return

    # Skip this run if a crawl is already in progress
    status = self.getStatus()
    if status == 'running':
        return
    self.saveStatus('running')

    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    # Fetch enabled sources that have a list url
    sources = self.wxSourceDao.queryWxUrl(isRandom=True)

    # Ordering priority: accounts that were blocked last round go first
    brokenAccounts = self.getBrokenAccounts()
    firstGroup = []
    secondGroup = []
    for source in sources:
        (id, wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source
        if wx_account in brokenAccounts:
            firstGroup.append(source)
        else:
            secondGroup.append(source)
    sources = firstGroup + secondGroup
    self.wxSources = sources

    for source in sources:
        (id, wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source
        # Request the account's article list page
        newUrl = wx_url
        self.logDao.warn(u'进行抓取:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'weixin_page_list',
                                 'source_url': newUrl,
                                 'wx_account': wx_account,
                                 'source': source,
                                 'wx_account_id': id
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)
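# The reordering step above, written as a standalone helper for clarity. The function
# name is hypothetical; index 2 of each source tuple is wx_account, as in the unpacking above.
def prioritizeBroken(sources, brokenAccounts):
    # previously blocked accounts are retried before the rest
    first = [s for s in sources if s[2] in brokenAccounts]
    rest = [s for s in sources if s[2] not in brokenAccounts]
    return first + rest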
def process_item(self, item, spider):
    image_urls = []
    for image_url in item['image_urls']:
        url = image_url.get('url')
        urlHash = EncryptUtil.md5(url)
        path = 'full/' + str(urlHash) + '.jpg'
        detailPath = self.savePath + '/' + path
        # Make sure the download directory exists
        saveDir = self.savePath + '/full'
        if not FileUtil.dirIsExist(saveDir):
            FileUtil.createDir(saveDir)
        if FileUtil.fileIsExist(detailPath):
            spider.logDao.info(u'图片已经存在本地:' + url)
            image_url_new = {
                'ok': True,
                'x': {
                    'url': url,
                    'path': path
                }
            }
        else:
            try:
                fileResponse = requests.get(url, timeout=10)
                req_code = fileResponse.status_code
                req_msg = fileResponse.reason
                if req_code == 200:
                    with open(detailPath, 'wb') as f:
                        f.write(fileResponse.content)
                    image_url_new = {
                        'ok': True,
                        'x': {
                            'url': url,
                            'path': path
                        }
                    }
                    spider.logDao.info(u'图片成功下载:' + url)
                else:
                    spider.logDao.info(u'下载图片失败:' + url)
                    image_url_new = {
                        'ok': False,
                        'x': {
                            'url': url,
                        }
                    }
            except Exception, e:
                print e
                spider.logDao.warn(u'下载图片失败:' + url)
                image_url_new = {
                    'ok': False,
                    'x': {
                        'url': url,
                    }
                }
        image_urls.append(image_url_new)
        # Idle 2s between downloads
        TimerUtil.sleep(2)
    # Store the download results on the item and hand it to the next pipeline
    item['image_urls'] = image_urls
    return item
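# For context: a Scrapy item pipeline like process_item() above only runs once it is
# registered in settings.py via ITEM_PIPELINES. A sketch, assuming a placeholder module
# path and class name (not the project's actual layout):
ITEM_PIPELINES = {
    'crawler.pipelines.ImageDownloadPipeline': 300,
}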
def start_requests(self):
    # Network check
    if not NetworkUtil.checkNetWork():
        # check again after 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    if not NetworkUtil.checkService():
        # check again after 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    if self.request_stop:
        # Redialing takes an unpredictable amount of time, so wait before retrying
        timeSpace = time.time() - self.request_stop_time
        if timeSpace / 60 <= 2:
            # Less than 2 minutes since the stop: do not request yet
            pass
        else:
            self.request_stop = False

    # Crawl the roll list
    url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=30&spec=&type=&ch=05&k' \
          '=&offset_page=0&offset_num=0&num=60&asc=&page='
    for page in range(0, 11):
        if self.request_stop:
            # A ban or network failure was detected: stop all requests until the IP is changed
            self.logDao.warn(u'出现被绊或者出现网络异常,退出循环')
            break
        r = random.uniform(0, 1)
        newUrl = url + str(page)
        newUrl += ('&r=' + str(r))
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl, meta={'url': newUrl}, callback=self.parseList)
        # Idle 2s between list requests
        TimerUtil.sleep(2)

    if self.request_stop:
        # Signal a redial; the crawl is retried after roughly 2 minutes
        self.logDao.warn(u'发送重新拨号信号,请等待2分钟会尝试重新抓取')
        self.request_stop_time = time.time()
    else:
        # One full round finished; idle 10 minutes so outstanding requests can complete
        self.logDao.info(u'请求了一轮了,但是可能还有没有请求完成,睡一会10分钟')
        TimerUtil.sleep(10 * 60)
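# Sketch (assumption): for the request_stop flag above to take effect, the list callback
# would flip it when it detects a ban, roughly as below. The actual detection logic of this
# spider's parseList is not shown in this section; the method name markRequestStop is hypothetical.
def markRequestStop(self):
    # record the stop and its time so start_requests can back off for ~2 minutes
    self.request_stop = True
    self.request_stop_time = time.time()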
def start_requests(self):
    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    src_channel = u'网易科技'
    sub_channel = u'科技'
    # Request the list feed (the random suffix busts the cache)
    newUrl = 'http://tech.163.com/special/00094IHV/news_json.js?' + str(random.uniform(0, 1))
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'wangyi_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

    src_channel = u'网易财经'
    sub_channel = u'财经'
    # Request the list feed
    newUrl = 'http://money.163.com/special/00251G8F/news_json.js?' + str(random.uniform(0, 1))
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'wangyi_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)
def start_requests(self):
    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    # Request the scroll list page
    src_channel = u'腾讯科技'
    sub_channel = u'科技'
    newUrl = 'http://tech.qq.com/l/scroll.htm'
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'tengxun_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

    # Request the finance front page
    src_channel = u'腾讯财经'
    sub_channel = u'财经要闻'
    newUrl = 'http://finance.qq.com/'
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'tengxun_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList2,
                         dont_filter=True)
def start_requests(self):
    # unKnow = ["didalive", "HIS_Technology", "ad_helper", "zhongduchongdu"]  # accounts that cannot be found
    # TODO: the while-loops below may cause some accounts to be missed

    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    # Fetch enabled sources that have a list url
    sources = self.wxSourceDao.queryWxUrl_special(isRandom=True, wx_accounts=['qqtech'])
    for source in sources:
        (id, wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source
        # Request the account's article list page
        newUrl = wx_url
        self.logDao.warn(u'进行抓取:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'weixin_page_list',
                                 'source_url': newUrl,
                                 'wx_account': wx_account,
                                 'source': source,
                                 'wx_account_id': id
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)
def start_requests(self):
    # Skip this run if a crawl is already in progress
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    src_channel = u'搜狐科技'
    sub_channel = u'科技'
    for page in range(1, 4):
        # Request the channel feed page
        url = 'http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=30&size=20&callback=&_=1502075449669&page='
        newUrl = url + str(page)
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'sohu_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)
def start_requests(self):
    # Skip this run if a crawl is already in progress
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    # Channels: 必读 玩物 产品榜 快报 游戏要闻 单品 盘点 花边要闻 游戏快报
    cids = [{
        'src_channel': u'界面科技', 'sub_channel': u'必读', 'num': '6'
    }, {
        'src_channel': u'界面科技', 'sub_channel': u'玩物', 'num': '66'
    }, {
        'src_channel': u'界面科技', 'sub_channel': u'产品榜', 'num': '73'
    }, {
        'src_channel': u'界面科技', 'sub_channel': u'快报', 'num': '84'
    }, {
        'src_channel': u'界面游戏', 'sub_channel': u'游戏要闻', 'num': '100'
    }, {
        'src_channel': u'界面游戏', 'sub_channel': u'单品', 'num': '119'
    }, {
        'src_channel': u'界面游戏', 'sub_channel': u'盘点', 'num': '120'
    }, {
        'src_channel': u'界面游戏', 'sub_channel': u'花边要闻', 'num': '121'
    }, {
        'src_channel': u'界面游戏', 'sub_channel': u'游戏快报', 'num': '122'
    }]

    url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
    for cid in cids:
        for page in range(1, 2):
            cidNum = cid.get('num')
            src_channel = cid.get('src_channel')
            sub_channel = cid.get('sub_channel')
            newUrl = url + str(page) + ('&cid=' + cidNum)
            self.logDao.warn(u'进行抓取列表:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'jiemian_page_list',
                                     'url': newUrl,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)
def start_requests(self):
    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    src_channel = u'新浪科技'
    sub_channel = u'科技'
    # Crawl the roll list
    url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1'
    r = random.uniform(0, 1)
    newUrl = url + ('&r=' + str(r))
    self.logDao.info(u"开始抓取列表:" + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'sina_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseList)

    src_channel = u'新浪科技'
    sub_channel = u'科技'
    # Backfill anything the roll list missed
    url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254'
    r = random.uniform(0, 1)
    newUrl = url + ('&r=' + str(r))
    self.logDao.info(u"开始抓取列表:" + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'sina_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseList2)

    # 新浪财经 front page
    src_channel = u'新浪财经'
    sub_channel = u'要闻'
    url = 'http://finance.sina.com.cn/'
    newUrl = url
    self.logDao.info(u"开始抓取列表:" + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'sina_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseList3)
def parseArticle(self, response):
    body = EncodeUtil.toUnicode(response.body)
    selector = Selector(text=body)
    title = selector.xpath('//title/text()').extract_first('').strip(u' ')
    source_url = response.meta['source_url']
    wx_account = response.meta['wx_account']
    # Detect the anti-crawler captcha page
    isN = u"请输入验证码" == title
    if isN or response.status == 302:
        self.logDao.info(u'访问过多被禁止,重新拨号')
        # Remember the blocked account so it is prioritized next round
        self.brokenAccounts.append(wx_account)
        # Acquire a new IP, then idle for a while
        NetworkUtil.getNewIp()
        TimerUtil.sleep(80)
        NetworkUtil.openWebbrowser(source_url)
    else:
        title = response.meta['title']
        source_url = response.meta['source_url']
        wx_account_id = response.meta['wx_account_id']
        self.logDao.info(u'开始解析文章' + wx_account + ':' + title + ':' + source_url)
        self.logDao.info(u'开始解析文章:' + source_url)

        # Publish date; normalize a bare date to a full timestamp
        post_date = selector.xpath('//*[@id="post-date"]/text()').extract_first('')
        try:
            post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(post_date, "%Y-%m-%d"))
        except Exception:
            pass

        # Inline styles
        styles = selector.xpath('//style/text()').extract()
        styles = CssUtil.compressCss(styles).replace('\'', '"').replace('\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f3f3f3'])

        post_user = selector.xpath('//*[@id="post-user"]/text()').extract_first('')

        content_html = selector.xpath('//*[@id="js_content"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Top-level content nodes; tags that are not needed can be filtered here
        content_items = content_html.xpath('*')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # content_items_new = []
        # for item in content_items:
        #     itemStr = item.extract()
        #     if u'订阅微信' in itemStr:
        #         continue
        #     content_items_new.append(item)
        # content_items = content_items_new

        # Plain-text version of the content
        content_txt = []
        for item in content_items:
            # text nodes
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            # collect
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)

        # Rebuild the content wrapper tag
        outHtml = """<div class="rich_media_content " id="js_content">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        selector = Selector(text=content_html)

        # Collect every image url in the document; they are replaced with placeholders later
        image_urls = []
        imgs = selector.xpath('descendant::img')
        for img in imgs:
            # The url may be in src or data-src
            image_url = img.xpath('@src | @data-src').extract_first('')
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })

        self.logDao.info(wx_account + u'得到文章:' + title + ":" + post_date + ':' + post_user)
        self.logDao.info(u'得到文章:' + source_url)

        # Hash code used for de-duplication; also save the raw page
        hash_code = self.checkDao.getHashCode(title, wx_account, 1)
        self.saveFile(hash_code, body)

        # Strip img alt/title attributes (//img/@alt|//img/@title)
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')

        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = ''
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 1
        contentItem['src_account_id'] = wx_account_id
        contentItem['src_channel'] = '微信公众号'
        contentItem['src_ref'] = ''
        contentItem['wx_account'] = wx_account
        return contentItem
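# Worked example of the post_date normalization in parseArticle above: a bare date from
# the page becomes a full timestamp, with the missing time fields defaulting to zero.
import time

post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime("2017-08-07", "%Y-%m-%d"))
# post_date == "2017-08-07 00:00:00"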
def start_requests(self):
    # Skip this run if a crawl is already in progress
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Network check
    while not NetworkUtil.checkNetWork():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测网络不可行')

    # Server check
    while not NetworkUtil.checkService():
        # check again every 20s
        TimerUtil.sleep(20)
        self.logDao.warn(u'检测服务器不可行')

    src_channel = u'凤凰财经'
    sub_channel = u'电子竞技'
    url = 'http://games.ifeng.com/listpage/17886/1/list.shtml'
    styleUrlDefault = [
        'http://p2.ifengimg.com/a/2016/0523/esports.css',
        'http://y1.ifengimg.com/package/t_20130820__15953/css/pl_detail_v8.css'
    ]
    newUrl = url
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'fenghuang_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel,
                             'styleUrlDefault': styleUrlDefault
                         },
                         callback=self.parseArticleList3,
                         dont_filter=True)

    sub_channel = u'产品资讯'
    url = 'http://games.ifeng.com/listpage/27456/1/list.shtml'
    styleUrlDefault = [
        'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
    ]
    newUrl = url
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'fenghuang_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel,
                             'styleUrlDefault': styleUrlDefault
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

    sub_channel = u'热点资讯'
    url = 'http://games.ifeng.com/listpage/27455/1/list.shtml'
    styleUrlDefault = [
        'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
    ]
    newUrl = url
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'fenghuang_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel,
                             'styleUrlDefault': styleUrlDefault
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

    src_channel = u'凤凰科技'
    sub_channel = u'资讯'
    url = 'http://tech.ifeng.com/listpage/800/0/1/rtlist.shtml'
    styleUrlDefault = [
        'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
    ]
    newUrl = url
    self.logDao.warn(u'进行抓取列表:' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'fenghuang_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel,
                             'styleUrlDefault': styleUrlDefault
                         },
                         callback=self.parseArticleList2,
                         dont_filter=True)