def parseArticleList(self, response): body = EncodeUtil.toUnicode(response.body) selector = Selector(text=body) source_url = response.meta['source_url'] print source_url title = selector.xpath('//title/text()').extract_first('').strip(u' ') isN = u"请输入验证码" == title if isN or response.status == 302: self.logDao.info(u'访问过多被禁止,重新拨号') # 获取Ip # 同时空线程30s NetworkUtil.getNewIp() TimerUtil.sleep(50) NetworkUtil.openWebbrowser(source_url) else: source = response.meta['source'] wx_account = response.meta['wx_account'] wx_account_id = response.meta['wx_account_id'] self.logDao.info(u'开始解析列表:' + wx_account) # 进行解析 articleJS = selector.xpath('//script/text()').extract() for js in articleJS: if 'var msgList = ' in js: p8 = re.compile('var\s*msgList\s*=.*;') matchList = p8.findall(js) for match in matchList: match = match.lstrip('var msgList = ').rstrip(';') # 格式化 articles = demjson.decode(match) or {} articles = articles['list'] or [] self.logDao.info(u'匹配到文章列表' + wx_account) for article in articles: app_msg_ext_info = article.get( 'app_msg_ext_info') or {} desc = app_msg_ext_info.get('digest') or '' title = app_msg_ext_info.get('title') or '' # 如果存在则不抓取 if self.checkDao.checkExist(title, wx_account, 1): self.logDao.info(u'已经存在' + wx_account + ':' + title) continue detailUrl = app_msg_ext_info['content_url'] or '' if not detailUrl: continue detailUrl = "http://mp.weixin.qq.com" + detailUrl detailUrl = detailUrl.replace("amp;", "") self.logDao.info(u'抓取' + wx_account + ':' + title + ':' + detailUrl) yield scrapy.Request(url=detailUrl, meta={ 'request_type': 'weixin_detail', 'wx_account': wx_account, "source": source, "title": title, 'wx_account_id': wx_account_id, "source_url": detailUrl }, callback=self.parseArticle)
def parseArticle(self, response):
    """Parse one WeChat article page into a ContentItem.

    On a captcha page (or 302 redirect) the account is remembered in
    ``self.brokenAccounts`` and the proxy IP is rotated; otherwise the
    post date, author, inline styles, content HTML, plain text and
    image URLs are extracted and returned as a ContentItem.
    """
    body = EncodeUtil.toUnicode(response.body)
    selector = Selector(text=body)
    title = selector.xpath('//title/text()').extract_first('').strip(u' ')
    source_url = response.meta['source_url']
    wx_account = response.meta['wx_account']
    # A captcha page titles itself with this prompt.
    isN = u"请输入验证码" == title
    if isN or response.status == 302:
        self.logDao.info(u'访问过多被禁止,重新拨号')
        # Remember the account so the crawl can be retried later.
        self.brokenAccounts.append(wx_account)
        # Rotate the outbound IP, then idle before re-opening the page.
        NetworkUtil.getNewIp()
        TimerUtil.sleep(80)
        NetworkUtil.openWebbrowser(source_url)
    else:
        title = response.meta['title']
        wx_account_id = response.meta['wx_account_id']
        self.logDao.info(u'开始解析文章' + wx_account + ':' + title + ':' + source_url)
        self.logDao.info(u'开始解析文章:' + source_url)
        # Publication date, normalized to "Y-m-d H:M:S" when it parses.
        post_date = selector.xpath('//*[@id="post-date"]/text()').extract_first('')
        try:
            post_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.strptime(post_date, "%Y-%m-%d"))
        except Exception:
            # Best effort: keep the raw string when it is not "Y-m-d".
            pass
        # Compress the inline styles; escape quotes/backslashes so the
        # CSS survives later serialization.
        styles = selector.xpath('//style/text()').extract()
        styles = CssUtil.compressCss(styles).replace('\'', '"').replace('\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f3f3f3'])
        post_user = selector.xpath('//*[@id="post-user"]/text()').extract_first('')
        content_html = selector.xpath('//*[@id="js_content"]')
        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        content_items = content_html.xpath('*')
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Plain-text version of the article, one child element per line.
        content_txt = []
        for item in content_items:
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)
        # Re-wrap the children in the expected rich-media container.
        outHtml = """<div class="rich_media_content " id="js_content">${++content++}</div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)
        content_html = outHtml.replace('${++content++}', content_items)
        # One selector over the rebuilt html serves both the image scan
        # and the alt/title scan below (it was parsed twice before).
        content_selector = Selector(text=content_html)
        # Collect image URLs; lazy-loaded images use data-src.
        image_urls = []
        imgs = content_selector.xpath('descendant::img')
        for img in imgs:
            image_url = img.xpath('@src | @data-src').extract_first('')
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
        self.logDao.info(wx_account + u'得到文章:' + title + ":" + post_date + ':' + post_user)
        self.logDao.info(u'得到文章:' + source_url)
        # Archive the raw page under its dedup hash.
        hash_code = self.checkDao.getHashCode(title, wx_account, 1)
        self.saveFile(hash_code, body)
        # Strip img alt/title text out of the html so it is not rendered.
        imgAltTitles = content_selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = ''
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 1
        contentItem['src_account_id'] = wx_account_id
        contentItem['src_channel'] = '微信公众号'
        contentItem['src_ref'] = ''
        contentItem['wx_account'] = wx_account
        return contentItem