コード例 #1
0
ファイル: SinaSpider.py プロジェクト: BTYT/studyScrapy
class SinaSpider(scrapy.Spider):
    name = 'sina'
    download_delay = 20  # 基础间隔 0.5*download_delay --- 1.5*download_delays之间的随机数
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404,
                              500]  # 可以处理重定向及其他错误码导致的 页面无法获取解析的问题

    def __init__(self, name=None, **kwargs):
        """Initialise crawl state and the logging DAO.

        :param name: optional spider name. It is forwarded to the base class
            so a caller-supplied name is no longer discarded; with ``None``
            the class-level ``name`` attribute stays in effect.
        :param kwargs: extra spider arguments, forwarded unchanged.
        """
        # The original passed a hard-coded ``name=None`` here, silently
        # dropping whatever the caller supplied.
        super(SinaSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        # Checked by start_requests to stop issuing list requests.
        # Nothing in this class sets it to True; presumably an external
        # component (e.g. a middleware) does -- TODO confirm.
        self.request_stop = False
        # time.time() value stored by start_requests when a round ends with
        # request_stop set.
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'sina_detail')

    def start_requests(self):
        """Yield one list-page request for each of pages 0..10.

        The network and the backend service are probed once each; a failed
        probe only sleeps 20 seconds and logs a warning, then the crawl
        continues regardless. While ``self.request_stop`` is set no further
        pages are requested. All waiting goes through ``TimerUtil.sleep``.
        """
        # One-shot network probe (not a retry loop).
        if not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # One-shot backend-service probe.
        if not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        if self.request_stop:
            # Re-dialling takes an unknown amount of time to take effect, so
            # the stop flag is only cleared once more than two minutes have
            # passed since it was recorded.
            elapsed = time.time() - self.request_stop_time
            if elapsed / 60 > 2:
                self.request_stop = False

        base_url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=30&spec=&type=&ch=05&k' \
                   '=&offset_page=0&offset_num=0&num=60&asc=&page='

        for page_no in range(11):
            if self.request_stop:
                # Blocked or network failure: stop requesting and wait for
                # the IP to change.
                self.logDao.warn(u'出现被绊或者出现网络异常,退出循环')
                break
            # A random ``r`` query parameter is appended to every list URL.
            nonce = random.uniform(0, 1)
            list_url = base_url + str(page_no) + ('&r=' + str(nonce))
            self.logDao.info(u"开始抓取列表:" + list_url)
            yield scrapy.Request(url=list_url,
                                 meta={'url': list_url},
                                 callback=self.parseList)
            # Idle for two seconds between list requests.
            TimerUtil.sleep(2)

        if self.request_stop:
            # Record when the stop happened so the next run can wait for the
            # re-dial to take effect.
            self.logDao.warn(u'发送重新拨号信号,请等待2分钟会尝试重新抓取')
            self.request_stop_time = time.time()
            return

        # Normal completion: idle for ten minutes so requests still in
        # flight can finish.
        self.logDao.info(u'请求了一轮了,但是可能还有没有请求完成,睡一会10分钟')
        TimerUtil.sleep(10 * 60)

    # TODO。。还没有找到被禁止的情况
    def parseList(self, response):
        """Parse a roll-news list response and request each listed article.

        The body is decoded as GBK and is expected to look like
        ``var jsonData = {...};``. The JavaScript wrapper is removed and the
        remainder is decoded with ``demjson``.

        :param response: list response; ``response.meta['url']`` holds the
            requested list URL (used for logging only).
        :return: generator of ``scrapy.Request`` objects, one per entry that
            has a URL. Each carries a pre-filled ``ContentItem`` in its meta.
        """
        list_url = response.meta['url']
        text = response.body.decode('gbk').strip()
        # str.lstrip() takes a *set of characters*, not a prefix, so the
        # original call only worked while the payload happened to start with
        # a character outside that set. Remove the real prefix instead.
        js_prefix = 'var jsonData = '
        if text.startswith(js_prefix):
            text = text[len(js_prefix):]
        text = text.rstrip(';')

        data = demjson.decode(text) or {}
        # .get() keeps the "empty payload" fallback above from turning into
        # a KeyError one line later.
        entries = data.get('list') or []
        self.logDao.info(u"解析列表:" + list_url)
        for entry in entries:
            article_url = entry.get('url')
            if not article_url:
                # Nothing to request without a URL.
                continue

            channel = entry.get('channel') or {}
            contentItem = ContentItem()
            contentItem['channel_name'] = channel.get('title')
            contentItem['title'] = entry.get('title')
            contentItem['source_url'] = article_url

            # Two article page layouts are known so far; the "zl" section
            # uses a different one.
            if 'http://tech.sina.com.cn/zl/' in article_url:
                callback = self.parseDetail2
            else:
                callback = self.parseDetail

            self.logDao.info(u"开始抓取文章:" + article_url)
            yield scrapy.Request(url=article_url,
                                 meta={
                                     'contentItem': contentItem,
                                     'source_url': article_url
                                 },
                                 callback=callback)

    def parseDetail2(self, response):
        """Parse an article page that uses the ``J_Article_Wrap`` layout.

        Extracts title, date, author and tags, saves a JSON snapshot of them
        through ``saveFile`` (keyed by the MD5 of the source URL), then
        classifies every direct child of ``#artibody`` into a typed content
        block.

        :param response: article response; ``meta`` must contain
            ``source_url`` and the partially filled ``contentItem``.
        :return: the completed ``contentItem``.
        """
        source_url = response.meta['source_url']
        contentItem = response.meta['contentItem']
        selector = Selector(text=response.body)
        pl_main_content = selector.xpath(
            '//*[@id="J_Article_Wrap"]').extract_first()
        title = selector.xpath(
            '//*[@id="artibodyTitle"]/text()').extract_first() or ''
        post_date = selector.xpath(
            '//*[@id="pub_date"]/text()').extract_first() or ''
        post_date = post_date.replace('\r\n', '').strip(' ')
        post_user = selector.xpath(
            '//*[@id="media_name"]/a[1]/text()').extract_first()
        tags = ','.join(
            selector.xpath('//p[@class="art_keywords"]/a/text()').extract()
            or [])

        main = {
            'title': title,
            'post_date': post_date,
            'post_user': post_user,
            'pl_main_content': pl_main_content,
            'tags': tags
        }
        urlHash = hashlib.md5(source_url.encode('utf8')).hexdigest()
        self.saveFile(urlHash,
                      json.dumps(main, encoding="utf8", ensure_ascii=False))
        contentChilds = selector.xpath(
            '//*[@id="artibody"]/child::*').extract()

        contents = []
        image_urls = []
        for child in contentChilds:
            image_url = ''
            content = ''
            image_hash = ''
            curSelector = Selector(text=child)
            # Example pages with unusual markup:
            # http://tech.sina.com.cn/d/2017-06-28/doc-ifyhmtrw4294617.shtml
            # http://tech.sina.com.cn/zl/post/detail/i/2017-06-28/pid_8511506.htm
            if 'img' in child:
                # Image block. Some pages have no img_wrapper, only <img>;
                # 'img' is a substring of 'img_wrapper', so one test covers
                # both cases.
                block_type = 'img'
                image_url = curSelector.xpath(
                    '//img/@src').extract_first() or ''
                content = curSelector.xpath('//span/text()').extract_first()
                if image_url:
                    # The text "img" can occur without an <img src>; the
                    # original then passed None to md5.update() and crashed.
                    # Encoding explicitly also matches how source_url is
                    # hashed above.
                    image_hash = hashlib.md5(
                        image_url.encode('utf8')).hexdigest()
                    image_urls.append({'url': image_url, 'hash': image_hash})
            elif 'strong' in child:
                # Heading block.
                block_type = 'title'
                content = curSelector.xpath('//strong/text()').extract_first()
            elif 'gb2312, simkai;' in child:
                # Short description block.
                block_type = 'shortInfo'
                content = curSelector.xpath('//span/text()').extract_first()
            elif '"pictext" align="center"' in child:
                # Centered caption block.
                block_type = 'centerContent'
                content = curSelector.xpath('//p/text()').extract_first()
            else:
                # Default: ordinary paragraph text.
                block_type = 'normalContent'
                content = curSelector.xpath('//p').xpath('string(.)').extract()

            contents.append({
                'type': block_type,
                'image_url': image_url,
                'content': content,
                'image_hash': image_hash
            })
        contentItem['title'] = title
        contentItem['post_date'] = post_date
        contentItem['post_user'] = post_user
        contentItem['image_urls'] = image_urls
        contentItem['page_content'] = contents
        contentItem['tags'] = tags
        return contentItem

    def parseDetail(self, response):
        """Parse an article page that uses the ``pl_main_content`` layout.

        Extracts title, date, author and tags, saves a JSON snapshot of them
        through ``saveFile`` (keyed by the MD5 of the source URL), then
        classifies every direct child of ``#artibody`` into a typed content
        block.

        :param response: article response; ``meta`` must contain
            ``source_url`` and the partially filled ``contentItem``.
        :return: the completed ``contentItem``.
        """
        source_url = response.meta['source_url']
        contentItem = response.meta['contentItem']
        selector = Selector(text=response.body)
        pl_main_content = selector.xpath(
            '//*[@id="pl_main_content"]').extract_first()
        title = selector.xpath(
            '//*[@id="main_title"]/text()').extract_first() or ''
        post_date = selector.xpath(
            '//*[@id="page-tools"]/span/span[1]/text()').extract_first() or ''
        # The author may be plain text, a link, or both; joining an empty
        # match list yields ''.
        post_user = ''.join(selector.xpath(
            '//*[@id="page-tools"]/span/span[2]/text() | //*[@id="page-tools"]/span/span[2]/a/text()'
        ).extract())
        tags = ','.join(
            selector.xpath('//p[@class="art_keywords"]/a/text()').extract()
            or [])

        main = {
            'title': title,
            'post_date': post_date,
            'post_user': post_user,
            'pl_main_content': pl_main_content,
            'tags': tags
        }
        urlHash = hashlib.md5(source_url.encode('utf8')).hexdigest()
        self.saveFile(urlHash,
                      json.dumps(main, encoding="utf8", ensure_ascii=False))
        contentChilds = selector.xpath(
            '//*[@id="artibody"]/child::*').extract()

        contents = []
        image_urls = []
        for child in contentChilds:
            image_url = ''
            content = ''
            image_hash = ''
            curSelector = Selector(text=child)
            # Example pages with unusual markup:
            # http://tech.sina.com.cn/d/2017-06-28/doc-ifyhmtrw4294617.shtml
            # http://tech.sina.com.cn/zl/post/detail/i/2017-06-28/pid_8511506.htm
            if 'img' in child:
                # Image block. Some pages have no img_wrapper, only <img>;
                # 'img' is a substring of 'img_wrapper', so one test covers
                # both cases.
                block_type = 'img'
                image_url = curSelector.xpath(
                    '//img/@src').extract_first() or ''
                content = curSelector.xpath('//span/text()').extract_first()
                if image_url:
                    # The text "img" can occur without an <img src>; the
                    # original then passed None to md5.update() and crashed.
                    # Encoding explicitly also matches how source_url is
                    # hashed above.
                    image_hash = hashlib.md5(
                        image_url.encode('utf8')).hexdigest()
                    image_urls.append({'url': image_url, 'hash': image_hash})
            elif 'strong' in child:
                # Heading block.
                block_type = 'title'
                content = curSelector.xpath('//strong/text()').extract_first()
            elif 'font-family: KaiTi_GB2312, KaiTi;' in child:
                # Short description block.
                block_type = 'shortInfo'
                content = curSelector.xpath('//span/text()').extract_first()
            elif '"pictext" align="center"' in child:
                # Centered caption block.
                block_type = 'centerContent'
                content = curSelector.xpath('//p/text()').extract_first()
            else:
                # Default: ordinary paragraph text.
                block_type = 'normalContent'
                content = curSelector.xpath('//p').xpath('string(.)').extract()

            contents.append({
                'type': block_type,
                'image_url': image_url,
                'content': content,
                'image_hash': image_hash
            })
        contentItem['title'] = title
        contentItem['post_date'] = post_date
        contentItem['post_user'] = post_user
        contentItem['image_urls'] = image_urls
        contentItem['page_content'] = contents
        contentItem['tags'] = tags
        return contentItem

    def saveFile(self, title, content):
        """Write ``content`` UTF-8 encoded to ``html/<title>.json``.

        :param title: file stem (callers pass the MD5 hex digest of the
            article URL).
        :param content: text to store; it is encoded as UTF-8 before writing.
        """
        # Local import keeps this method self-contained.
        import os

        # Create the output directory on first use instead of failing with
        # "No such file or directory".
        if not os.path.isdir('html'):
            os.makedirs('html')
        filename = 'html/%s.json' % title
        with open(filename, 'wb') as f:
            f.write(content.encode('utf8'))
        self.log('Saved file %s' % filename)
コード例 #2
0
class WXSourceSpider(scrapy.Spider):
    name = 'wx_source'
    download_delay = 20  # 基础间隔 0.5*download_delay --- 1.5*download_delays之间的随机数
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404,
                              500]  # 可以处理重定向及其他错误码导致的 页面无法获取解析的问题

    def __init__(self, name=None, **kwargs):
        """Initialise the DAOs and bookkeeping attributes.

        :param name: optional spider name. It is forwarded to the base class
            so a caller-supplied name is no longer discarded; with ``None``
            the class-level ``name`` attribute stays in effect.
        :param kwargs: extra spider arguments, forwarded unchanged.
        """
        # The original passed a hard-coded ``name=None`` here, silently
        # dropping whatever the caller supplied.
        super(WXSourceSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.wxSourceDao = WxSourceDao()
        self.currIp = ''
        self.logDao = LogDao(self.logger, 'weixin_source_catch')
        self.dataMonitor = DataMonitorDao()

    def close(spider, reason):
        """Mark the crawl as stopped and refresh the source total.

        Writes ``'stop'`` to the status file via ``saveStatus`` and calls
        ``dataMonitor.updateTotal('weixin_source_total')``.

        NOTE(review): declared without ``self`` or ``@staticmethod``; when
        called as a bound method the first parameter receives the spider
        instance. Presumably this overrides ``scrapy.Spider.close`` -- confirm.

        :param spider: the spider instance being closed.
        :param reason: close reason; not used here.
        """
        spider.saveStatus('stop')
        spider.dataMonitor.updateTotal('weixin_source_total')

    def start_requests(self):
        """Yield one Sogou account-search request per enabled source.

        Does nothing when the status file already says ``'running'``.
        Otherwise the status is set to ``'running'``, the method waits until
        the network and the backend service probes succeed, and every source
        returned by ``wxSourceDao.queryEnable`` is marked ``'updating'``
        before its request is yielded.
        """
        # Another crawl is already in progress: do not start a second one.
        if self.getStatus() == 'running':
            return
        self.saveStatus('running')

        # Wait for the network, probing every 20 seconds.
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the backend service, probing every 20 seconds.
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        search_prefix = 'http://weixin.sogou.com/weixin?type=1&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query='

        # Sources to crawl: enabled ones that either failed to update or
        # whose last update is older than 40 minutes (selection is done by
        # the DAO).
        enabled_sources = self.wxSourceDao.queryEnable(isRandom=True)
        for source in enabled_sources:
            # Each row is a 7-tuple; only the account id is needed here.
            (_wx_name, wx_account, _wx_url, _wx_avatar, _update_status,
             _is_enable, _update_time) = source
            # Mark as in progress; parseList later records success/failure.
            self.wxSourceDao.updateStatus(wx_account, 'updating')
            search_url = search_prefix + wx_account
            self.logDao.warn(u'进行抓取:' + search_url)
            request_meta = {
                'request_type': 'weixin_source',
                'url': search_url,
                'wx_account': wx_account,
                'source': source
            }
            yield scrapy.Request(url=search_url,
                                 meta=request_meta,
                                 callback=self.parseList,
                                 dont_filter=True)

    def parseList(self, response):
        """Find the requested account in a Sogou search result page.

        A 302 response is treated as "blocked": the source is marked
        ``'updateFail'``, a new IP is requested and the method sleeps 30
        seconds. Otherwise each result entry is compared with the requested
        account. On a match the source's name and URL are stored with status
        ``'last'``; with no match the status becomes ``'none'``.

        :param response: search response; ``meta['wx_account']`` is the
            account id that was searched for.
        """
        wx_account = response.meta['wx_account']
        body = EncodeUtil.toUnicode(response.body)

        # Blocked for requesting too often: re-dial and back off.
        if response.status == 302:
            self.logDao.warn(u'您的访问过于频繁,重新拨号')
            self.wxSourceDao.updateStatus(wx_account, 'updateFail')
            NetworkUtil.getNewIp()
            TimerUtil.sleep(30)
            return

        self.logDao.info(u'开始解析:' + wx_account)
        selector = Selector(text=body)
        results = selector.xpath('//*[@id="main"]/div[4]/ul/li')
        self.logDao.info(u'列表长度:' + str(len(results)))
        for result in results:
            # XPaths must be relative to ``result``. The original used
            # absolute ``//...`` paths, which search the whole document, so
            # every iteration re-read the first entry on the page.
            # NOTE(review): the name path assumes each <li> is itself the
            # "sogou_vr_11002301_box_N" element the original addressed by
            # id -- confirm against a live page.
            wx_name = result.xpath(
                './div/div[2]/p[1]/a/text()').extract_first()
            wx_account_ = result.xpath(
                './/p[@class="info"]/label/text()').extract_first()
            wx_url = result.xpath(
                './/p[@class="tit"]/a/@href').extract_first()
            if wx_account_ == wx_account:
                self.logDao.info(u'成功抓取:' + wx_account_)
                self.wxSourceDao.updateSource(wx_account, wx_name, wx_url,
                                              'last')
                return

        # Log the account that was searched for. The original concatenated
        # the loop variable ``wx_account_``, which is unbound when the result
        # list is empty and may be None otherwise.
        self.logDao.info(u'没有抓到:' + wx_account)
        self.wxSourceDao.updateStatus(wx_account, 'none')

    def getStatus(self):
        """Return the ``status`` value stored in ``catchStatus.json``.

        :return: the stored status, or ``None`` when the file is missing,
            unreadable, empty/invalid JSON, or has no ``status`` key.
        """
        # ``with`` already closes the file. The original ``finally`` block
        # referenced ``load_f`` even when open() had failed, replacing the
        # real error with an UnboundLocalError. A missing or corrupt status
        # file (e.g. on the very first run) now simply means "no status".
        try:
            with open("catchStatus.json", 'r') as load_f:
                payload = json.load(load_f)
        except (IOError, OSError, ValueError):
            return None
        if not isinstance(payload, dict):
            return None
        return payload.get('status')

    def saveStatus(self, status):
        """Overwrite ``catchStatus.json`` with ``{"status": status}``.

        :param status: JSON-serialisable status value (``'running'`` and
            ``'stop'`` are the values written by this class).
        :raises IOError: (``OSError`` on Python 3) when the file cannot be
            opened for writing.
        """
        # ``with`` closes the file on every path. The original ``finally``
        # block referenced ``f`` even when open() had failed, which replaced
        # the real I/O error with an UnboundLocalError.
        with open("catchStatus.json", "w") as f:
            json.dump({'status': status}, f)
コード例 #3
0
class DetailSpider(scrapy.Spider):
    name = 'demoName_detail'
    download_delay = 2.5  # 基础间隔 0.5*download_delay --- 1.5*download_delays之间的随机数
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404,
                              500]  # 可以处理重定向及其他错误码导致的 页面无法获取解析的问题

    def __init__(self, name=None, **kwargs):
        """Initialise crawl state, the DAOs and the stylesheet cache.

        :param name: optional spider name. It is forwarded to the base class
            so a caller-supplied name is no longer discarded; with ``None``
            the class-level ``name`` attribute stays in effect.
        :param kwargs: extra spider arguments, forwarded unchanged.
        """
        # The original passed a hard-coded ``name=None`` here, silently
        # dropping whatever the caller supplied.
        super(DetailSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'demoName_list_detail')
        self.checkDao = CheckDao()
        # Stylesheet cache: hash of the stylesheet URL -> downloaded CSS.
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        # Set by start_requests when it bails out because another crawl is
        # already running; close() then leaves the status file untouched.
        self.isRunningStop = False

    def close(spider, reason):
        """Write ``'stop'`` to the status file unless this run never started.

        NOTE(review): declared without ``self`` or ``@staticmethod``; when
        called as a bound method the first parameter receives the spider
        instance. Presumably this overrides ``scrapy.Spider.close`` -- confirm.

        :param spider: the spider instance being closed.
        :param reason: close reason; not used here.
        """
        if not spider.isRunningStop:
            # If this spider bailed out at start-up because another crawl was
            # still running, the status must not be reset to 'stop';
            # otherwise it is.
            spider.saveStatus('stop')
        # spider.dataMonitor.updateTotal('demoName_total')
        pass

    def start_requests(self):
        """Yield the single request for the scrolling news list page.

        When the status file already says ``'running'`` the method only sets
        ``self.isRunningStop`` and yields nothing. Otherwise it records
        ``'running'``, waits until the network and backend-service probes
        succeed, and requests the list page.
        """
        # Another crawl is already in progress: remember that and bail out.
        if self.getStatus() == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait for the network, probing every 20 seconds.
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the backend service, probing every 20 seconds.
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        list_url = 'http://tech.qq.com/l/scroll.htm'
        self.logDao.warn(u'进行抓取列表:' + list_url)
        request_meta = {
            'request_type': 'demoName_page_list',
            'url': list_url
        }
        yield scrapy.Request(url=list_url,
                             meta=request_meta,
                             callback=self.parseArticleList,
                             dont_filter=True)

    # TODO...还没有遇到被禁止的情况
    def parseArticleList(self, response):
        """Request every article on the list page that is not stored yet.

        For each ``<li>`` of the news list the link, title and date text are
        read; the date text is prefixed with the current year. Entries
        without a link, or whose link ``checkDao.checkExist`` reports as
        known, are skipped.

        :param response: response for the list page.
        :return: generator of ``scrapy.Request`` objects for new articles.
        """
        body = EncodeUtil.toUnicode(response.body)

        # Placeholder: no "blocked" response has been observed so far, so
        # the detection is hard-wired to False.
        blocked = False
        if blocked:
            self.logDao.info(u'访问过多被禁止')
            return

        self.logDao.info(u'开始解析列表')
        entries = Selector(text=body).xpath('//div[@class="mod newslist"]//li')
        for entry in entries:
            source_url = entry.xpath('a/@href').extract_first('')
            title = entry.xpath('a/text()').extract_first('')
            date_text = entry.xpath('span/text()').extract_first('')
            if not source_url:
                continue

            # Prefix the list page's date text with the current year.
            post_date = time.strftime('%Y', time.localtime()) + u'年' + date_text

            # Skip articles that are already stored.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + title + ':' + post_date +
                                 ':' + source_url)
                continue

            self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' +
                             source_url)
            request_meta = {
                'request_type': 'demoName_detail',
                "title": title,
                'post_date': post_date,
                "source_url": source_url
            }
            yield scrapy.Request(url=source_url,
                                 meta=request_meta,
                                 callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse one article page into a ``ContentItem``.

        Steps, in order: collect and compress the page's stylesheets (cached
        in ``self.css`` by URL hash), read category / author / source / time,
        normalise the date, extract the children of
        ``#Cnt-Main-Article-QQ`` (minus script/style/iframe and one widget
        class), build plain text and a re-wrapped HTML fragment, absolutise
        protocol-relative image URLs, strip image alt/title texts from the
        HTML, and fill the item.

        :param response: article response; ``meta`` must contain ``title``,
            ``post_date`` and ``source_url``.
        :return: the populated ``ContentItem``, or ``None`` when the article
            container or its usable children are missing.
        """
        body = EncodeUtil.toUnicode(response.body)
        # Placeholder: no "blocked" response has been observed so far, so the
        # first branch is never taken.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)

            selector = Selector(text=body)

            # Collect the page's stylesheets.
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                # Protocol-relative URL: assume http.
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # Use the hash of the URL as the cache key.
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and remember it.
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            # Single quotes become double quotes and backslashes are doubled
            # in the compressed CSS.
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')

            # Replace the links inside the styles.
            styles = CssUtil.clearUrl(styles)

            category = selector.xpath(
                '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
            ).extract_first('')

            post_user = selector.xpath(
                '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
            ).extract_first('')

            src_ref = selector.xpath(
                '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
            ).extract_first('')

            a_time = selector.xpath(
                '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
            ).extract_first('')

            # Prefer the time printed on the article page over the date that
            # came from the list page.
            if a_time:
                post_date = a_time
            else:
                post_date = (post_date or '')

            # Turn "YYYY年MM月DD日" into "YYYY-MM-DD " and replace
            # non-breaking spaces with plain spaces.
            post_date = post_date.replace(u'年', '-').replace(
                u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')

            # Normalise to "YYYY-MM-DD HH:MM:SS"; when the text does not
            # match the expected format it is kept unchanged.
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                pass

            content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Drop inner tags that are not needed.
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe") and not(boolean(@class="rv-root-v2 rv-js-root"))]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Build the plain-text version: one line per child element.
            content_txt = []
            for item in content_items:
                # All text nodes of this child, tabs removed.
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append to the result.
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Re-wrap the kept children in a fresh container element.
            outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Collect every image URL in the fragment and rewrite
            # protocol-relative ones to absolute http URLs.
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image may live in src or data-src (only src is read
                # here).
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    # NOTE(review): this replaces every occurrence in the
                    # fragment; if the same "//..." src appears on two <img>
                    # tags the second pass would prefix "http:" again --
                    # verify whether duplicates can occur.
                    content_html = content_html.replace(
                        image_url_base, image_url)

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Get the hash code for this URL.
            hash_code = self.checkDao.getHashCode(source_url)

            # Remove the alt / title texts of images.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Every non-blank alt/title value is removed wherever it occurs
            # in the fragment (see //img/@alt|//img/@title above).
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 3
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '腾讯科技'
            contentItem['src_ref'] = src_ref
            return contentItem

    def saveFile(self, title, content):
        """Do nothing; saving the raw page is currently disabled.

        :param title: unused.
        :param content: unused.
        :return: always ``None``.
        """
        # TODO: saving is switched off for now; keeping the pages for reuse
        # did not prove worthwhile.
        return None

    def getStatus(self):
        """Return the ``status`` value stored in ``catchStatus.json``.

        :return: the stored status, or ``None`` when the file is missing,
            unreadable, empty/invalid JSON, or has no ``status`` key.
        """
        # A missing or corrupt status file (e.g. on the very first run) used
        # to raise and abort start_requests; it now means "no status".
        # ``with`` already closes the file, so the extra close() is gone.
        try:
            with open("catchStatus.json", 'r') as loadF:
                payload = json.load(loadF)
        except (IOError, OSError, ValueError):
            return None
        if not isinstance(payload, dict):
            return None
        return payload.get('status')

    def saveStatus(self, status):
        """Overwrite ``catchStatus.json`` with ``{"status": status}``.

        :param status: JSON-serialisable status value (``'running'`` and
            ``'stop'`` are the values written by this class).
        """
        # The context manager closes the file on every path, so no separate
        # close step is needed.
        with open("catchStatus.json", "w") as status_file:
            json.dump({'status': status}, status_file)
コード例 #4
0
ファイル: SinaSpider2.py プロジェクト: BTYT/studyScrapy
class SinaSpider(scrapy.Spider):
    name = 'sina2'
    download_delay = 2.5  # 基础间隔 0.5*download_delay --- 1.5*download_delays之间的随机数
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]  # 可以处理重定向及其他错误码导致的 页面无法获取解析的问题

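    # Error log: [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://tech.sina.com.cn/i/2017-07-18/doc-ifyiakur9086112.shtml> (failed 1 times): TCP connection timed out: 10060: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.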
    # [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://tech.sina.com.cn/i/2017-07-17/doc-ifyiakwa4300270.shtml> (failed 1 times): User timeout caused connection failure: Getting http://tech.sina.com.cn/i/2017-07-17/doc-ifyiakwa4300270.shtml took longer than 180.0 seconds..
    def __init__(self, name=None, **kwargs):
        """Initialise the DAOs and the stylesheet cache.

        :param name: optional spider name. It is forwarded to the base class
            so a caller-supplied name is no longer discarded; with ``None``
            the class-level ``name`` attribute stays in effect.
        :param kwargs: extra spider arguments, forwarded unchanged.
        """
        # The original passed a hard-coded ``name=None`` here, silently
        # dropping whatever the caller supplied.
        super(SinaSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.logDao = LogDao(self.logger, 'sina_detail')
        self.checkDao = CheckDao()
        # Stylesheet cache: hash of the stylesheet URL -> downloaded CSS.
        self.css = {
            'hash': 'style'
        }
        self.dataMonitor = DataMonitorDao()
        # Logged so that every re-instantiation of the spider is visible.
        self.logger.info(u'重走init')

    def close(spider, reason):
        """Refresh the ``'sina_total'`` counter when the spider closes.

        NOTE(review): declared without ``self`` or ``@staticmethod``; when
        called as a bound method the first parameter receives the spider
        instance. Presumably this overrides ``scrapy.Spider.close`` -- confirm.

        :param spider: the spider instance being closed.
        :param reason: close reason; not used here.
        """
        spider.dataMonitor.updateTotal('sina_total')

    def start_requests(self):
        """Yield the two list requests: the roll-news feed and a backfill feed.

        Waits until the network and backend-service probes succeed first.
        A random ``r`` query parameter is appended to each list URL.
        """
        # Wait for the network, probing every 20 seconds.
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the backend service, probing every 20 seconds.
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # (list URL, callback) pairs, in request order. The second feed
        # fills gaps left by the first.
        feeds = (
            ('http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1',
             self.parseList),
            ('http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254',
             self.parseList2),
        )
        for base_url, handler in feeds:
            nonce = random.uniform(0, 1)
            list_url = base_url + ('&r=' + str(nonce))
            self.logDao.info(u"开始抓取列表:" + list_url)
            yield scrapy.Request(url=list_url,
                                 meta={'request_type': 'sina_list', 'url': list_url},
                                 callback=handler)

    # TODO。。还没有找到被禁止的情况
    def parseList2(self, response):
        """Parse the backfill feed and request every article not stored yet.

        The payload is decoded with ``demjson``; articles are read from
        ``result.data``. Entries without a URL, or whose URL
        ``checkDao.checkExist`` reports as known, are skipped.

        :param response: feed response; ``meta['url']`` is the requested URL
            (used for logging only).
        :return: generator of ``scrapy.Request`` objects handled by
            ``parseDetail2``.
        """
        data = EncodeUtil.toUnicode(response.body)
        # TODO: no "blocked" response has been observed yet; add the
        # detection here once one is known.
        url = response.meta['url']
        self.logDao.info(u'开始解析列表' + url)

        data = demjson.decode(data) or {}

        # ``or`` fallbacks also cover explicit nulls in the payload, which
        # ``.get(key, default)`` alone would pass through.
        result = data.get('result') or {}
        entries = result.get('data') or []

        for entry in entries:
            channel_name = u'科技'
            title = entry.get('title') or ''
            source_url = entry.get('url') or ''
            if not source_url:
                # The original mixed entry['url'] and entry.get('url', ''),
                # so an entry without a URL reached the Request call.
                continue
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在:' + title + source_url)
                continue
            self.logDao.info(u"开始抓取文章:" + source_url)
            yield scrapy.Request(url=source_url,
                                 meta={'request_type': 'sina_detail', 'category': channel_name,
                                       'title': title, 'source_url': source_url},
                                 callback=self.parseDetail2)

    # TODO。。还没有找到被禁止的情况
    def parseList(self, response):
        """Parse the roll-news feed and request every article not stored yet.

        The body is expected to look like ``var jsonData = {...};``. The
        JavaScript wrapper is removed and the rest is decoded with
        ``demjson``. Entries without a URL, or whose URL
        ``checkDao.checkExist`` reports as known, are skipped.

        :param response: feed response; ``meta['url']`` is the requested URL
            (used for logging only).
        :return: generator of ``scrapy.Request`` objects handled by
            ``parseDetail2``.
        """
        data = EncodeUtil.toUnicode(response.body)
        # TODO: no "blocked" response has been observed yet; add the
        # detection here once one is known.
        url = response.meta['url']
        self.logDao.info(u'开始解析列表' + url)

        # str.lstrip() takes a *set of characters*, not a prefix, so the
        # original call only worked while the payload happened to start with
        # a character outside that set. Remove the real prefix instead.
        js_prefix = 'var jsonData = '
        data = data.strip()
        if data.startswith(js_prefix):
            data = data[len(js_prefix):]
        data = data.rstrip(';')

        data = demjson.decode(data) or {}

        # ``or`` fallbacks also cover explicit nulls in the payload.
        entries = data.get('list') or []

        for entry in entries:
            channel = entry.get('channel') or {}
            channel_name = channel.get('title') or ''
            title = entry.get('title') or ''
            source_url = entry.get('url') or ''
            if not source_url:
                # The original mixed entry['url'] and entry.get('url', ''),
                # so an entry without a URL reached the Request call.
                continue
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在:' + title + source_url)
                continue
            self.logDao.info(u"开始抓取文章:" + source_url)
            yield scrapy.Request(url=source_url,
                                 meta={'request_type': 'sina_detail', 'category': channel_name,
                                       'title': title, 'source_url': source_url},
                                 callback=self.parseDetail2)

    def parseDetail2(self, response):
        """Parse a Sina article page into a ``ContentItem``.

        Reads ``category``, ``title`` and ``source_url`` from ``response.meta``,
        collects the page's linked stylesheets (cached per URL hash in
        ``self.css``), extracts publish date, source, author and tags, rebuilds
        the article body from the children of ``#artibody`` and lists every
        absolute image URL found in it.

        Returns the populated ``ContentItem``, or ``None`` when the page has no
        ``#artibody`` element or that element has no usable children.

        NOTE(review): ``self.css`` and ``self.checkDao`` are not set up in the
        visible part of this class - presumably created in ``__init__``; confirm.
        """
        body = EncodeUtil.toUnicode(response.body)
        # Placeholder for "too many requests / banned" detection; the branch is
        # currently unreachable.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            category = response.meta['category']
            title = response.meta['title']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + category + ':' + source_url)

            selector = Selector(text=body)

            # Collect the stylesheets
            styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                # Use the hash of the URL as the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download it and keep it
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            # Single quotes become double quotes and backslashes are doubled
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
            styles = CssUtil.clearUrl(styles)
            # Drop overflow rules from the collected CSS
            styles = styles.replace('overflow-x:hidden', '').replace('overflow:hidden', '')

            post_date = selector.xpath('//*[@id="pub_date"]/text() | //*[@class="titer"]/text()').extract_first('')
            # Turn "YYYY年MM月DD日HH:MM" into "YYYY-MM-DD HH:MM"
            post_date = post_date.replace('\r\n', '').strip(' ').replace(u'年', '-').replace(u'月', '-').replace(u'日', ' ')

            try:
                post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                # Best effort: keep the unparsed string when the format differs
                pass

            src_ref = selector.xpath(
                '//*[@id="media_name"]/a[1]/text() | //*[@class="source"]/a/text() | //*[@class="source"]/text()').extract_first(
                '')

            post_user = selector.xpath('//*[@id="author_ename"]/a/text()').extract_first('')

            tags = selector.xpath('//p[@class="art_keywords"]/a/text()').extract() or []
            tags = ','.join(tags)

            content_html = selector.xpath('//*[@id="artibody"][1]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Drop unwanted inner tags  2017-07-24 19:23
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath('*[not(boolean(@class="entHdPic")) and not(name(.)="script")]')

            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Build the plain-text version
            content_txt = []
            for item in content_items:
                # Text of this element
                # TODO: later, extract heading-type elements
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                if u'来源:' in allTxt and len(allTxt) < 25:
                    # A short "source:" line inside the body is taken as the real source
                    if not post_user:
                        # No author found: reuse the previous source value as the author first
                        post_user = src_ref
                    src_ref = allTxt.replace(u'来源:', '').strip(u' ')
                # Append
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)
            # Assemble the new content markup
            outHtml = """<div class="content_wrappr_left"><div class="content"><div class="BSHARE_POP blkContainerSblkCon clearfix blkContainerSblkCon_16" id="artibody">${++content++}</div></div></div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Collect every image URL in the rebuilt document
            # (the original note also mentions replacing them with markers; no
            # replacement happens in this method)
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL is read from src
                image_url = img.xpath('@src').extract_first()
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Get the hash code for this URL
            hash_code = self.checkDao.getHashCode(source_url)

            # Remove the alt/title texts of images
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Handles the tooltip text of img elements, see //img/@alt|//img/@title
            # NOTE(review): the replace below removes the text everywhere in the
            # HTML, not only inside the attribute - confirm that is intended.
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 2
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '新浪科技'
            contentItem['src_ref'] = src_ref
            return contentItem

    def saveFile(self, title, content):
        """Write ``content`` UTF-8 encoded to ``html/<title>.html`` and log the path.

        The ``html`` directory is created on demand, so the first save in a
        fresh working directory no longer fails with a missing-directory error.

        :param title: file name stem (callers pass an md5 hash of the URL).
        :param content: text to store; it is encoded as UTF-8 before writing.
        """
        import os

        directory = 'html'
        if not os.path.isdir(directory):
            os.makedirs(directory)
        filename = 'html/%s.html' % title
        with open(filename, 'wb') as f:
            f.write(content.encode("utf8"))
        self.log('Saved file %s' % filename)
コード例 #5
0
class DetailSpider(scrapy.Spider):
    name = 'jiemian_detail'
    download_delay = 2.5  # 基础间隔 0.5*download_delay --- 1.5*download_delays之间的随机数
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404,
                              500]  # 可以处理重定向及其他错误码导致的 页面无法获取解析的问题

    def __init__(self, name=None, **kwargs):
        """Set up crawl state, logging, duplicate checking and the CSS cache.

        :param name: optional spider name; forwarded to ``scrapy.Spider`` so an
            explicit name is honoured (it used to be discarded because ``None``
            was always passed on). With the default ``None`` the class-level
            ``name`` is used, exactly as before.
        :param kwargs: extra spider arguments, forwarded to ``scrapy.Spider``.
        """
        super(DetailSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'jiemian_list_detail')
        self.checkDao = CheckDao()
        # Cache of downloaded CSS, keyed by the hash of the stylesheet URL
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        # Set by start_requests when another crawl was already flagged as running
        self.isRunningStop = False

    def close(spider, reason):
        """Mark the crawl as stopped when this spider actually ran.

        If start-up found another crawl already flagged as running
        (``isRunningStop`` is set), the stored status is left untouched.
        """
        if spider.isRunningStop:
            return
        spider.saveStatus('stop')
        # spider.dataMonitor.updateTotal('jiemian_total')

    def start_requests(self):
        """Yield one list-page request per jiemian channel.

        Nothing is yielded when the stored status is already ``'running'``; in
        that case ``isRunningStop`` is set so ``close`` leaves the status alone.
        Otherwise the status is set to ``'running'`` and the method blocks
        (polling every 20 seconds) until the network and service checks pass.
        """
        # A crawl that is flagged as running must not be started a second time
        if self.getStatus() == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait for the network, checking every 20s
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the service, checking every 20s
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        tech_channel = u'界面科技'
        game_channel = u'界面游戏'
        # (src_channel, sub_channel, cid) for every list that is crawled
        channels = (
            (tech_channel, u'必读', '6'),
            (tech_channel, u'玩物', '66'),
            (tech_channel, u'产品榜', '73'),
            (tech_channel, u'快报', '84'),
            (game_channel, u'游戏要闻', '100'),
            (game_channel, u'单品', '119'),
            (game_channel, u'盘点', '120'),
            (game_channel, u'花边要闻', '121'),
            (game_channel, u'游戏快报', '122'),
        )
        list_url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
        for src_channel, sub_channel, cid_num in channels:
            # Only the first page of each channel is requested
            for page in range(1, 2):
                newUrl = list_url + str(page) + ('&cid=' + cid_num)
                self.logDao.warn(u'进行抓取列表:' + newUrl)
                request_meta = {
                    'request_type': 'jiemian_page_list',
                    'url': newUrl,
                    'src_channel': src_channel,
                    'sub_channel': sub_channel
                }
                yield scrapy.Request(url=newUrl,
                                     meta=request_meta,
                                     callback=self.parseArticleList,
                                     dont_filter=True)

    # TODO...还没有遇到被禁止的情况
    def parseArticleList(self, response):
        """Parse a jiemian ajax list response and yield a request per new article.

        The body is decoded as a parenthesis-wrapped JSON object whose ``rst``
        field holds an HTML fragment. Entries without a URL and entries that
        ``checkDao`` already knows are logged and skipped.
        """
        body = EncodeUtil.toUnicode(response.body)
        # TODO: no "banned" response has been observed yet, so none is detected here
        # Strip the wrapping parentheses before decoding
        payload = demjson.decode(body.lstrip('(').rstrip(')')) or {}
        rst = payload.get('rst', '')
        if not rst:
            self.logDao.info(u'不存在内容')
            return
        self.logDao.info(u'开始解析列表')
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']

        header_link = './/div[@class="news-header"]//a'
        fragment = Selector(text=rst)
        for article in fragment.xpath(
                '//div[boolean(contains(@class,"news-view"))]'):
            source_url = article.xpath(header_link + '/@href').extract_first('')
            title = article.xpath(
                header_link + '/@title | ' + header_link + '/text()'
            ).extract_first('')
            post_date = article.xpath(
                './/div[@class="news-footer"]//span[@class="date"]/text()'
            ).extract_first('')
            tags = article.xpath('.//div[@class="news-tag"]/a/text()').extract()

            summary = title + ':' + source_url + ':' + post_date

            if not source_url:
                self.logDao.info(u'文章不存在' + summary)
                continue

            # Skip articles that were crawled before
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + summary)
                continue

            self.logDao.info(u'抓取文章' + summary)

            detail_meta = {
                'request_type': 'jiemian_detail',
                "title": title,
                'post_date': post_date,
                'sub_channel': sub_channel,
                'src_channel': src_channel,
                'tags': tags,
                'source_url': source_url
            }
            yield scrapy.Request(url=source_url,
                                 meta=detail_meta,
                                 callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse a jiemian article page into a ``ContentItem``.

        Reads title, post date, source URL, channels and list-page tags from
        ``response.meta``, collects the page's stylesheets (cached per URL hash
        in ``self.css``), rebuilds the article HTML from ``article-img`` and the
        filtered children of ``article-content``, and lists all image URLs.

        Protocol-relative image sources (``//host/path``) are rewritten to
        ``http://host/path``. The rewrite skips occurrences that already follow
        a scheme, so an image that appears more than once is no longer turned
        into ``http:http://...``.

        Returns the populated ``ContentItem``, or ``None`` when the page has no
        ``article-content`` block.
        """
        import re

        body = EncodeUtil.toUnicode(response.body)
        # TODO: no "banned" page has been observed yet, so none is detected here
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        sub_channel = response.meta['sub_channel']
        src_channel = response.meta['src_channel']
        tags = response.meta['tags']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                         source_url)

        selector = Selector(text=body)

        # Collect the stylesheets, downloading each URL only once
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # Use the hash of the URL as the cache key
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download it and keep it
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')

        # Clean links and the page background colour out of the styles
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f5f5f5'])

        post_user = selector.xpath(
            '//div[@class="article-info"]//span[@class="author"]//text()'
        ).extract_first('')

        # Default source is the channel; replaced below if the article names one
        src_ref = src_channel

        # The date on the article page replaces the one from the list page
        post_date = selector.xpath(
            '//div[@class="article-info"]//span[@class="date"]//text()'
        ).extract_first('')

        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y/%m/%d %H:%M"))
        except ValueError:
            # Best effort: keep the raw string when it is not "YYYY/MM/DD HH:MM"
            pass

        tags_ = selector.xpath(
            '//div[@class="article-info"]//*[@class="tags"]//text()'
        ).extract()
        tags = tags + tags_
        tags = ','.join(tags)

        # Page layout:
        #   article-main
        #       article-img
        #       article-content
        #           p
        #           article-source
        #               p: source
        #               p: "download the app" prompt (not crawled)

        # article-img block
        article_img = selector.xpath(
            '//div[@class="article-main"]/div[@class="article-img"]'
        ).extract_first('')

        # article-content block
        article_content = selector.xpath(
            '//div[@class="article-main"]/div[@class="article-content"]'
        ).extract_first('')

        if not article_content:
            self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' +
                             post_date)
            return

        contentSelector = Selector(text=article_content)
        # Children of article-content without scripts, iframes, styles, app
        # links, the share view and the source block
        content_items = contentSelector.xpath(
            '//div[@class="article-content"]/*[not(name(.)="script") and not('
            'name(.)="iframe") and not(name(.)="style") and not(boolean( '
            'contains(a//@href,"?m=app"))) and not(boolean(@class="share-view" '
            'or @class="article-source"))]')

        # Use the source named in the article, if any
        contentSource = contentSelector.xpath(
            '//div[@class="article-content"]/div[@class="article-source"]/p/text()'
        ).extract_first('')
        if contentSource:
            contentSource = contentSource.replace(u'来源:', u'')
            src_ref = contentSource

        # Build the plain-text version, one line per content element
        content_txt = []
        for item in content_items:
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)

        # Assemble the new content markup (the embedded line break and the
        # indentation are part of the template and kept as they were)
        outHtml = (u'<div class="article-main">${++articleImg++}'
                   u'<div class="article-content" style="font-family:\n'
                   u'            \'Microsoft YaHei\', 黑体;">'
                   u'${++content++}</div></div> ')

        content_items = content_items.extract()
        content_items = ''.join(content_items)

        content_html = outHtml.replace('${++articleImg++}',
                                       article_img).replace(
                                           '${++content++}', content_items)

        selector = Selector(text=content_html)
        # Collect every image URL and make protocol-relative ones absolute
        image_urls = []
        imgs = selector.xpath('descendant::img')

        for img in imgs:
            # The image URL is read from src
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                if image_url != image_url_base:
                    # Only touch occurrences that do not already follow a
                    # scheme ("http:"), so repeated images are prefixed once
                    content_html = re.sub(
                        r'(?<!:)' + re.escape(image_url_base),
                        lambda match, absolute=image_url: absolute,
                        content_html)

        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)

        # Get the hash code for this URL
        hash_code = self.checkDao.getHashCode(source_url)

        # Remove the alt/title texts of images
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        # NOTE(review): this removes the text everywhere in the HTML, not only
        # inside the attribute - confirm that is intended.
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')

        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = post_user
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 5
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem

    def saveFile(self, title, content):
        """Do nothing; saving the raw page is currently switched off.

        The disabled implementation wrote ``content`` UTF-8 encoded to
        ``html/<title>.html`` and logged the file name.
        """
        # TODO: not saved for now - reusing the stored pages did not work well
        return None

    def getStatus(self):
        """Return the crawl status stored in ``catchStatus.json``.

        :return: the value of the ``status`` key, or ``None`` when the file is
            missing, unreadable, not valid JSON or not a JSON object. Returning
            ``None`` lets a first run (no status file yet) start normally
            instead of failing in ``start_requests``.
        """
        try:
            # The with-block closes the file; no manual close is needed
            with open("catchStatus.json", 'r') as loadF:
                stored = json.load(loadF)
        except (IOError, ValueError):
            # IOError: file missing/unreadable; ValueError: malformed JSON
            return None
        if not isinstance(stored, dict):
            return None
        return stored.get('status')

    def saveStatus(self, status):
        """Overwrite ``catchStatus.json`` with ``{"status": <status>}``."""
        record = {'status': status}
        # The with-block takes care of closing the file
        with open("catchStatus.json", "w") as status_file:
            json.dump(record, status_file)
コード例 #6
0
class DetailSpider(scrapy.Spider):
    name = 'fenghuang_detail'
    # 基础间隔 0.5*download_delay --- 1.5*download_delays之间的随机数
    download_delay = 2.5
    # 可以处理重定向及其他错误码导致的 页面无法获取解析的问题
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        """Set up crawl state, logging, duplicate checking and the CSS cache.

        :param name: optional spider name; forwarded to ``scrapy.Spider`` so an
            explicit name is honoured (it used to be discarded because ``None``
            was always passed on). With the default ``None`` the class-level
            ``name`` is used, exactly as before.
        :param kwargs: extra spider arguments, forwarded to ``scrapy.Spider``.
        """
        super(DetailSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'fenghuang_list_detail')
        self.checkDao = CheckDao()
        # Cache of downloaded CSS, keyed by the hash of the stylesheet URL
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        # Set by start_requests when another crawl was already flagged as running
        self.isRunningStop = False

    def close(spider, reason):
        """Mark the crawl as stopped when this spider actually ran.

        If start-up found another crawl already flagged as running
        (``isRunningStop`` is set), the stored status is left untouched.
        """
        if spider.isRunningStop:
            return
        spider.saveStatus('stop')
        # spider.dataMonitor.updateTotal('fenghuang_total')

    def start_requests(self):
        """Yield the four ifeng list-page requests.

        Nothing is yielded when the stored status is already ``'running'``; in
        that case ``isRunningStop`` is set so ``close`` leaves the status alone.
        Otherwise the status is set to ``'running'`` and the method blocks
        (polling every 20 seconds) until the network and service checks pass.
        """
        # A crawl that is flagged as running must not be started a second time
        if self.getStatus() == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait for the network, checking every 20s
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the service, checking every 20s
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        finance = u'凤凰财经'
        tech = u'凤凰科技'
        responsive_css = 'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
        # (src_channel, sub_channel, list url, default stylesheets, list parser)
        listings = [
            (finance, u'电子竞技',
             'http://games.ifeng.com/listpage/17886/1/list.shtml',
             ['http://p2.ifengimg.com/a/2016/0523/esports.css',
              'http://y1.ifengimg.com/package/t_20130820__15953/css/pl_detail_v8.css'],
             self.parseArticleList3),
            (finance, u'产品资讯',
             'http://games.ifeng.com/listpage/27456/1/list.shtml',
             [responsive_css],
             self.parseArticleList),
            (finance, u'热点资讯',
             'http://games.ifeng.com/listpage/27455/1/list.shtml',
             [responsive_css],
             self.parseArticleList),
            (tech, u'资讯',
             'http://tech.ifeng.com/listpage/800/0/1/rtlist.shtml',
             [responsive_css],
             self.parseArticleList2),
        ]
        for src_channel, sub_channel, newUrl, styleUrlDefault, parser in listings:
            self.logDao.warn(u'进行抓取列表:' + newUrl)
            request_meta = {
                'request_type': 'fenghuang_page_list',
                'url': newUrl,
                'src_channel': src_channel,
                'sub_channel': sub_channel,
                'styleUrlDefault': styleUrlDefault
            }
            yield scrapy.Request(url=newUrl,
                                 meta=request_meta,
                                 callback=parser,
                                 dont_filter=True)

    def parseArticleList2(self, response):
        """Yield article requests for list pages built from ``zheng_list`` blocks.

        Each ``div.zheng_list.pl10.box`` entry provides the link and title in
        its ``h1/a``. Entries without a URL or already known to ``checkDao``
        are skipped.
        """
        body = EncodeUtil.toUnicode(response.body)
        # TODO: no "banned" response has been observed yet, so none is detected here
        meta = response.meta
        src_channel = meta['src_channel']
        sub_channel = meta['sub_channel']
        styleUrlDefault = meta['styleUrlDefault']
        self.logDao.info(u'开始解析列表')

        page = Selector(text=body)
        for article in page.xpath('//div[@class="zheng_list pl10 box"]'):
            href = article.xpath('./h1/a/@href').extract_first('')
            # NOTE: lstrip('%20') strips leading '%', '2' and '0' characters,
            # not the literal prefix "%20"
            source_url = href.lstrip('%20').lstrip(' ')
            title = article.xpath('./h1/a/text()').extract_first('')
            if not source_url:
                continue
            label = title + ':' + source_url
            # Skip articles that were crawled before
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + label)
                continue
            self.logDao.info(u'抓取文章' + label)

            detail_meta = {
                'request_type': 'fenghuang_detail',
                "title": title,
                "source_url": source_url,
                'src_channel': src_channel,
                'sub_channel': sub_channel,
                'styleUrlDefault': styleUrlDefault
            }
            yield scrapy.Request(url=source_url,
                                 meta=detail_meta,
                                 callback=self.parseArticle)

    def parseArticleList3(self, response):
        """Yield article requests for list pages built from ``box_list`` blocks.

        Each ``div`` whose class contains ``box_list`` provides the link and
        title in its ``h2/a``. Entries without a URL or already known to
        ``checkDao`` are skipped.
        """
        body = EncodeUtil.toUnicode(response.body)
        # TODO: no "banned" response has been observed yet, so none is detected here
        meta = response.meta
        src_channel = meta['src_channel']
        sub_channel = meta['sub_channel']
        styleUrlDefault = meta['styleUrlDefault']
        self.logDao.info(u'开始解析列表')

        page = Selector(text=body)
        for article in page.xpath(
                '//div[boolean(contains(@class, "box_list"))]'):
            href = article.xpath('./h2/a/@href').extract_first('')
            # NOTE: lstrip('%20') strips leading '%', '2' and '0' characters,
            # not the literal prefix "%20"
            source_url = href.lstrip('%20').lstrip(' ')
            title = article.xpath('./h2/a/text()').extract_first('')
            if not source_url:
                continue
            label = title + ':' + source_url
            # Skip articles that were crawled before
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + label)
                continue
            self.logDao.info(u'抓取文章' + label)

            detail_meta = {
                'request_type': 'fenghuang_detail',
                "title": title,
                "source_url": source_url,
                'src_channel': src_channel,
                'sub_channel': sub_channel,
                'styleUrlDefault': styleUrlDefault
            }
            yield scrapy.Request(url=source_url,
                                 meta=detail_meta,
                                 callback=self.parseArticle)

    # TODO...还没有遇到被禁止的情况
    def parseArticleList(self, response):
        """Yield article requests for list pages built from ``newsList`` items.

        Each ``li`` below ``div.newsList`` provides the link and title in its
        direct ``a`` child. Items without a link are now skipped, as in
        ``parseArticleList2``/``parseArticleList3``; previously an empty URL
        was passed on to ``scrapy.Request``. Articles already known to
        ``checkDao`` are skipped as well.
        """
        body = EncodeUtil.toUnicode(response.body)
        # TODO: no "banned" response has been observed yet, so none is detected here
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        styleUrlDefault = response.meta['styleUrlDefault']
        self.logDao.info(u'开始解析列表')
        selector = Selector(text=body)
        articles = selector.xpath('//div[@class="newsList"]//li')
        for article in articles:
            # NOTE: lstrip('%20') strips leading '%', '2' and '0' characters,
            # not the literal prefix "%20"
            source_url = article.xpath('./a/@href').extract_first(
                '').lstrip('%20').lstrip(' ')
            title = article.xpath('./a/text()').extract_first('')
            # A list item without a link cannot be requested
            if not source_url:
                continue
            # Skip articles that were crawled before
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                continue
            self.logDao.info(u'抓取文章' + title + ':' + source_url)

            yield scrapy.Request(url=source_url,
                                 meta={
                                     'request_type': 'fenghuang_detail',
                                     "title": title,
                                     "source_url": source_url,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel,
                                     'styleUrlDefault': styleUrlDefault
                                 },
                                 callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse an ifeng article page into a ``ContentItem``.

        Reads channels, default stylesheets, title and source URL from
        ``response.meta``, collects the stylesheets (page links plus defaults,
        cached per URL hash in ``self.css``), extracts keywords, category,
        source and publish date, rebuilds the article HTML from the children
        of ``#main_content`` and lists all image URLs.

        Changes compared with the earlier version:

        * the date-parse failure is logged with ``str(e)`` instead of the
          Python 2-only ``e.message``;
        * protocol-relative image sources (``//host/path``) are rewritten to
          ``http://host/path`` only where they do not already follow a scheme,
          so a repeated image is no longer turned into ``http:http://...``.

        Returns the populated ``ContentItem``, or ``None`` when the page has no
        ``#main_content`` element or that element has no usable children.
        """
        import re

        body = EncodeUtil.toUnicode(response.body)
        # TODO: no "banned" page has been observed yet, so none is detected here
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        styleUrlDefault = response.meta['styleUrlDefault']
        title = response.meta['title']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + source_url)

        selector = Selector(text=body)

        # Collect the stylesheets (page links first, then the defaults),
        # downloading each URL only once
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleUrls = styleUrls + styleUrlDefault
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # Use the hash of the URL as the cache key
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download it and keep it
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')

        # Clean links and the page background colour out of the styles
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#eaeaea'])

        tags = selector.xpath(
            '//meta[@name="keywords"]/@content').extract_first('')

        category = selector.xpath(
            '//meta[boolean(contains(@name, "og:category"))]/@content'
        ).extract_first('')
        if category:
            sub_channel = sub_channel + ',' + category

        src_ref = selector.xpath(
            '//span[@class="ss03"]//text()').extract_first('')
        if not src_ref.replace('\n', '').replace(' ', ''):
            # Fall back to the source paragraph of the older page layout
            src_ref = selector.xpath(
                '//div[@id="artical_sth"]/p/text()').extract()
            src_ref = ''.join(src_ref).replace('\n', '').replace(
                u'来源:', '').replace(' ', '')

        post_date = selector.xpath(
            '//meta[@name="og:time"]/@content').extract_first('')
        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')

        # Try the format with seconds first, then the one without; if neither
        # fits, keep the raw string and log the last parse error
        parse_error = None
        for date_format in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"):
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, date_format))
                break
            except ValueError as e:
                parse_error = e
        else:
            self.logDao.warn(str(parse_error))

        content_html = selector.xpath('//div[@id="main_content"]')
        logoHtml = selector.xpath(
            '//span[@class="ifengLogo"]').extract_first('')

        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop unwanted inner tags
        # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe")]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return

        # Build the plain-text version, one line per content element
        content_txt = []
        for item in content_items:
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)

        # Assemble the new content markup
        outHtml = """<div id="artical_real" class="js_img_share_area"><div id="main_content" class="js_selection_area" bosszone="content">${++content++}</div></div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)

        content_html = outHtml.replace('${++content++}', content_items)
        # Remove the ifeng logo markup (no-op when the page has none)
        content_html = content_html.replace(logoHtml, '')

        selector = Selector(text=content_html)
        # Collect every image URL and make protocol-relative ones absolute
        image_urls = []
        imgs = selector.xpath('descendant::img')

        for img in imgs:
            # The image URL is read from src
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                if image_url != image_url_base:
                    # Only touch occurrences that do not already follow a
                    # scheme ("http:"), so repeated images are prefixed once
                    content_html = re.sub(
                        r'(?<!:)' + re.escape(image_url_base),
                        lambda match, absolute=image_url: absolute,
                        content_html)

        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)

        # Get the hash code for this URL
        hash_code = self.checkDao.getHashCode(source_url)

        # Remove the alt/title texts of images
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        # NOTE(review): this removes the text everywhere in the HTML, not only
        # inside the attribute - confirm that is intended.
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')

        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = ''
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 8
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem

    def saveFile(self, title, content):
        """Do nothing; saving the raw page is currently switched off.

        The disabled implementation wrote ``content`` UTF-8 encoded to
        ``html/<title>.html`` and logged the file name.
        """
        # TODO: not saved for now - reusing the stored pages did not work well
        return None

    def getStatus(self):
        """Return the crawl status stored in ``catchStatus.json``.

        :return: the value of the ``status`` key, or ``None`` when the file is
            missing, unreadable, not valid JSON or not a JSON object. Returning
            ``None`` lets a first run (no status file yet) start normally
            instead of failing in ``start_requests``.
        """
        try:
            # The with-block closes the file; no manual close is needed
            with open("catchStatus.json", 'r') as loadF:
                stored = json.load(loadF)
        except (IOError, ValueError):
            # IOError: file missing/unreadable; ValueError: malformed JSON
            return None
        if not isinstance(stored, dict):
            return None
        return stored.get('status')

    def saveStatus(self, status):
        """Overwrite ``catchStatus.json`` with ``{"status": <status>}``."""
        record = {'status': status}
        # The with-block takes care of closing the file
        with open("catchStatus.json", "w") as status_file:
            json.dump(record, status_file)
コード例 #7
0
class TXDetailSpider(scrapy.Spider):
    """Spider registered as ``jiemian_detail``.

    Flow: ``start_requests`` asks the a.jiemian.com ajax list endpoint for
    each channel id, ``parseArticleList`` extracts the article links from the
    returned HTML fragment, and ``parseArticle`` turns an article page into a
    ``ContentItem``.
    """
    name = 'jiemian_detail'
    # Base delay between requests; per the original note the effective delay
    # is a random value between 0.5x and 1.5x of this.
    download_delay = 2.5
    # Statuses handed to the callbacks so that redirects and error codes do
    # not leave a page unfetched/unparsed.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        """Create the logging, duplicate-check and monitoring helpers.

        :param name: accepted for signature compatibility; ``None`` is what
            is forwarded to the base class.
        :param kwargs: forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super(TXDetailSpider, self).__init__(name=None, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'jiemian_list_detail')
        self.checkDao = CheckDao()
        # Stylesheet cache: md5 of the stylesheet URL -> downloaded CSS text.
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        self.logger.info(u'重走init')

    def close(spider, reason):
        """Close hook; a no-op (the 'jiemian_total' update is commented out)."""
        # spider.dataMonitor.updateTotal('jiemian_total')
        pass

    def start_requests(self):
        """Wait until network and service checks pass, then request the lists.

        Yields one list request (page 1 only) per channel id.
        """
        # Re-check the network every 20 seconds until it is reachable.
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Same polling for the service check.
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # Channel ids, per the original note: must-read, gadgets, product
        # ranking, bulletins, game headlines, single items, round-ups,
        # gossip headlines, game bulletins.
        cids = ['6', '66', '73', '84', '100', '119', '120', '121', '122']
        url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
        for cid in cids:
            for page in range(1, 2):
                newUrl = url + str(page) + ('&cid=' + str(cid))
                self.logDao.warn(u'进行抓取列表:' + newUrl)
                yield scrapy.Request(url=newUrl,
                                     meta={
                                         'request_type': 'jiemian_page_list',
                                         'url': newUrl
                                     },
                                     callback=self.parseArticleList,
                                     dont_filter=True)

    # TODO: being blocked by the site has not been observed yet, so there is
    # no ban detection in the callbacks below.
    def parseArticleList(self, response):
        """Parse one list response and yield a request per new article.

        The response body is JSON wrapped in parentheses; its ``rst`` field
        holds an HTML fragment with the article links.
        """
        body = EncodeUtil.toUnicode(response.body)
        # Drop the surrounding "(" ... ")" before decoding.
        jsonStr = demjson.decode(body.lstrip('(').rstrip(')')) or {}
        rst = jsonStr.get('rst', '')
        if not rst:
            self.logDao.info(u'不存在内容')
            return
        self.logDao.info(u'开始解析列表')
        selector = Selector(text=rst)
        for article in selector.xpath('//div[@class="news-img"]/a'):
            source_url = article.xpath('@href').extract_first('')
            title = article.xpath('@title').extract_first('')

            # Validate the link first so the duplicate lookup is never run
            # with an empty URL.
            if not source_url:
                self.logDao.info(u'文章不存在' + title + ':' + source_url)
                continue

            # Skip articles that were already stored.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                continue

            self.logDao.info(u'抓取文章' + title + ':' + source_url)

            yield scrapy.Request(url=source_url,
                                 meta={
                                     'request_type': 'jiemian_detail',
                                     "title": title,
                                     "source_url": source_url
                                 },
                                 callback=self.parseArticle)

    def parseArticle(self, response):
        """Build a ``ContentItem`` from an article page.

        :return: the populated item, or ``None`` when the page has no
            ``Cnt-Main-Article-QQ`` container or no usable children in it.
        """
        body = EncodeUtil.toUnicode(response.body)
        title = response.meta['title']
        # parseArticleList does not put 'post_date' into meta, so indexing it
        # directly raised KeyError for every article; fall back to ''.
        post_date = response.meta.get('post_date') or ''
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                         source_url)

        selector = Selector(text=body)

        # Collect the linked stylesheets, downloading each URL only once.
        styleList = []
        for styleUrl in selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract():
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')

        # Replace the links inside the styles.
        styles = CssUtil.clearUrl(styles)

        category = selector.xpath(
            '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
        ).extract_first('')

        post_user = selector.xpath(
            '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
        ).extract_first('')

        src_ref = selector.xpath(
            '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
        ).extract_first('')

        a_time = selector.xpath(
            '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
        ).extract_first('')

        # Prefer the date printed on the page over the one from meta.
        if a_time:
            post_date = a_time

        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')

        try:
            post_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.strptime(post_date, "%Y-%m-%d %H:%M"))
        except (TypeError, ValueError):
            # Best effort: keep the raw text when it is not in the expected
            # "YYYY-mm-dd HH:MM" form.
            pass

        content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
        if not content_html:
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Keep only the direct children that are not script/style/iframe
        # elements and not the "rv-root-v2 rv-js-root" block.
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe") and not(boolean(@class="rv-root-v2 rv-js-root"))]'
        )
        if not content_items:
            self.logDao.info(u'不存在内容:' + source_url)
            return

        # Plain text: one line per kept child, tabs removed.
        content_txt = '\n'.join(
            ''.join(item.xpath('.//text()').extract()).replace('\t', '')
            for item in content_items)

        # Re-wrap the kept children in a fresh container element.
        outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
        content_html = outHtml.replace('${++content++}',
                                       ''.join(content_items.extract()))

        # Collect absolute image URLs from the rebuilt fragment.
        selector = Selector(text=content_html)
        image_urls = []
        for img in selector.xpath('descendant::img'):
            image_url = img.xpath('@src').extract_first()
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })

        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)

        hash_code = self.checkDao.getHashCode(source_url)

        # Remove the text of every non-blank img alt/title attribute from the
        # fragment (plain string replacement over the whole HTML).
        selector = Selector(text=content_html)
        for imgAltTitle in selector.xpath(
                '//img/@alt|//img/@title').extract():
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')

        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = category
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 3
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = '腾讯科技'
        contentItem['src_ref'] = src_ref
        return contentItem

    def saveFile(self, title, content):
        """Write ``content`` UTF-8 encoded to ``html/<title>.html`` and log it.

        NOTE(review): the ``html`` directory is not created here, so it
        presumably has to exist already -- confirm.
        """
        filename = 'html/%s.html' % title
        with open(filename, 'wb') as f:
            f.write(content.encode("utf8"))
        self.log('Saved file %s' % filename)
コード例 #8
0
ファイル: WYDetailSpider.py プロジェクト: BTYT/studyScrapy
class WYDetailSpider(scrapy.Spider):
    """Spider registered as ``wangyi_detail``.

    Flow: ``start_requests`` fetches the tech.163.com ``news_json.js`` feed,
    ``parseArticleList`` yields a request per new article in it, and
    ``parseArticle`` turns an article page into a ``ContentItem``.
    """
    name = 'wangyi_detail'
    # Base delay between requests; per the original note the effective delay
    # is a random value between 0.5x and 1.5x of this.
    download_delay = 2.5
    # Statuses handed to the callbacks so that redirects and error codes do
    # not leave a page unfetched/unparsed.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        """Create the logging, duplicate-check and monitoring helpers.

        :param name: accepted for signature compatibility; ``None`` is what
            is forwarded to the base class.
        :param kwargs: forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super(WYDetailSpider, self).__init__(name=None, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'wangyi_list_detail')
        self.checkDao = CheckDao()
        # Stylesheet cache: md5 of the stylesheet URL -> downloaded CSS text.
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        self.logger.info(u'重走init')

    def close(spider, reason):
        """Close hook: update the 'wangyi_total' counter in the data monitor."""
        spider.dataMonitor.updateTotal('wangyi_total')

    def start_requests(self):
        """Wait until network and service checks pass, then request the feed."""
        # Re-check the network every 20 seconds until it is reachable.
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Same polling for the service check.
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # A random number is appended as the query string of the feed URL.
        newUrl = 'http://tech.163.com/special/00094IHV/news_json.js?' + str(
            random.uniform(0, 1))
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'wangyi_page_list',
                                 'url': newUrl
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)

    # TODO: being blocked by the site has not been observed yet, so there is
    # no ban detection in the callbacks below.
    def parseArticleList(self, response):
        """Parse the news feed and yield one request per new article.

        The feed is a JavaScript assignment (``var data={...};``) whose
        object has a ``news`` list of article groups and a ``category`` list
        that the articles' ``c`` field indexes into.
        """
        body = EncodeUtil.toUnicode(response.body)
        self.logDao.info(u'开始解析列表')

        # Remove the "var data=" prefix explicitly. str.lstrip() takes a set
        # of characters, not a prefix, so lstrip('var data=') only worked as
        # long as the payload did not start with one of those characters.
        head, sep, tail = body.partition('=')
        if sep and head.split() == ['var', 'data']:
            body = tail
        body = body.strip().rstrip(';')

        jsonStr = demjson.decode(body) or {}
        articles = jsonStr.get('news') or []
        categoryList = jsonStr.get('category') or []
        for article_ins in articles:
            for article in article_ins:
                source_url = article.get('l') or ''
                title = article.get('t') or ''
                # None is normalised to '' so the log concatenation below
                # cannot raise TypeError.
                post_date = article.get('p') or ''

                # scrapy.Request rejects an empty URL, which would end this
                # generator and drop every remaining article.
                if not source_url:
                    self.logDao.info(u'文章不存在' + title + ':' + source_url)
                    continue

                # Skip articles that were already stored.
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + post_date +
                                     source_url)
                    continue

                # Resolve the category name; a missing or non-numeric index
                # leaves it empty instead of raising.
                categoryIndex = article.get('c')
                try:
                    inRange = 0 <= categoryIndex < len(categoryList)
                except TypeError:
                    inRange = False
                category = ''
                if inRange:
                    category = categoryList[categoryIndex].get('n') or ''

                self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' +
                                 source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'wangyi_detail',
                                         "title": title,
                                         'category': category,
                                         'post_date': post_date,
                                         "source_url": source_url
                                     },
                                     callback=self.parseArticle)

    def parseArticle(self, response):
        """Build a ``ContentItem`` from an article page.

        :return: the populated item, or ``None`` when the page has no
            ``endText`` element or no usable children in it.
        """
        body = EncodeUtil.toUnicode(response.body)
        category = response.meta['category']
        title = response.meta['title']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                         source_url)

        selector = Selector(text=body)

        # Collect the linked stylesheets, downloading each URL only once.
        styleList = []
        for styleUrl in selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract():
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        styles = CssUtil.clearUrl(styles)

        src_ref = selector.xpath(
            '//*[@id="ne_article_source"]/text()').extract_first()
        content_html = selector.xpath('//*[@id="endText"]')
        if not content_html:
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Keep only the direct children that are not script elements and do
        # not carry the "gg200x300" / "ep-source cDGray" classes.
        content_items = content_html.xpath(
            '*[not(boolean(@class="gg200x300" or @class="ep-source cDGray")) and not(name(.)="script")]'
        )
        if not content_items:
            self.logDao.info(u'不存在内容:' + source_url)
            return

        # Plain text: one line per kept child, tabs removed.
        content_txt = '\n'.join(
            ''.join(item.xpath('.//text()').extract()).replace('\t', '')
            for item in content_items)

        # Re-wrap the kept children in a fresh container element.
        outHtml = """<div class="post_text" id="endText" style="border-top:1px solid #ddd;" jcid="5611">${++content++}</div>"""
        content_html = outHtml.replace('${++content++}',
                                       ''.join(content_items.extract()))

        # Collect absolute image URLs from the rebuilt fragment.
        selector = Selector(text=content_html)
        image_urls = []
        for img in selector.xpath('descendant::img'):
            image_url = img.xpath('@src').extract_first()
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })

        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)

        hash_code = self.checkDao.getHashCode(source_url)

        # Remove the text of every non-blank img alt/title attribute from the
        # fragment (plain string replacement over the whole HTML).
        selector = Selector(text=content_html)
        for imgAltTitle in selector.xpath(
                '//img/@alt|//img/@title').extract():
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')

        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = category
        contentItem['post_user'] = ''
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 4
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = '网易科技'
        contentItem['src_ref'] = src_ref
        return contentItem

    def saveFile(self, title, content):
        """Write ``content`` UTF-8 encoded to ``html/<title>.html`` and log it.

        NOTE(review): the ``html`` directory is not created here, so it
        presumably has to exist already -- confirm.
        """
        filename = 'html/%s.html' % title
        with open(filename, 'wb') as f:
            f.write(content.encode("utf8"))
        self.log('Saved file %s' % filename)
コード例 #9
0
class WXDetailSpider(scrapy.Spider):
    """Spider registered as ``wx_detail``.

    Flow: ``start_requests`` requests the list page of every source returned
    by ``WxSourceDao.queryWxUrl``, ``parseArticleList`` extracts the
    ``msgList`` script variable and yields a request per new article, and
    ``parseArticle`` turns an article page into a ``ContentItem``.
    Accounts whose requests hit the captcha page are remembered in
    ``brokenAccount.json`` and scheduled first on the next run.
    """
    name = 'wx_detail'
    # Base delay between requests; per the original note the effective delay
    # is a random value between 0.5x and 1.5x of this.
    download_delay = 15
    # Statuses handed to the callbacks so that redirects and error codes do
    # not leave a page unfetched/unparsed.
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404, 500]

    def __init__(self, name=None, **kwargs):
        """Create the DAO helpers and the per-run bookkeeping lists.

        :param name: accepted for signature compatibility; ``None`` is what
            is forwarded to the base class.
        :param kwargs: forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super(WXDetailSpider, self).__init__(name=None, **kwargs)
        self.count = 0
        self.wxSourceDao = WxSourceDao()
        self.logDao = LogDao(self.logger, 'weixin_list_detail')
        self.checkDao = CheckDao()
        self.dataMonitor = DataMonitorDao()
        self.wxSources = []
        # Accounts blocked during this run; they are crawled first next time.
        self.brokenAccounts = []

    def close(spider, reason):
        """Close hook: persist blocked accounts, reset status, update totals.

        NOTE(review): the status is reset to 'stop' even when
        ``start_requests`` returned early because another run was marked
        'running' -- confirm this is intended.
        """
        # Remember the accounts that were blocked during this run.
        spider.saveBrokenAccounts(spider.brokenAccounts)
        # Store the crawl status.
        spider.saveStatus('stop')
        spider.dataMonitor.updateTotal('weixin_total')
        for source in spider.wxSources:
            (_source_id, _wx_name, wx_account, _wx_url, _wx_avatar,
             _update_status, _is_enable, _update_time) = source
            spider.dataMonitor.updateTotal('weixin_account_total',
                                           account=wx_account)

    def start_requests(self):
        """Yield one list request per account, previously blocked ones first.

        Nothing is yielded when the current hour is 0 to 6 inclusive, or
        when the stored status is already 'running'.
        """
        # Per the original note: no crawling between midnight and 6 am.
        hour = datetime.datetime.now().hour
        if 0 <= hour <= 6:
            self.logDao.info(u'这个时间不爬。0-6点')
            return

        # Do not start while a crawl is marked as running.
        if self.getStatus() == 'running':
            return
        self.saveStatus('running')

        # Re-check the network every 20 seconds until it is reachable.
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Same polling for the service check.
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # Fetch the sources to crawl.
        sources = self.wxSourceDao.queryWxUrl(isRandom=True)

        # Put accounts that were blocked last time in front of the rest.
        _, brokenAccounts = self.getBrokenAccounts()
        firstGroup = []
        secondGroup = []
        for source in sources:
            (_source_id, _wx_name, wx_account, _wx_url, _wx_avatar,
             _update_status, _is_enable, _update_time) = source
            if wx_account in brokenAccounts:
                firstGroup.append(source)
            else:
                secondGroup.append(source)
        sources = firstGroup + secondGroup

        self.wxSources = sources
        for source in sources:
            (source_id, _wx_name, wx_account, wx_url, _wx_avatar,
             _update_status, _is_enable, _update_time) = source
            newUrl = wx_url
            self.logDao.warn(u'进行抓取:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'weixin_page_list',
                                     'source_url': newUrl,
                                     'wx_account': wx_account,
                                     'source': source,
                                     'wx_account_id': source_id
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)

    def _isBlocked(self, selector, response):
        """Return True when the page is the captcha page or the status is 302."""
        pageTitle = selector.xpath('//title/text()').extract_first('').strip(
            u' ')
        return pageTitle == u"请输入验证码" or response.status == 302

    def _handleBlocked(self, wx_account, source_url):
        """Record the blocked account, request a new IP, wait, open the URL."""
        self.logDao.info(u'访问过多被禁止,重新拨号')
        # Remember the account so the next run crawls it first.
        self.brokenAccounts.append(wx_account)
        # Request a new IP, then idle for 80 seconds before opening the page.
        NetworkUtil.getNewIp()
        TimerUtil.sleep(80)
        NetworkUtil.openWebbrowser(source_url)

    def parseArticleList(self, response):
        """Parse an account's list page and yield a request per new article."""
        body = EncodeUtil.toUnicode(response.body)
        selector = Selector(text=body)
        source_url = response.meta['source_url']
        wx_account = response.meta['wx_account']
        if self._isBlocked(selector, response):
            self._handleBlocked(wx_account, source_url)
            return

        source = response.meta['source']
        wx_account_id = response.meta['wx_account_id']
        self.logDao.info(u'开始解析列表:' + wx_account)

        # Capture the value assigned to msgList. The previous code matched
        # the whole statement and then used lstrip('var msgList = '), which
        # strips a set of characters rather than a prefix.
        msgListPattern = re.compile(r'var\s*msgList\s*=(.*);')
        for js in selector.xpath('//script/text()').extract():
            if 'var msgList = ' not in js:
                continue
            for match in msgListPattern.findall(js):
                payload = match.strip().rstrip(';')
                decoded = demjson.decode(payload) or {}
                articles = decoded.get('list') or []
                self.logDao.info(u'匹配到文章列表' + wx_account)
                for article in articles:
                    app_msg_ext_info = article.get('app_msg_ext_info') or {}
                    title = app_msg_ext_info.get('title') or ''
                    # Skip articles that were already stored.
                    if self.checkDao.checkExist(title, wx_account, 1):
                        self.logDao.info(u'已经存在' + wx_account + ':' +
                                         title)
                        continue

                    # Check for a missing URL before adding the host prefix;
                    # afterwards the string can never be empty. .get() also
                    # avoids a KeyError for entries without the key.
                    contentUrl = app_msg_ext_info.get('content_url') or ''
                    if not contentUrl:
                        continue

                    detailUrl = "http://mp.weixin.qq.com" + contentUrl
                    detailUrl = detailUrl.replace("amp;", "")
                    self.logDao.info(u'抓取' + wx_account + ':' + title +
                                     ':' + detailUrl)

                    yield scrapy.Request(url=detailUrl,
                                         meta={
                                             'request_type': 'weixin_detail',
                                             'wx_account': wx_account,
                                             "source": source,
                                             "title": title,
                                             'wx_account_id': wx_account_id,
                                             "source_url": detailUrl
                                         },
                                         callback=self.parseArticle)

    def parseArticle(self, response):
        """Build a ``ContentItem`` from an article page.

        :return: the populated item, or ``None`` when the request was
            blocked or the page has no ``js_content`` children.
        """
        body = EncodeUtil.toUnicode(response.body)
        selector = Selector(text=body)
        source_url = response.meta['source_url']
        wx_account = response.meta['wx_account']
        if self._isBlocked(selector, response):
            self._handleBlocked(wx_account, source_url)
            return

        title = response.meta['title']
        wx_account_id = response.meta['wx_account_id']
        self.logDao.info(u'开始解析文章' + wx_account + ':' + title + ':' +
                         source_url)
        self.logDao.info(u'开始解析文章:' + source_url)

        post_date = selector.xpath(
            '//*[@id="post-date"]/text()').extract_first('')
        try:
            post_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.strptime(post_date, "%Y-%m-%d"))
        except (TypeError, ValueError):
            # Best effort: keep the raw text when it is not "YYYY-mm-dd".
            pass

        # Inline styles of the page, compressed and cleaned.
        styles = selector.xpath('//style/text()').extract()
        styles = CssUtil.compressCss(styles).replace('\'', '"').replace(
            '\\', '\\\\')
        styles = CssUtil.clearUrl(styles)
        styles = CssUtil.clearBackgroundColor(styles, ['#f3f3f3'])

        post_user = selector.xpath(
            '//*[@id="post-user"]/text()').extract_first('')
        content_html = selector.xpath('//*[@id="js_content"]')
        if not content_html:
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # All direct children of the content element are kept.
        content_items = content_html.xpath('*')
        if not content_items:
            self.logDao.info(u'不存在内容:' + source_url)
            return

        # Plain text: one line per child, tabs removed.
        content_txt = '\n'.join(
            ''.join(item.xpath('.//text()').extract()).replace('\t', '')
            for item in content_items)

        # Re-wrap the children in a fresh container element.
        outHtml = """<div class="rich_media_content " id="js_content">${++content++}</div>"""
        content_html = outHtml.replace('${++content++}',
                                       ''.join(content_items.extract()))

        # Collect absolute image URLs; the URL may be in src or data-src.
        selector = Selector(text=content_html)
        image_urls = []
        for img in selector.xpath('descendant::img'):
            image_url = img.xpath('@src | @data-src').extract_first('')
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
        self.logDao.info(wx_account + u'得到文章:' + title + ":" + post_date +
                         ':' + post_user)
        self.logDao.info(u'得到文章:' + source_url)

        hash_code = self.checkDao.getHashCode(title, wx_account, 1)

        self.saveFile(hash_code, body)

        # Remove the text of every non-blank img alt/title attribute from the
        # fragment (plain string replacement over the whole HTML).
        selector = Selector(text=content_html)
        for imgAltTitle in selector.xpath(
                '//img/@alt|//img/@title').extract():
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')

        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = ''
        contentItem['post_user'] = post_user
        contentItem['tags'] = ''
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 1
        contentItem['src_account_id'] = wx_account_id
        contentItem['src_channel'] = '微信公众号'
        contentItem['src_ref'] = ''
        contentItem['wx_account'] = wx_account

        return contentItem

    def saveFile(self, title, content):
        """Persist a fetched page to disk -- currently disabled.

        TODO: saving is switched off for now; keeping the pages around for
        reuse did not work well.
        """
        return

    def getBrokenAccounts(self):
        """Read ``brokenAccount.json``.

        :return: ``(update_time, accounts)``; ``('', [])`` when the file is
            missing, unreadable, not valid JSON, or not a JSON object.
        """
        try:
            with open("brokenAccount.json", 'r') as loadF:
                data = json.load(loadF)
        except (IOError, ValueError):
            # Nothing has been saved yet (first run) or the file is corrupt.
            return '', []
        if not isinstance(data, dict):
            return '', []
        return data.get('update_time', ''), data.get('accounts') or []

    def saveBrokenAccounts(self, accounts):
        """Write ``accounts`` plus the current local time to ``brokenAccount.json``.

        :param accounts: list of account identifiers to persist.
        """
        update_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
        with open("brokenAccount.json", "w") as loadF:
            json.dump({
                'update_time': update_time,
                'accounts': accounts
            }, loadF)

    def getStatus(self):
        """Return the status stored in ``catchStatus.json``.

        :return: the stored value, or ``None`` when the file is missing,
            unreadable, not valid JSON, or not a JSON object.
        """
        try:
            with open("catchStatus.json", 'r') as loadF:
                data = json.load(loadF)
        except (IOError, ValueError):
            # No status has been written yet (first run) or the file is
            # corrupt: report "no status" so the crawl can start.
            return None
        if not isinstance(data, dict):
            return None
        return data.get('status')

    def saveStatus(self, status):
        """Write ``{'status': status}`` as JSON to ``catchStatus.json``."""
        with open("catchStatus.json", "w") as loadF:
            json.dump({'status': status}, loadF)
コード例 #10
0
class DetailSpider(scrapy.Spider):
    name = 'sohu_detail'
    download_delay = 2.5  # base interval: the actual wait is a random value between 0.5*download_delay and 1.5*download_delay
    handle_httpstatus_list = [301, 302, 204, 206, 403, 404,
                              500]  # lets redirects and other error status codes be handled, so such pages do not simply fail to be fetched/parsed

    def __init__(self, name=None, **kwargs):
        """Set up per-spider state and the helper DAO objects.

        Args:
            name: Optional spider name, forwarded to the base class.
            **kwargs: Extra keyword arguments, forwarded to the base class.
        """
        # Forward ``name`` itself rather than a hard-coded None, so that an
        # explicitly supplied spider name is not silently discarded.
        super(DetailSpider, self).__init__(name=name, **kwargs)
        self.count = 0
        self.request_stop = False
        self.request_stop_time = 0
        self.logDao = LogDao(self.logger, 'sohu_list_detail')
        self.checkDao = CheckDao()
        # Cache of downloaded stylesheets keyed by the md5 of their URL
        # (seeded with a placeholder entry).
        self.css = {'hash': 'style'}
        self.dataMonitor = DataMonitorDao()
        # Set to True by start_requests when another crawl is already marked
        # as running; close() then leaves the stored status untouched.
        self.isRunningStop = False

    def close(spider, reason):
        """Mark the crawl as stopped unless this run never actually started.

        If an earlier crawl was still unfinished when this spider started
        (``isRunningStop`` is set), the stored status must not be switched to
        ``stop``; otherwise it is.

        Args:
            spider: The spider instance being closed.
            reason: Why the spider is closing (unused).
        """
        if spider.isRunningStop:
            return
        spider.saveStatus('stop')
        # spider.dataMonitor.updateTotal('sohu_total')

    def start_requests(self):
        """Yield the requests for pages 1-3 of the Sohu channel feed.

        Nothing is requested when the stored status is already ``running``;
        in that case ``isRunningStop`` is set instead. Otherwise the status
        is set to ``running`` and the method blocks, re-checking every 20
        seconds, until both the network check and the service check pass.
        """
        # Do not start a second crawl while one is marked as running.
        if self.getStatus() == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait for the network, checking once every 20s.
        while not NetworkUtil.checkNetWork():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait for the server, checking once every 20s.
        while not NetworkUtil.checkService():
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        src_channel = u'搜狐科技'
        sub_channel = u'科技'
        base_url = 'http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=30&size=20&callback=&_=1502075449669&page='

        for page_no in range(1, 4):
            # Request one list page.
            list_url = base_url + str(page_no)
            self.logDao.warn(u'进行抓取列表:' + list_url)
            request_meta = {
                'request_type': 'sohu_page_list',
                'url': list_url,
                'src_channel': src_channel,
                'sub_channel': sub_channel
            }
            yield scrapy.Request(url=list_url,
                                 meta=request_meta,
                                 callback=self.parseArticleList,
                                 dont_filter=True)

    # TODO: a "banned for too many requests" response has not been encountered yet
    def parseArticleList(self, response):
        """Parse one feed page and yield a request per not-yet-stored article.

        The response body is decoded as JSON after stripping leading ``/``
        and ``*`` characters and ``(``, and trailing ``;`` and ``)``.
        Entries without an ``id`` or ``authorId`` are skipped, as are
        articles whose URL ``checkDao.checkExist`` reports as already known.

        Args:
            response: Response for a feed page; ``response.meta`` must carry
                ``src_channel`` and ``sub_channel``.

        Yields:
            ``scrapy.Request`` objects for the article pages, handled by
            ``parseArticle``.
        """
        # NOTE: no detection of a "banned for too many requests" response is
        # implemented; the former placeholder branch could never execute.
        body = EncodeUtil.toUnicode(response.body)
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        # Strip the wrapper characters, then decode.
        articles = demjson.decode(
            body.lstrip('/**/').lstrip('(').rstrip(';').rstrip(')')) or []
        if not articles:
            self.logDao.info(u'不存在内容')
            return
        self.logDao.info(u'开始解析列表')
        for article in articles:
            article_id = article.get('id', '')
            author_id = article.get('authorId', '')
            if not article_id or not author_id:
                continue
            page_id = str(article_id) + '_' + str(author_id)
            source_url = 'http://www.sohu.com/a/' + page_id + '?loc=1&focus_pic=0'
            # ``or`` fallbacks: a key that is present but null must not abort
            # the whole page with a TypeError.
            title = article.get('title') or ''
            post_user = article.get('authorName', '')
            tag_names = [tag.get('name', '')
                         for tag in (article.get('tags') or [])]

            # publicTime is in milliseconds; fall back to the current time.
            public_time = article.get('publicTime') or time.time() * 1000
            post_date = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(public_time / 1000))
            # Skip articles that are already stored.
            if self.checkDao.checkExist(source_url):
                self.logDao.info(u'文章已经存在' + title + ':' + post_date +
                                 ':' + source_url)
                continue
            self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' +
                             source_url)

            yield scrapy.Request(url=source_url,
                                 meta={
                                     'request_type': 'sohu_detail',
                                     'title': title,
                                     'post_date': post_date,
                                     'post_user': post_user,
                                     'tags': tag_names,
                                     'source_url': source_url,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel
                                 },
                                 callback=self.parseArticle)

    def parseArticle(self, response):
        """Parse an article page into a ``ContentItem``.

        Steps, in order:
        1. Read title, author, tags, date, URL and channels from
           ``response.meta``.
        2. Collect every linked stylesheet (downloaded once and cached in
           ``self.css`` by URL hash), compress them and clean their URLs.
        3. Take the children of the ``class="article"`` node, dropping
           ``script``/``style``/``iframe`` elements and the
           ``data-role="editor-name"`` element.
        4. Build the plain text and a re-wrapped HTML fragment, remove the
           ``id="backsohucom"`` element, make protocol-relative image URLs
           absolute and blank out image ``alt``/``title`` text.

        Args:
            response: Response for an article page; ``response.meta`` must
                carry ``src_channel``, ``sub_channel``, ``title``,
                ``post_user``, ``tags``, ``post_date`` and ``source_url``.

        Returns:
            The populated ``ContentItem``, or ``None`` when the page has no
            article content.
        """
        # Local import so the block does not depend on the file's import list.
        import re

        # NOTE: no detection of a "banned for too many requests" response is
        # implemented; the former placeholder branch could never execute.
        body = EncodeUtil.toUnicode(response.body)
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        title = response.meta['title']
        post_user = response.meta['post_user']
        tags = response.meta['tags']
        post_date = response.meta['post_date']
        source_url = response.meta['source_url']

        self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                         source_url)
        selector = Selector(text=body)

        # Collect the stylesheets.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            # The hash of the URL is the cache key.
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                # Not cached yet: download and remember it.
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')

        # Replace the links inside the styles.
        styles = CssUtil.clearUrl(styles)

        content_html = selector.xpath('//*[@class="article"]')
        backHtml = selector.xpath('//*[@id="backsohucom"]').extract_first(
            '')

        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return
        # Drop inner elements that are not wanted, e.g.
        # u'<p data-role="editor-name">责任编辑:<span></span></p>'
        # Fuller example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe") and not(boolean(@data-role="editor-name"))]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return

        # Plain text: the text of each kept element (tabs removed), one
        # element per line.
        content_txt = []
        for item in content_items:
            allTxt = item.xpath('.//text()').extract()
            allTxt = ''.join(allTxt).replace('\t', '')
            content_txt.append(allTxt)
        content_txt = '\n'.join(content_txt)

        # Assemble the new content wrapper.
        outHtml = """<div class="article-page"><article class="article">${++content++}</article></div>"""
        content_items = content_items.extract()
        content_items = ''.join(content_items)

        content_html = outHtml.replace('${++content++}', content_items)
        content_html = content_html.replace(backHtml, '')

        selector = Selector(text=content_html)
        # Collect every image URL in the document and make it absolute.
        image_urls = []
        imgs = selector.xpath('descendant::img')

        for img in imgs:
            # Only @src is read here (the original note also mentions
            # data-src as a possible location).
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                if image_url != image_url_base:
                    # Rewrite only occurrences not already preceded by the
                    # scheme's ':'. A plain str.replace would match again
                    # inside an already-absolute URL (a repeated image, or
                    # the same URL written with "http:"), producing
                    # "http:http://...".
                    content_html = re.sub(
                        '(?<!:)' + re.escape(image_url_base),
                        lambda _match, url=image_url: url,
                        content_html)

        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)

        # Get the hash code for this URL.
        hash_code = self.checkDao.getHashCode(source_url)

        # Remove the alt/title text of images.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        # NOTE(review): this removes every occurrence of the alt/title text
        # from the whole fragment, not just the attribute values -- confirm
        # that stripping matching body text is intended.
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')

        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = post_user
        contentItem['tags'] = ','.join(tags)
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 7
        # contentItem['src_account_id'] = 0
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = '搜狐科技'
        return contentItem

    def saveFile(self, title, content):
        """Placeholder for persisting a fetched page; currently does nothing.

        The disabled sketch wrote ``content`` (UTF-8 encoded) to
        ``html/<title>.html`` and logged the file name.

        Args:
            title: Intended base name of the output file (unused).
            content: Intended page text to store (unused).

        Returns:
            None, always.
        """
        # TODO: saving is switched off for now; reusing the saved pages did
        # not work well.
        return None

    def getStatus(self):
        """Read the crawl status stored in ``catchStatus.json``.

        The file is looked up relative to the current working directory.

        Returns:
            The value stored under the ``status`` key, or ``None`` when the
            key is absent or the file does not exist, cannot be read, or does
            not contain a JSON object.
        """
        try:
            # The ``with`` block already closes the file; no extra close needed.
            with open("catchStatus.json", 'r') as status_file:
                record = json.load(status_file)
        except (IOError, ValueError):
            # No file yet (e.g. first run) or a half-written/corrupt file:
            # report "no known status" so the caller can start a crawl
            # instead of crashing.
            return None
        if not isinstance(record, dict):
            return None
        return record.get('status')

    def saveStatus(self, status):
        """Overwrite ``catchStatus.json`` with the given crawl status.

        The file (relative to the current working directory) ends up holding
        a JSON object with the single key ``status``.

        Args:
            status: JSON-serialisable value to store under ``status``.
        """
        record = {'status': status}
        with open("catchStatus.json", "w") as out_file:
            json.dump(record, out_file)