Example #1
    def parseArticleList(self, response):
        url = response.meta['url']
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            self.logDao.info(u'开始解析列表')
            body = body.lstrip('var data=').rstrip(';')
            # Decode the JSON payload
            jsonStr = demjson.decode(body) or {}
            articles = jsonStr.get('news') or []
            categoryList = jsonStr.get('category') or []

            for article_ins in articles:
                for article in article_ins:
                    source_url = article.get('l', '')
                    title = article.get('t', '')
                    timeStr = article.get('p', '')
                    # Skip if the article has already been crawled
                    if self.checkDao.checkExist(source_url):
                        self.logDao.info(u'文章已经存在' + title + timeStr +
                                         source_url)
                        continue
                    categoryIndex = article.get('c')
                    category = ''
                    if 0 <= categoryIndex < len(categoryList):
                        category = categoryList[categoryIndex].get('n')

                    if category:
                        sub_channel = EncodeUtil.toUnicode(
                            sub_channel) + u',' + EncodeUtil.toUnicode(
                                category)

                    post_date = article.get('p')
                    self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' +
                                     source_url)
                    yield scrapy.Request(url=source_url,
                                         meta={
                                             'request_type': 'wangyi_detail',
                                             'title': title,
                                             'post_date': post_date,
                                             'source_url': source_url,
                                             'src_channel': src_channel,
                                             'sub_channel': sub_channel
                                         },
                                         callback=self.parseArticle)
Example #2
 def parseArticle(self, response):
     source_url = response.meta['source_url']
     body = EncodeUtil.toUnicode(response.body)
     if response.status == 302:
         self.infoStr(u'访问过多被禁止')
     else:
         self.infoStr(u'开始解析界面')
         title = response.meta['title']
         post_date = response.meta['post_date']
         source_url = response.meta['source_url']
         contentItem = response.meta['contentItem']
         selector = Selector(text=body)
         # Collect the page's stylesheets
         styleUrls = selector.xpath(
             '//link[@rel="stylesheet"]/@href | //style/text()').extract()
         styleList = []
         for styleUrl in styleUrls:
             # Hash the URL to use as the cache key
             styleUrlHash = EncryptUtil.md5(styleUrl)
             if not self.css.get(styleUrlHash):
                 # Not cached yet: download and store it
                 self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
             styleList.append(self.css[styleUrlHash])
         styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
             '\\', '\\\\')
         styles = CssUtil.clearUrl(styles)
         contentItem['styles'] = styles
         content_html = selector.xpath('//*[@id="imedia-article"]')
         if len(content_html):
             contentItem['content_html'] = content_html.extract_first('')
             return contentItem
Example #3
    def parseList2(self, response):
        data = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            url = response.meta['url']
            self.logDao.info(u'开始解析列表' + url)

            # Decode the JSON payload
            data = demjson.decode(data) or {}

            result = data.get('result', {})
            items = result.get('data', [])

            for item in items:
                channel_name = u'科技'
                title = item.get('title', '')
                source_url = item.get('url', '')
                callback = self.parseDetail2
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在:' + title + source_url)
                    continue
                self.logDao.info(u"开始抓取文章:" + source_url)
                # item['url'] = "http://tech.sina.com.cn/d/f/2017-07-31/doc-ifyinvwu3872514.shtml"
                yield scrapy.Request(url=item['url'],
                                     meta={'request_type': 'sina_detail', 'category': channel_name,
                                           'title': title, 'source_url': source_url},
                                     callback=callback)
Example #4
    def parseArticleList(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            styleUrlDefault = response.meta['styleUrlDefault']
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=body)
            articles = selector.xpath('//div[@class="newsList"]//li')
            for article in articles:
                source_url = article.xpath('./a/@href').extract_first(
                    '').lstrip('%20').lstrip(' ')
                title = article.xpath('./a/text()').extract_first('')
                # Skip if the article has already been crawled
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + source_url)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'fenghuang_detail',
                                         "title": title,
                                         "source_url": source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel,
                                         'styleUrlDefault': styleUrlDefault
                                     },
                                     callback=self.parseArticle)
Example #5
 def parseList(self, response):
     source = response.meta['source']
     wx_account = response.meta['wx_account']
     url = response.meta['url']
     body = EncodeUtil.toUnicode(response.body)
     # Detect a ban; if banned we need to redial the router and clear cookies
     if response.status == 302:
         # Mark the source's status as update-failed
         self.logDao.warn(u'您的访问过于频繁,重新拨号')
         self.wxSourceDao.updateStatus(wx_account, 'updateFail')
         # Get a new IP and idle the thread for 30s
         NetworkUtil.getNewIp()
         TimerUtil.sleep(30)
     else:
         self.logDao.info(u'开始解析:' + wx_account)
         # Parse the page
         selector = Selector(text=body)
         results = selector.xpath('//*[@id="main"]/div[4]/ul/li')
         self.logDao.info(u'列表长度:' + str(len(results)))
         hasCatch = False
         for result in results:
             wx_name = result.xpath('//*[@id="sogou_vr_11002301_box_0"]/div/div[2]/p[1]/a/text()').extract_first()
             wx_account_ = result.xpath('.//p[@class="info"]/label/text()').extract_first()
             wx_url = result.xpath('.//p[@class="tit"]/a/@href').extract_first()
             if wx_account_ == wx_account:
                 self.logDao.info(u'成功抓取:' + wx_account_)
                 self.wxSourceDao.updateSource(wx_account, wx_name, wx_url, 'last')
                 hasCatch = True
                 break
         if not hasCatch:
             self.logDao.info(u'没有抓到:' + wx_account)
             self.wxSourceDao.updateStatus(wx_account, 'none')
         pass
Example #6
    def parseArticleList(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            # Strip the surrounding JSONP parentheses and decode the payload
            jsonStr = demjson.decode(body.lstrip('(').rstrip(')')) or {}
            rst = jsonStr.get('rst', '')
            if not rst:
                self.logDao.info(u'不存在内容')
                return
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=rst)
            articles = selector.xpath('//div[@class="news-img"]/a')
            for article in articles:
                source_url = article.xpath('@href').extract_first('')
                title = article.xpath('@title').extract_first('')
                # Skip if the article has already been crawled
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                    continue

                if not source_url:
                    self.logDao.info(u'文章不存在' + title + ':' + source_url)
                    continue

                self.logDao.info(u'抓取文章' + title + ':' + source_url)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'jiemian_detail',
                                         "title": title,
                                         "source_url": source_url
                                     },
                                     callback=self.parseArticle)
Example #7
    def parseArticleList2(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=body)
            articles = selector.xpath('//div[@class="list yaowen"]//li[boolean(contains(@class, "item"))]/a[1]')
            for article in articles:
                source_url = article.xpath('./@href').extract_first('')
                title = article.xpath('./text()').extract_first('')
                if not source_url:
                    continue
                # Skip if the article has already been crawled
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + source_url)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'tengxun_detail',
                                         "title": title,
                                         'post_date': '',
                                         'source_url': source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel,
                                     },
                                     callback=self.parseArticle)
Example #8
    def parseList(self, response):
        data = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            url = response.meta['url']
            self.logDao.info(u'开始解析列表' + url)

            data = data.lstrip('var jsonData = ').rstrip(';')
            # Decode the JSON payload
            data = demjson.decode(data) or {}

            items = data.get('list', [])

            for item in items:
                channel = item.get('channel', {})
                channel_name = channel.get('title', '')
                title = item.get('title', '')
                source_url = item.get('url', '')
                callback = self.parseDetail2
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在:' + title + source_url)
                    continue
                self.logDao.info(u"开始抓取文章:" + item['url'])
                yield scrapy.Request(url=item['url'],
                                     meta={
                                         'request_type': 'sina_detail',
                                         'category': channel_name,
                                         'title': title,
                                         'source_url': source_url
                                     },
                                     callback=callback)
Example #9
    def parseList3(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            url = response.meta['url']
            self.logDao.info(u'开始解析列表' + url)

            selector = Selector(text=body)
            articles = selector.xpath('//*[@id="fin_tabs0_c0"]//a')
            # Walk each list entry
            for article in articles:
                title = article.xpath('./text()').extract_first('')
                source_url = article.xpath('./@href').extract_first('')
                if not source_url:
                    continue

                callback = self.parseDetail2
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在:' + title + source_url)
                    continue
                self.logDao.info(u"开始抓取文章:" + source_url)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'sina_detail',
                                         'title': title,
                                         'source_url': source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel
                                     },
                                     callback=callback)
Example #10
    def parseArticleList(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=body)
            articles = selector.xpath('//div[@class="mod newslist"]//li')
            for article in articles:
                source_url = article.xpath('a/@href').extract_first('')
                title = article.xpath('a/text()').extract_first('')
                post_date = article.xpath('span/text()').extract_first('')
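                # Prepend the current year, since the list's date string omits it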
                post_date = time.strftime('%Y', time.localtime(
                    time.time())) + u'年' + post_date
                if not source_url:
                    continue
                # Skip if the article has already been crawled
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + post_date +
                                     ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' +
                                 source_url)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'demoName_detail',
                                         "title": title,
                                         'post_date': post_date,
                                         "source_url": source_url
                                     },
                                     callback=self.parseArticle)
Example #11
 def parseArticleList(self, response):
     body = EncodeUtil.toUnicode(response.body)
     selector = Selector(text=body)
     source_url = response.meta['source_url']
     print source_url
     title = selector.xpath('//title/text()').extract_first('').strip(u' ')
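     # A title of "请输入验证码" (enter the captcha) means this IP has been rate-limited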
     isN = u"请输入验证码" == title
     if isN or response.status == 302:
         self.logDao.info(u'访问过多被禁止,重新拨号')
         # Get a new IP and idle the thread while it takes effect
         NetworkUtil.getNewIp()
         TimerUtil.sleep(50)
         NetworkUtil.openWebbrowser(source_url)
     else:
         source = response.meta['source']
         wx_account = response.meta['wx_account']
         wx_account_id = response.meta['wx_account_id']
         self.logDao.info(u'开始解析列表:' + wx_account)
         # Parse the page
         articleJS = selector.xpath('//script/text()').extract()
         for js in articleJS:
             if 'var msgList = ' in js:
                 p8 = re.compile('var\s*msgList\s*=.*;')
                 matchList = p8.findall(js)
                 for match in matchList:
                     match = match.lstrip('var msgList = ').rstrip(';')
                     # Decode the JSON payload
                     articles = demjson.decode(match) or {}
                     articles = articles['list'] or []
                     self.logDao.info(u'匹配到文章列表' + wx_account)
                     for article in articles:
                         app_msg_ext_info = article.get(
                             'app_msg_ext_info') or {}
                         desc = app_msg_ext_info.get('digest') or ''
                         title = app_msg_ext_info.get('title') or ''
                         # Skip if the article has already been crawled
                         if self.checkDao.checkExist(title, wx_account, 1):
                             self.logDao.info(u'已经存在' + wx_account + ':' +
                                              title)
                             continue
                         detailUrl = app_msg_ext_info['content_url'] or ''
                         if not detailUrl:
                             continue
                         detailUrl = "http://mp.weixin.qq.com" + detailUrl
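                         # content_url comes back HTML-escaped; crudely restore "&" by dropping "amp;"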
                         detailUrl = detailUrl.replace("amp;", "")
                         self.logDao.info(u'抓取' + wx_account + ':' + title +
                                          ':' + detailUrl)
                         yield scrapy.Request(url=detailUrl,
                                              meta={
                                                  'request_type':
                                                  'weixin_detail',
                                                  'wx_account': wx_account,
                                                  "source": source,
                                                  "title": title,
                                                  'wx_account_id':
                                                  wx_account_id,
                                                  "source_url": detailUrl
                                              },
                                              callback=self.parseArticle)
Example #12
    def parseArticleList(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            # Strip the surrounding JSONP parentheses and decode the payload
            jsonStr = demjson.decode(body.lstrip('(').rstrip(')')) or {}
            rst = jsonStr.get('rst', '')
            if not rst:
                self.logDao.info(u'不存在内容')
                return
            self.logDao.info(u'开始解析列表')
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            selector = Selector(text=rst)
            articles = selector.xpath(
                '//div[boolean(contains(@class,"news-view"))]')
            for article in articles:
                source_url = article.xpath(
                    './/div[@class="news-header"]//a/@href').extract_first('')
                title = article.xpath(
                    './/div[@class="news-header"]//a/@title | .//div[@class="news-header"]//a/text()'
                ).extract_first('')
                post_date = article.xpath(
                    './/div[@class="news-footer"]//span[@class="date"]/text()'
                ).extract_first('')
                tags = article.xpath(
                    './/div[@class="news-tag"]/a/text()').extract()

                if not source_url:
                    self.logDao.info(u'文章不存在' + title + ':' + source_url +
                                     ':' + post_date)
                    continue

                # Skip if the article has already been crawled
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + source_url +
                                     ':' + post_date)
                    continue

                self.logDao.info(u'抓取文章' + title + ':' + source_url + ':' +
                                 post_date)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'jiemian_detail',
                                         "title": title,
                                         'post_date': post_date,
                                         'sub_channel': sub_channel,
                                         'src_channel': src_channel,
                                         'tags': tags,
                                         'source_url': source_url
                                     },
                                     callback=self.parseArticle)
Example #13
    def parseArticleList(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            # Strip the JSONP wrapper and decode the payload
            articles = demjson.decode(
                body.lstrip('/**/').lstrip('(').rstrip(';').rstrip(')')) or []
            if not articles:
                self.logDao.info(u'不存在内容')
                return
            self.logDao.info(u'开始解析列表')
            for article in articles:
                id = article.get('id', '')
                authorId = article.get('authorId', '')
                if not id or not authorId:
                    continue
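                # Sohu article pages live at /a/<id>_<authorId>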
                pageId = str(id) + '_' + str(authorId)
                source_url = 'http://www.sohu.com/a/' + pageId + '?loc=1&focus_pic=0'
                title = article.get('title', '')
                post_user = article.get('authorName', '')
                tags = article.get('tags', [])
                tagsStr = []
                for tag in tags:
                    tagsStr.append(tag.get('name', ''))

                publicTime = article.get('publicTime', time.time() * 1000)
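                # publicTime is a millisecond timestamp; convert to seconds for time.localtime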
                post_date = time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime(publicTime / 1000))
                # Skip if the article has already been crawled
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + post_date +
                                     ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' +
                                 source_url)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'sohu_detail',
                                         'title': title,
                                         'post_date': post_date,
                                         'post_user': post_user,
                                         'tags': tagsStr,
                                         'source_url': source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel
                                     },
                                     callback=self.parseArticle)
Example #14
    def parse(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            pass
        else:
            # Decode the JSON payload
            jsonStr = demjson.decode(body) or {}
            articles = jsonStr.get('result') or []
            for article in articles:
                default_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(time.time()))
                title = article.get('title', '')
                post_date = article.get('date', default_time)
                summary = article.get('summary', '')
                keywords = article.get('keywords', [])
                source_url = article.get('url')
                content_type = article.get('content_type',
                                           'news')  # news video
                image_urls = article.get('image_urls', [])
                if not title or not source_url or content_type == 'video':
                    continue
                # Skip if the article has already been crawled
                if self.checkDao.checkExist(source_url):
                    self.infoStr('已经存在')
                    continue
                contentItem = ContentItem()
                contentItem['title'] = title
                contentItem['post_date'] = post_date
                contentItem['summary'] = summary
                contentItem['keywords'] = ','.join(keywords)
                contentItem['source_url'] = source_url
                contentItem['content_type'] = content_type

                image_urls_new = []
                for image_url in image_urls:
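                    # References that are not absolute http(s) URLs are wrapped in the go2yd thumbnail endpoint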
                    if not image_url.startswith('http'):
                        image_url = 'http://i1.go2yd.com/image.php?type=thumbnail_336x216&url=%s' % image_url
                    image_urls_new.append(image_url)

                contentItem['image_urls'] = ','.join(image_urls_new)

                self.infoStr(u'抓取文章' + title + ':' + post_date + ':' +
                             source_url)
                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'zixun_detail',
                                         "title": title,
                                         'contentItem': contentItem,
                                         "source_url": source_url
                                     },
                                     callback=self.parseArticle)
Example #15
    def parseArticleList(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            self.logDao.info(u'开始解析列表')
            selector = Selector(text=body)
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            articles = selector.xpath('//dl[@class="f-cb dl-item"]')
            for article in articles:
                source_url = article.xpath('.//h3[@class="f-ff1 f-fwn f-fs22"]/a/@href').extract_first('')
                title = article.xpath('.//h3[@class="f-ff1 f-fwn f-fs22"]/a/text()').extract_first('')
                sub_channel_ = article.xpath('.//h5[@class="f-ff1 f-fwn f-fs14"]/a/text()').extract_first('')
                post_date = article.xpath('.//h4[@class="f-ff1 f-fwn f-fs14"]/span/text()').extract_first('')

                if not source_url:
                    continue

                if sub_channel_:
                    sub_channel = EncodeUtil.toUnicode(sub_channel) + u',' + EncodeUtil.toUnicode(sub_channel_)

                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + post_date + ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' + source_url)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'diyicaijing_detail',
                                         "title": title,
                                         'post_date': post_date,
                                         'source_url': source_url,
                                         'sub_channel': sub_channel,
                                         'src_channel': src_channel,
                                     },
                                     callback=self.parseArticle)
Example #16
    def parseArticleList(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            # The feed is a JS assignment (TradeTab_JsonData=...); strip the prefix and decode
            articles = demjson.decode(body.lstrip('TradeTab_JsonData=')) or []
            if not articles:
                self.logDao.info(u'不存在内容')
                return
            self.logDao.info(u'开始解析列表')
            for article in articles:
                source_url = article.get('titleLink', '')
                title = article.get('title', '')
                post_date = article.get(
                    'dateInf',
                    time.strftime('%Y-%m-%d %H:%M',
                                  time.localtime(time.time())))
                if not source_url:
                    self.logDao.info(u'文章不存在' + title + ':' + source_url +
                                     ':' + post_date)
                    continue
                # Don't crawl items older than a certain cutoff
                # TODO..
                # Skip if the article has already been crawled
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在' + title + ':' + post_date +
                                     ':' + source_url)
                    continue
                self.logDao.info(u'抓取文章' + title + ':' + post_date + ':' +
                                 source_url)

                yield scrapy.Request(url=source_url,
                                     meta={
                                         'request_type': 'hexun_detail',
                                         'title': title,
                                         'post_date': post_date,
                                         'source_url': source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel
                                     },
                                     callback=self.parseArticle)
Example #17
    def parseList(self, response):
        data = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            url = response.meta['url']
            self.logDao.info(u'开始解析列表' + url)

            data = data.lstrip('var jsonData = ').rstrip(';')
            # Decode the JSON payload
            data = demjson.decode(data) or {}

            items = data.get('list', [])

            for item in items:
                channel = item.get('channel', {})
                channel_name = channel.get('title', '')
                if channel_name:
                    sub_channel = sub_channel + ',' + channel_name
                title = item.get('title', '')
                source_url = item.get('url', '')
                callback = self.parseDetail2
                if self.checkDao.checkExist(source_url):
                    self.logDao.info(u'文章已经存在:' + title + source_url)
                    continue
                self.logDao.info(u"开始抓取文章:" + item['url'])
                # item['url'] = "http://tech.sina.com.cn/d/f/2017-07-31/doc-ifyinvwu3872514.shtml"
                yield scrapy.Request(url=item['url'],
                                     meta={
                                         'request_type': 'sina_detail',
                                         'title': title,
                                         'source_url': source_url,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel
                                     },
                                     callback=callback)
Example #18
    def parseArticle(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)

            selector = Selector(text=body)

            # Collect the page's stylesheets
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # Hash the URL to use as the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store it
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')

            # Clean up the URLs inside the styles
            styles = CssUtil.clearUrl(styles)

            post_user = selector.xpath(
                '//div[@class="tip fl"]/text()').extract_first('').replace(
                    '\r', '').replace('\t', '').replace('\n', '')

            src_ref = selector.xpath(
                '//div[@class="tip fl"]/a/text()').extract_first('')

            post_date = selector.xpath(
                '//div[@class="tip fl"]/span[@class="pr20"]/text()'
            ).extract_first('')
            if post_date:
                try:
                    post_date = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
                except Exception as e:
                    self.logDao.warn(e.message)
                    pass

            content_html = selector.xpath('//div[@class="art_contextBox"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe") and not(boolean(name(.)="div" and @style="text-align:right;font-size:12px"))]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Extract the plain text
            content_txt = []
            for item in content_items:
                # Collect the text nodes
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append to the running text
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Assemble the new content markup
            outHtml = """<div class="art_context"><div class="art_contextBox" style="visibility: visible; height:auto;">${++content++}</div></div> """
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document and substitute it back into the markup
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may live in src or data-src
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
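            # Archive the raw page, keyed by the md5 of the source URL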
            self.saveFile(urlHash, body)

            # Get the hash code for the source URL
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip the images' alt/title text
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Remove each alt/title value from the markup (see //img/@alt|//img/@title)
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 6
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem
Example #19
    def parseArticle(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)

            selector = Selector(text=body)

            # Collect the page's stylesheets
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # Hash the URL to use as the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store it
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')

            # Clean up the URLs inside the styles
            styles = CssUtil.clearUrl(styles)

            category = selector.xpath(
                '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
            ).extract_first('')

            post_user = selector.xpath(
                '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
            ).extract_first('')

            src_ref = selector.xpath(
                '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
            ).extract_first('')

            a_time = selector.xpath(
                '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
            ).extract_first('')

            if a_time:
                post_date = a_time
            else:
                post_date = (post_date or '')

            post_date = post_date.replace(u'年', '-').replace(
                u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')

            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                pass

            content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe") and not(boolean(@class="rv-root-v2 rv-js-root"))]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Extract the plain text
            content_txt = []
            for item in content_items:
                # Collect the text nodes
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append to the running text
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Assemble the new content markup
            outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document and substitute it back into the markup
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may live in src or data-src
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Get the hash code for the source URL
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip the images' alt/title text
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Remove each alt/title value from the markup (see //img/@alt|//img/@title)
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 3
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '腾讯科技'
            contentItem['src_ref'] = src_ref
            return contentItem
Example #20
    def parseArticle(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            sub_channel = response.meta['sub_channel']
            src_channel = response.meta['src_channel']
            tags = response.meta['tags']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)

            selector = Selector(text=body)

            # Collect the page's stylesheets
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # Hash the URL to use as the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store it
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')

            # Clean up the URLs inside the styles
            styles = CssUtil.clearUrl(styles)
            styles = CssUtil.clearBackgroundColor(styles, ['#f5f5f5'])

            post_user = selector.xpath(
                '//div[@class="article-info"]//span[@class="author"]//text()'
            ).extract_first('')

            src_ref = src_channel

            post_date = selector.xpath(
                '//div[@class="article-info"]//span[@class="date"]//text()'
            ).extract_first('')

            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y/%m/%d %H:%M"))
            except Exception:
                pass

            tags_ = selector.xpath(
                '//div[@class="article-info"]//*[@class="tags"]//text()'
            ).extract()
            tags = tags + tags_
            tags = ','.join(tags)
            """
                article-main
                    article-img
                    article-content
                        p
                        article-source
                            p: source attribution
                            p: "download the 界面新闻 APP" prompt - not crawled
            """

            # Get the article-img block
            article_img = selector.xpath(
                '//div[@class="article-main"]/div[@class="article-img"]'
            ).extract_first('')

            # Get the article-content block
            article_content = selector.xpath(
                '//div[@class="article-main"]/div[@class="article-content"]'
            ).extract_first('')

            if not article_content:
                self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' +
                                 post_date)
                return

            contentSelector = Selector(text=article_content)
            content_items = contentSelector.xpath(
                '//div[@class="article-content"]/*[not(name(.)="script") and not('
                'name(.)="iframe") and not(name(.)="style") and not(boolean( '
                'contains(a//@href,"?m=app"))) and not(boolean(@class="share-view" '
                'or @class="article-source"))]')

            # Get the source attribution and use it as src_ref
            contentSource = contentSelector.xpath(
                '//div[@class="article-content"]/div[@class="article-source"]/p/text()'
            ).extract_first('')
            if contentSource:
                contentSource = contentSource.replace(u'来源:', u'')
                src_ref = contentSource

            # Extract the plain text
            content_txt = []
            for item in content_items:
                # Collect the text nodes
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append to the running text
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Assemble the new content markup
            outHtml = u"""<div class="article-main">${++articleImg++}<div class="article-content" style="font-family:
            'Microsoft YaHei', 黑体;">${++content++}</div></div> """

            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++articleImg++}',
                                           article_img).replace(
                                               '${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document and substitute it back into the markup
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may live in src or data-src
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Get the hash code for the source URL
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip the images' alt/title text
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Remove each alt/title value from the markup (see //img/@alt|//img/@title)
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = post_user
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 5
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem
Example #21
    def parseArticle(self, response):
        body = EncodeUtil.toUnicode(response.body)
        selector = Selector(text=body)
        title = selector.xpath('//title/text()').extract_first('').strip(u' ')
        source_url = response.meta['source_url']
        wx_account = response.meta['wx_account']
        isN = u"请输入验证码" == title
        if isN or response.status == 302:
            self.logDao.info(u'访问过多被禁止,重新拨号')
            # Remember this account as broken
            self.brokenAccounts.append(wx_account)
            # Get a new IP and idle the thread while it takes effect
            NetworkUtil.getNewIp()
            TimerUtil.sleep(80)
            NetworkUtil.openWebbrowser(source_url)
        else:
            title = response.meta['title']
            source_url = response.meta['source_url']
            wx_account_id = response.meta['wx_account_id']
            self.logDao.info(u'开始解析文章' + wx_account + ':' + title + ':' +
                             source_url)
            self.logDao.info(u'开始解析文章:' + source_url)
            # Parse the page
            post_date = selector.xpath(
                '//*[@id="post-date"]/text()').extract_first('')

            try:
                post_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.strptime(post_date, "%Y-%m-%d"))
            except Exception:
                pass

            styles = selector.xpath('//style/text()').extract()
            styles = CssUtil.compressCss(styles).replace('\'', '"').replace(
                '\\', '\\\\')
            styles = CssUtil.clearUrl(styles)
            styles = CssUtil.clearBackgroundColor(styles, ['#f3f3f3'])

            post_user = selector.xpath(
                '//*[@id="post-user"]/text()').extract_first('')
            content_html = selector.xpath('//*[@id="js_content"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags
            content_items = content_html.xpath('*')
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # content_items_new = []
            # for item in content_items:
            #     itemStr = item.extract()
            #     if u'订阅微信' in itemStr:
            #         continue
            #     content_items_new.append(item)
            # content_items = content_items_new

            # Extract the plain text
            content_txt = []
            for item in content_items:
                # Collect the text nodes
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append to the running text
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Assemble the new content markup
            outHtml = """<div class="rich_media_content " id="js_content">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)

            # Parse every image URL in the document and substitute it back into the markup
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may live in src or data-src
                image_url = img.xpath('@src | @data-src').extract_first('')
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
            self.logDao.info(wx_account + u'得到文章:' + title + ":" + post_date +
                             ':' + post_user)
            self.logDao.info(u'得到文章:' + source_url)

            # Get the hash code
            hash_code = self.checkDao.getHashCode(title, wx_account, 1)

            self.saveFile(hash_code, body)

            # Strip the images' alt/title text
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Remove each alt/title value from the markup (see //img/@alt|//img/@title)
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = ''
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 1
            contentItem['src_account_id'] = wx_account_id
            contentItem['src_channel'] = '微信公众号'
            contentItem['src_ref'] = ''
            contentItem['wx_account'] = wx_account

            return contentItem
Example #22
    def parseArticle(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if response.status == 302:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            styleUrlDefault = response.meta['styleUrlDefault']
            title = response.meta['title']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + source_url)

            selector = Selector(text=body)

            # Collect the page's stylesheets
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleUrls = styleUrls + styleUrlDefault
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # Hash the URL to use as the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store it
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')

            # Rewrite URLs inside the styles
            styles = CssUtil.clearUrl(styles)
            styles = CssUtil.clearBackgroundColor(styles, ['#eaeaea'])

            tags = selector.xpath(
                '//meta[@name="keywords"]/@content').extract_first('')

            category = selector.xpath(
                '//meta[boolean(contains(@name, "og:category"))]/@content'
            ).extract_first('')
            if category:
                sub_channel = sub_channel + ',' + category

            src_ref = selector.xpath(
                '//span[@class="ss03"]//text()').extract_first('')
            if not src_ref.replace('\n', '').replace(' ', ''):
                src_ref = selector.xpath(
                    '//div[@id="artical_sth"]/p/text()').extract()
                src_ref = ''.join(src_ref).replace('\n', '').replace(
                    u'来源:', '').replace(' ', '')

            post_date = selector.xpath(
                '//meta[@name="og:time"]/@content').extract_first('')
            post_date = post_date.replace(u'年', '-').replace(
                u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')

            # Normalize the date: try the full format first, then fall back to a minutes-only format
            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
            except Exception:
                try:
                    post_date = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.strptime(post_date, "%Y-%m-%d %H:%M"))
                except Exception as e:
                    self.logDao.warn(e.message)

            content_html = selector.xpath('//div[@id="main_content"]')
            logoHtml = selector.xpath(
                '//span[@class="ifengLogo"]').extract_first('')

            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe")]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Build the plain-text version
            content_txt = []
            for item in content_items:
                # Text nodes
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Assemble the new content wrapper
            outHtml = """<div id="artical_real" class="js_img_share_area"><div id="main_content" class="js_selection_area" bosszone="content">${++content++}</div></div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)
            content_html = content_html.replace(logoHtml, '')

            selector = Selector(text=content_html)
            # Parse every image URL in the document, then replace it with a placeholder
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may be in src or data-src
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Compute the hash code
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip image alt/title text
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Clear alt/title text on tooltip images, matched by //img/@alt|//img/@title
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = ''
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 8
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem
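The date handling above (replace 年/月/日, then try two strptime formats) recurs in several spiders and can be factored into one helper. A minimal sketch, assuming the helper name normalize_post_date is introduced here purely for illustration:

# -*- coding: utf-8 -*-
import time


def normalize_post_date(post_date):
    """Normalize dates like u'2017年07月24日 19:23' to '%Y-%m-%d %H:%M:%S'.

    Returns the cleaned string unchanged if neither format matches.
    """
    post_date = post_date.replace(u'年', '-').replace(u'月', '-') \
                         .replace(u'日', u' ').replace(u'\xa0', u' ')
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"):
        try:
            return time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.strptime(post_date, fmt))
        except ValueError:
            continue
    return post_date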
Example #23
    def parseDetail2(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            category = response.meta['category']
            title = response.meta['title']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + category + ':' + source_url)

            selector = Selector(text=body)

            # Collect stylesheets
            styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                # Use the MD5 hash as the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store it
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
            styles = CssUtil.clearUrl(styles)
            styles = styles.replace('overflow-x:hidden', '').replace('overflow:hidden', '')

            post_date = selector.xpath('//*[@id="pub_date"]/text() | //*[@class="titer"]/text()').extract_first('')
            post_date = post_date.replace('\r\n', '').strip(' ').replace(u'年', '-').replace(u'月', '-').replace(u'日', ' ')

            try:
                post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                pass

            src_ref = selector.xpath(
                '//*[@id="media_name"]/a[1]/text() | //*[@class="source"]/a/text() | //*[@class="source"]/text()').extract_first(
                '')

            post_user = selector.xpath('//*[@id="author_ename"]/a/text()').extract_first('')

            tags = selector.xpath('//p[@class="art_keywords"]/a/text()').extract() or []
            tags = ','.join(tags)

            content_html = selector.xpath('//*[@id="artibody"][1]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags (2017-07-24 19:23)
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath('*[not(boolean(@class="entHdPic")) and not(name(.)="script")]')

            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Build the plain-text version
            content_txt = []
            for item in content_items:
                # Text nodes
                # TODO: handle later, extract the heading type
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                if u'来源:' in allTxt and len(allTxt) < 25:
                    # This is the real source line
                    if not post_user:
                        # No author yet: use the previous source value as the author first
                        post_user = src_ref
                    src_ref = allTxt.replace(u'来源:', '').strip(u' ')
                # Append
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)
            # Assemble the new content wrapper
            outHtml = """<div class="content_wrappr_left"><div class="content"><div class="BSHARE_POP blkContainerSblkCon clearfix blkContainerSblkCon_16" id="artibody">${++content++}</div></div></div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document, then replace it with a placeholder
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may be in src
                image_url = img.xpath('@src').extract_first()
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Compute the hash code
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip image alt/title text
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Clear alt/title text on tooltip images, matched by //img/@alt|//img/@title
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 2
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '新浪科技'
            contentItem['src_ref'] = src_ref
            return contentItem
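Each example repeats the same stylesheet step: hash the URL with MD5 and only download on a cache miss. A standalone sketch of that memoization pattern follows, using hashlib and requests here in place of the project's EncryptUtil and CssUtil helpers, which are not shown in these excerpts:

import hashlib

import requests

_css_cache = {}


def get_style(style_url):
    """Download a stylesheet once and memoize it by the MD5 of its URL."""
    if style_url.startswith('//'):
        # Protocol-relative URLs, normalized the same way as in the spiders
        style_url = 'http:' + style_url
    key = hashlib.md5(style_url.encode('utf8')).hexdigest()
    if key not in _css_cache:
        _css_cache[key] = requests.get(style_url, timeout=10).text
    return _css_cache[key]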
Example #24
    def parseArticle(self, response):
        body = EncodeUtil.toUnicode(response.body)
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)

            selector = Selector(text=body)

            # Inline CSS already present on the page
            pageStyles = selector.xpath('//style/text()').extract()
            # Collect stylesheets
            styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # Use the MD5 hash as the cache key
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download and store it
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            styleList = pageStyles + styleList
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')

            # Rewrite URLs inside the styles
            styles = CssUtil.clearUrl(styles)

            tags = selector.xpath('//meta[@name="keywords"]/@content').extract_first('')

            src_ref = selector.xpath('//*[@class="m-title f-pr"]/h2[@class="f-ff1 f-fwn f-fs14"]/i//text()').extract_first('')

            content_html = selector.xpath('//div[@class="m-text"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath('*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe")]')
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Build the plain-text version
            content_txt = []
            for item in content_items:
                # Text nodes
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Assemble the new content wrapper
            outHtml = """<div class="m-text">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document, then replace it with a placeholder
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may be in src or data-src
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(image_url_base, image_url)

            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # Compute the hash code
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip image alt/title text
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Clear alt/title text on tooltip images, matched by //img/@alt|//img/@title
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = ''
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 9
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem
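The image handling at the end of each example also follows one recipe: collect absolute image URLs (fixing protocol-relative ones in place) and strip alt/title text from the rebuilt HTML. A condensed sketch of that step, using scrapy.selector.Selector as in the examples; the function name extract_images is chosen here only for illustration:

from scrapy.selector import Selector


def extract_images(content_html):
    """Return (image_urls, cleaned_html) for a rebuilt content fragment."""
    selector = Selector(text=content_html)
    image_urls = []
    for img in selector.xpath('descendant::img'):
        # The image URL may be in src or data-src
        src = img.xpath('@src | @data-src').extract_first('')
        url = 'http:' + src if src.startswith('//') else src
        if url.startswith('http'):
            image_urls.append({'url': url})
            content_html = content_html.replace(src, url)
    # Drop alt/title text so it does not leak into the stored HTML
    for txt in selector.xpath('//img/@alt | //img/@title').extract():
        if txt.strip():
            content_html = content_html.replace(txt, '')
    return image_urls, content_html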