Beispiel #1
0
 def parseArticle(self, response):
     """Parse a fetched article page into a ContentItem-like dict.

     Reads title/post_date/source_url from ``response.meta``, downloads and
     caches the page's stylesheets into ``self.css``, stores the compressed
     css in the item, and returns the item when the content node
     ``#imedia-article`` exists (implicitly returns ``None`` otherwise).
     """
     source_url = response.meta['source_url']
     body = EncodeUtil.toUnicode(response.body)
     # NOTE(review): ban detection is disabled -- `if False` never fires, so
     # the else branch always runs. Confirm whether a real "blocked" check
     # was intended here.
     if False:
         self.infoStr(u'访问过多被禁止')
     else:
         self.infoStr(u'开始解析界面')
         title = response.meta['title']
         post_date = response.meta['post_date']
         # NOTE(review): source_url was already read above -- redundant.
         source_url = response.meta['source_url']
         contentItem = response.meta['contentItem']
         selector = Selector(text=body)
         # Collect external stylesheet URLs and inline <style> bodies.
         styleUrls = selector.xpath(
             '//link[@rel="stylesheet"]/@href | //style/text()').extract()
         styleList = []
         for styleUrl in styleUrls:
             # md5 of the URL (or style text) serves as the cache key.
             styleUrlHash = EncryptUtil.md5(styleUrl)
             if not self.css.get(styleUrlHash):
                 # Not cached yet: download and remember the stylesheet.
                 self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
             styleList.append(self.css[styleUrlHash])
         # Compress, then escape quotes/backslashes for later embedding.
         styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
             '\\', '\\\\')
         styles = CssUtil.clearUrl(styles)
         contentItem['styles'] = styles
         content_html = selector.xpath('//*[@id="imedia-article"]')
         if len(content_html):
             contentItem['content_html'] = content_html.extract_first('')
             return contentItem
Beispiel #2
0
    def process_item(self, item, spider):
        image_urls = []
        for image_url in item['image_urls']:
            url = image_url.get('url')
            urlHash = EncryptUtil.md5(url)
            path = 'full/' + str(urlHash) + '.jpg'
            detailPath = self.savePath + '/' + path
            # 创建目录
            saveDir = self.savePath + '/full'
            if not FileUtil.dirIsExist(saveDir):
                FileUtil.createDir(saveDir)

            if FileUtil.fileIsExist(detailPath):
                image_url_new = {'ok': True, 'x': {'url': url, 'path': path}}
            else:
                try:
                    fileResponse = requests.get(url, timeout=10)
                    req_code = fileResponse.status_code
                    req_msg = fileResponse.reason
                    if req_code == 200:
                        open(detailPath, 'wb').write(fileResponse.content)
                        image_url_new = {
                            'ok': True,
                            'x': {
                                'url': url,
                                'path': path
                            }
                        }
                    else:
                        print '下载图片失败', url
                        image_url_new = {
                            'ok': False,
                            'x': {
                                'url': url,
                            }
                        }
                except Exception, e:
                    print e
                    print '下载图片失败', url
                    image_url_new = {
                        'ok': False,
                        'x': {
                            'url': url,
                        }
                    }
            image_urls.append(image_url_new)
            # 空转2s
            TimerUtil.sleep(2)
Beispiel #3
0
 def getHashCode(self, source_url):
     """Return the md5 digest of *source_url*, used as its de-dup hash code."""
     digest = EncryptUtil.md5(source_url)
     return digest
    def parseArticle(self, response):
        """Parse a Tencent Tech article detail page into a ``ContentItem``.

        Reads title/post_date/source_url from ``response.meta``; downloads and
        caches stylesheets; extracts category/author/source/publish time;
        rebuilds the main content html; collects image URLs; saves the raw
        page body keyed by the URL's md5; returns the populated item, or
        ``None`` when no content node is found.
        """
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): ban detection is disabled -- `if False` never fires,
        # so the else branch always runs. Confirm intent.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)

            selector = Selector(text=body)

            # Collect external stylesheet URLs.
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # md5 of the URL serves as the cache key.
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download the stylesheet and keep it.
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            # Compress, then escape quotes/backslashes for later embedding.
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')

            # Rewrite/strip URLs referenced inside the styles.
            styles = CssUtil.clearUrl(styles)

            category = selector.xpath(
                '//*[@class="a_catalog"]/a/text()|//*[@class="a_catalog"]/text()'
            ).extract_first('')

            post_user = selector.xpath(
                '//*[@class="a_author"]/text() | //*[@class="where"]/text()| //*[@class="where"]/a/text()'
            ).extract_first('')

            src_ref = selector.xpath(
                '//*[@class="a_source"]/text() | //*[@class="a_source"]/a/text()'
            ).extract_first('')

            a_time = selector.xpath(
                '//*[@class="a_time"]/text() | //*[@class="pubTime"]/text()'
            ).extract_first('')

            # Prefer the publish time found on the page over the listing meta.
            if a_time:
                post_date = a_time
            else:
                post_date = (post_date or '')

            # Normalize Chinese date markers (year/month/day) and nbsp.
            post_date = post_date.replace(u'年', '-').replace(
                u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')

            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                # Keep post_date unchanged when it does not match the format.
                pass

            content_html = selector.xpath('//div[@id="Cnt-Main-Article-QQ"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags (script/style/iframe/video root).
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe") and not(boolean(@class="rv-root-v2 rv-js-root"))]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Extract the plain-text version of the article.
            content_txt = []
            for item in content_items:
                # All text nodes under this element.
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append this element's text as one line.
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Re-wrap the kept elements in a fresh container tag.
            outHtml = """<div id="Cnt-Main-Article-QQ" class="Cnt-Main-Article-QQ" bosszone="content">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document and normalize it in place.
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may live in src (or data-src).
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    # Protocol-relative URL: force http.
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            # Persist the raw page body keyed by the md5 of its URL.
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # De-dup hash code for this article.
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip img alt/title text out of the html.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Drop each non-blank alt/title occurrence from content_html.
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 3
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '腾讯科技'
            contentItem['src_ref'] = src_ref
            return contentItem
Beispiel #5
0
    def parseArticle(self, response):
        """Parse a Jiemian-style article detail page into a ``ContentItem``.

        Reads title/post_date/source_url/sub_channel/src_channel/tags from
        ``response.meta``; downloads and caches stylesheets; extracts author,
        publish time, tags and the real source line; rebuilds the content
        html; collects image URLs; saves the raw page body; returns the
        populated item, or ``None`` when no article content exists.
        """
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): ban detection is disabled -- `if False` never fires,
        # so the else branch always runs. Confirm intent.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            sub_channel = response.meta['sub_channel']
            src_channel = response.meta['src_channel']
            tags = response.meta['tags']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)

            selector = Selector(text=body)

            # Collect external stylesheet URLs.
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # md5 of the URL serves as the cache key.
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download the stylesheet and keep it.
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            # Compress, then escape quotes/backslashes for later embedding.
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')

            # Rewrite/strip URLs referenced inside the styles and drop the
            # page's gray background color.
            styles = CssUtil.clearUrl(styles)
            styles = CssUtil.clearBackgroundColor(styles, ['#f5f5f5'])

            post_user = selector.xpath(
                '//div[@class="article-info"]//span[@class="author"]//text()'
            ).extract_first('')

            # Default the source reference to the channel; may be replaced by
            # the article's own source line below.
            src_ref = src_channel

            post_date = selector.xpath(
                '//div[@class="article-info"]//span[@class="date"]//text()'
            ).extract_first('')

            try:
                post_date = time.strftime(
                    "%Y-%m-%d %H:%M:%S",
                    time.strptime(post_date, "%Y/%m/%d %H:%M"))
            except Exception:
                # Keep post_date unchanged when it does not match the format.
                pass

            tags_ = selector.xpath(
                '//div[@class="article-info"]//*[@class="tags"]//text()'
            ).extract()
            # Merge meta tags with on-page tags into one comma-joined string.
            # NOTE(review): assumes meta 'tags' is a list -- confirm caller.
            tags = tags + tags_
            tags = ','.join(tags)
            """
                article-main
                    article-img
                    article-content
                        p
                        article-source
                            p:来源
                            p:点击下载“界面新闻”APP 不抓
            """

            # Get the article-img block.
            article_img = selector.xpath(
                '//div[@class="article-main"]/div[@class="article-img"]'
            ).extract_first('')

            # Get the article-content block.
            article_content = selector.xpath(
                '//div[@class="article-main"]/div[@class="article-content"]'
            ).extract_first('')

            if not article_content:
                self.logDao.info(u'文章不存在' + title + ':' + source_url + ':' +
                                 post_date)
                return

            # Keep content children except scripts/iframes/styles, app-download
            # links, share widgets and the source footer.
            contentSelector = Selector(text=article_content)
            content_items = contentSelector.xpath(
                '//div[@class="article-content"]/*[not(name(.)="script") and not('
                'name(.)="iframe") and not(name(.)="style") and not(boolean( '
                'contains(a//@href,"?m=app"))) and not(boolean(@class="share-view" '
                'or @class="article-source"))]')

            # Extract the real source line and use it as src_ref.
            contentSource = contentSelector.xpath(
                '//div[@class="article-content"]/div[@class="article-source"]/p/text()'
            ).extract_first('')
            if contentSource:
                contentSource = contentSource.replace(u'来源:', u'')
                src_ref = contentSource

            # Extract the plain-text version of the article.
            content_txt = []
            for item in content_items:
                # All text nodes under this element.
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append this element's text as one line.
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Re-wrap the kept elements in a fresh container tag.
            outHtml = u"""<div class="article-main">${++articleImg++}<div class="article-content" style="font-family:
            'Microsoft YaHei', 黑体;">${++content++}</div></div> """

            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++articleImg++}',
                                           article_img).replace(
                                               '${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document and normalize it in place.
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may live in src (or data-src).
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    # Protocol-relative URL: force http.
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            # Persist the raw page body keyed by the md5 of its URL.
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # De-dup hash code for this article.
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip img alt/title text out of the html.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Drop each non-blank alt/title occurrence from content_html.
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = post_user
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 5
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem
Beispiel #6
0
 def getHashCode(self, title, wx_account, source_id):
     """Return the md5 of title + wx_account + source_id (article de-dup key)."""
     combined = title.encode('utf8') + wx_account.encode('utf8') + str(source_id)
     return EncryptUtil.md5(combined)
Beispiel #7
0
    def parseDetail2(self, response):
        """Parse a Sina Tech article detail page into a ``ContentItem``.

        Reads category/title/source_url from ``response.meta``; downloads and
        caches stylesheets; extracts publish time, source, author and tags;
        rebuilds the content html; collects image URLs; saves the raw page
        body; returns the populated item, or ``None`` when no article body
        is found.
        """
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): ban detection is disabled -- `if False` never fires,
        # so the else branch always runs. Confirm intent.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            category = response.meta['category']
            title = response.meta['title']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + category + ':' + source_url)

            selector = Selector(text=body)

            # Collect external stylesheet URLs.
            styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                # md5 of the URL serves as the cache key.
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download the stylesheet and keep it.
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            # Compress, escape quotes/backslashes, and drop overflow rules
            # that would clip the re-rendered content.
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')
            styles = CssUtil.clearUrl(styles)
            styles = styles.replace('overflow-x:hidden', '').replace('overflow:hidden', '')

            # Publish time; normalize Chinese year/month/day markers.
            post_date = selector.xpath('//*[@id="pub_date"]/text() | //*[@class="titer"]/text()').extract_first('')
            post_date = post_date.replace('\r\n', '').strip(' ').replace(u'年', '-').replace(u'月', '-').replace(u'日', ' ')

            try:
                post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(post_date, "%Y-%m-%d %H:%M"))
            except Exception:
                # Keep post_date unchanged when it does not match the format.
                pass

            src_ref = selector.xpath(
                '//*[@id="media_name"]/a[1]/text() | //*[@class="source"]/a/text() | //*[@class="source"]/text()').extract_first(
                '')

            post_user = selector.xpath('//*[@id="author_ename"]/a/text()').extract_first('')

            tags = selector.xpath('//p[@class="art_keywords"]/a/text()').extract() or []
            tags = ','.join(tags)

            content_html = selector.xpath('//*[@id="artibody"][1]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags (header pic, scripts).  2017-07-24 19:23
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath('*[not(boolean(@class="entHdPic")) and not(name(.)="script")]')

            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Extract the plain-text version of the article.
            content_txt = []
            for item in content_items:
                # All text nodes under this element.
                # TODO: later, also extract the headline type here.
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                if u'来源:' in allTxt and len(allTxt) < 25:
                    # Short "来源:" line: this is the real source attribution.
                    if not post_user:
                        # No author found: reuse the previous src_ref as author.
                        post_user = src_ref
                    src_ref = allTxt.replace(u'来源:', '').strip(u' ')
                # Append this element's text as one line.
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)
            # Re-wrap the kept elements in a fresh container tag.
            outHtml = """<div class="content_wrappr_left"><div class="content"><div class="BSHARE_POP blkContainerSblkCon clearfix blkContainerSblkCon_16" id="artibody">${++content++}</div></div></div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document.
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL lives in src here.
                image_url = img.xpath('@src').extract_first()
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })

            # Persist the raw page body keyed by the md5 of its URL.
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # De-dup hash code for this article.
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip img alt/title text out of the html.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Drop each non-blank alt/title occurrence from content_html.
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = category
            contentItem['post_user'] = post_user
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 2
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = '新浪科技'
            contentItem['src_ref'] = src_ref
            return contentItem
Beispiel #8
0
    def parseArticle(self, response):
        """Parse an article detail page (src_source_id 9) into a ``ContentItem``.

        Reads src_channel/sub_channel/title/post_date/source_url from
        ``response.meta``; merges inline page <style> css with downloaded
        stylesheets; extracts keywords and source; rebuilds the ``m-text``
        content html; collects image URLs; saves the raw page body; returns
        the populated item, or ``None`` when no content node is found.
        """
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): ban detection is disabled -- `if False` never fires,
        # so the else branch always runs. Confirm intent.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url)

            selector = Selector(text=body)

            # Inline <style> css present on the page itself.
            pageStyles = selector.xpath('//style/text()').extract()
            # Collect external stylesheet URLs.
            styleUrls = selector.xpath('//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # md5 of the URL serves as the cache key.
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download the stylesheet and keep it.
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            # Inline page css comes first, then the downloaded sheets.
            styleList = pageStyles + styleList
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace('\\', '\\\\')

            # Rewrite/strip URLs referenced inside the styles.
            styles = CssUtil.clearUrl(styles)

            tags = selector.xpath('//meta[@name="keywords"]/@content').extract_first('')

            src_ref = selector.xpath('//*[@class="m-title f-pr"]/h2[@class="f-ff1 f-fwn f-fs14"]/i//text()').extract_first('')

            content_html = selector.xpath('//div[@class="m-text"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags (script/style/iframe).
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath('*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe")]')
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Extract the plain-text version of the article.
            content_txt = []
            for item in content_items:
                # All text nodes under this element.
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append this element's text as one line.
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Re-wrap the kept elements in a fresh container tag.
            outHtml = """<div class="m-text">${++content++}</div>"""
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document and normalize it in place.
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may live in src (or data-src).
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    # Protocol-relative URL: force http.
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(image_url_base, image_url)

            # Persist the raw page body keyed by the md5 of its URL.
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # De-dup hash code for this article.
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip img alt/title text out of the html.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Drop each non-blank alt/title occurrence from content_html.
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = ''
            contentItem['tags'] = tags
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 9
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem
    def parseArticle(self, response):
        """Parse an article detail page (src_source_id 6) into a ``ContentItem``.

        Reads src_channel/sub_channel/title/post_date/source_url from
        ``response.meta``; downloads and caches stylesheets; extracts author,
        source and publish time from the ``tip fl`` block; rebuilds the
        ``art_contextBox`` content html; collects image URLs; saves the raw
        page body; returns the populated item, or ``None`` when no content
        node is found.
        """
        body = EncodeUtil.toUnicode(response.body)
        # NOTE(review): ban detection is disabled -- `if False` never fires,
        # so the else branch always runs. Confirm intent.
        if False:
            self.logDao.info(u'访问过多被禁止')
        else:
            src_channel = response.meta['src_channel']
            sub_channel = response.meta['sub_channel']
            title = response.meta['title']
            post_date = response.meta['post_date']
            source_url = response.meta['source_url']
            self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' +
                             source_url)

            selector = Selector(text=body)

            # Collect external stylesheet URLs.
            styleUrls = selector.xpath(
                '//link[@rel="stylesheet"]/@href').extract()
            styleList = []
            for styleUrl in styleUrls:
                if styleUrl.startswith('//'):
                    styleUrl = 'http:' + styleUrl
                # md5 of the URL serves as the cache key.
                styleUrlHash = EncryptUtil.md5(styleUrl)
                if not self.css.get(styleUrlHash):
                    # Not cached yet: download the stylesheet and keep it.
                    self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
                styleList.append(self.css[styleUrlHash])
            # Compress, then escape quotes/backslashes for later embedding.
            styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
                '\\', '\\\\')

            # Rewrite/strip URLs referenced inside the styles.
            styles = CssUtil.clearUrl(styles)

            # Author line, with whitespace control characters stripped.
            post_user = selector.xpath(
                '//div[@class="tip fl"]/text()').extract_first('').replace(
                    '\r', '').replace('\t', '').replace('\n', '')

            src_ref = selector.xpath(
                '//div[@class="tip fl"]/a/text()').extract_first('')

            post_date = selector.xpath(
                '//div[@class="tip fl"]/span[@class="pr20"]/text()'
            ).extract_first('')
            if post_date:
                try:
                    # Round-trip through strptime to validate/normalize.
                    post_date = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.strptime(post_date, "%Y-%m-%d %H:%M:%S"))
                except Exception as e:
                    # NOTE(review): `e.message` is Python-2-only/deprecated;
                    # str(e) would be the portable spelling.
                    self.logDao.warn(e.message)
                    pass

            content_html = selector.xpath('//div[@class="art_contextBox"]')
            if not len(content_html):
                self.logDao.info(u'不存在内容:' + source_url)
                return
            # Strip unwanted inner tags (script/style/iframe and the
            # right-aligned small-print div).
            # Full example: content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract()
            content_items = content_html.xpath(
                '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe") and not(boolean(name(.)="div" and @style="text-align:right;font-size:12px"))]'
            )
            if not len(content_items):
                self.logDao.info(u'不存在内容:' + source_url)
                return

            # Extract the plain-text version of the article.
            content_txt = []
            for item in content_items:
                # All text nodes under this element.
                allTxt = item.xpath('.//text()').extract()
                allTxt = ''.join(allTxt).replace('\t', '')
                # Append this element's text as one line.
                content_txt.append(allTxt)
            content_txt = '\n'.join(content_txt)

            # Re-wrap the kept elements in a fresh container tag.
            outHtml = """<div class="art_context"><div class="art_contextBox" style="visibility: visible; height:auto;">${++content++}</div></div> """
            content_items = content_items.extract()
            content_items = ''.join(content_items)

            content_html = outHtml.replace('${++content++}', content_items)

            selector = Selector(text=content_html)
            # Parse every image URL in the document and normalize it in place.
            image_urls = []
            imgs = selector.xpath('descendant::img')

            for img in imgs:
                # The image URL may live in src (or data-src).
                image_url_base = img.xpath('@src').extract_first('')
                if image_url_base.startswith('//'):
                    # Protocol-relative URL: force http.
                    image_url = 'http:' + image_url_base
                else:
                    image_url = image_url_base
                if image_url and image_url.startswith('http'):
                    self.logDao.info(u'得到图片:' + image_url)
                    image_urls.append({
                        'url': image_url,
                    })
                    content_html = content_html.replace(
                        image_url_base, image_url)

            # Persist the raw page body keyed by the md5 of its URL.
            urlHash = EncryptUtil.md5(source_url.encode('utf8'))
            self.saveFile(urlHash, body)

            # De-dup hash code for this article.
            hash_code = self.checkDao.getHashCode(source_url)

            # Strip img alt/title text out of the html.
            selector = Selector(text=content_html)
            imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
            # Drop each non-blank alt/title occurrence from content_html.
            for imgAltTitle in imgAltTitles:
                if imgAltTitle.strip(' '):
                    content_html = content_html.replace(imgAltTitle, '')

            contentItem = ContentItem()
            contentItem['content_txt'] = content_txt
            contentItem['image_urls'] = image_urls
            contentItem['title'] = title
            contentItem['source_url'] = source_url
            contentItem['post_date'] = post_date
            contentItem['sub_channel'] = sub_channel
            contentItem['post_user'] = post_user
            contentItem['tags'] = ''
            contentItem['styles'] = styles
            contentItem['content_html'] = content_html
            contentItem['hash_code'] = hash_code
            contentItem['info_type'] = 1
            contentItem['src_source_id'] = 6
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = src_ref
            return contentItem
# Beispiel #10
# 0
    def parseArticle(self, response):
        """Parse a downloaded article page into a ContentItem.

        Inlines and compresses the page's stylesheets, strips
        script/style/iframe tags from the article body, absolutizes image
        URLs, removes the site logo block and img alt/title texts, and
        normalizes the publish date.

        :param response: scrapy Response; ``response.meta`` must carry
            ``src_channel``, ``sub_channel``, ``styleUrlDefault``,
            ``title`` and ``source_url``
        :return: a populated ContentItem, or None when the page has no
            article body
        """
        body = EncodeUtil.toUnicode(response.body)
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        styleUrlDefault = response.meta['styleUrlDefault']
        title = response.meta['title']
        source_url = response.meta['source_url']
        self.logDao.info(u'开始解析文章:' + title + ':' + source_url)

        selector = Selector(text=body)

        # Stylesheets referenced by the page plus the channel defaults.
        styleUrls = selector.xpath(
            '//link[@rel="stylesheet"]/@href').extract()
        styles = self._loadStyles(styleUrls + styleUrlDefault)

        tags = selector.xpath(
            '//meta[@name="keywords"]/@content').extract_first('')

        # The page's own category (when present) extends the sub channel.
        category = selector.xpath(
            '//meta[boolean(contains(@name, "og:category"))]/@content'
        ).extract_first('')
        if category:
            sub_channel = sub_channel + ',' + category

        src_ref = self._extractSrcRef(selector)
        post_date = self._extractPostDate(selector)

        content_html = selector.xpath('//div[@id="main_content"]')
        logoHtml = selector.xpath(
            '//span[@class="ifengLogo"]').extract_first('')

        if not len(content_html):
            self.logDao.info(u'不存在内容:' + source_url)
            return

        # Keep only child nodes that are not script/style/iframe.
        content_items = content_html.xpath(
            '*[not(name(.)="script") and not(name(.)="style")  and not(name(.)="iframe")]'
        )
        if not len(content_items):
            self.logDao.info(u'不存在内容:' + source_url)
            return

        # Plain-text rendition: one line per surviving child element.
        content_txt = []
        for item in content_items:
            allTxt = item.xpath('.//text()').extract()
            content_txt.append(''.join(allTxt).replace('\t', ''))
        content_txt = '\n'.join(content_txt)

        # Re-wrap the surviving children in the site's article container.
        outHtml = """<div id="artical_real" class="js_img_share_area"><div id="main_content" class="js_selection_area" bosszone="content">${++content++}</div></div>"""
        content_html = outHtml.replace(
            '${++content++}', ''.join(content_items.extract()))
        content_html = content_html.replace(logoHtml, '')

        content_html, image_urls = self._absolutizeImageUrls(content_html)

        # Persist the raw page body keyed by the md5 of its URL.
        urlHash = EncryptUtil.md5(source_url.encode('utf8'))
        self.saveFile(urlHash, body)

        hash_code = self.checkDao.getHashCode(source_url)

        content_html = self._stripImgAltTitle(content_html)

        contentItem = ContentItem()
        contentItem['content_txt'] = content_txt
        contentItem['image_urls'] = image_urls
        contentItem['title'] = title
        contentItem['source_url'] = source_url
        contentItem['post_date'] = post_date
        contentItem['sub_channel'] = sub_channel
        contentItem['post_user'] = ''
        contentItem['tags'] = tags
        contentItem['styles'] = styles
        contentItem['content_html'] = content_html
        contentItem['hash_code'] = hash_code
        contentItem['info_type'] = 1
        contentItem['src_source_id'] = 8
        contentItem['src_channel'] = src_channel
        contentItem['src_ref'] = src_ref
        return contentItem

    def _loadStyles(self, styleUrls):
        # Download each stylesheet once (cached in self.css keyed by the
        # URL's md5), then compress and sanitize the combined CSS.
        styleList = []
        for styleUrl in styleUrls:
            if styleUrl.startswith('//'):
                styleUrl = 'http:' + styleUrl
            styleUrlHash = EncryptUtil.md5(styleUrl)
            if not self.css.get(styleUrlHash):
                self.css[styleUrlHash] = CssUtil.downLoad(styleUrl)
            styleList.append(self.css[styleUrlHash])
        styles = CssUtil.compressCss(styleList).replace('\'', '"').replace(
            '\\', '\\\\')
        # Neutralize url() references and the page background color.
        styles = CssUtil.clearUrl(styles)
        return CssUtil.clearBackgroundColor(styles, ['#eaeaea'])

    def _extractSrcRef(self, selector):
        # Source attribution: prefer span.ss03; fall back to #artical_sth.
        src_ref = selector.xpath(
            '//span[@class="ss03"]//text()').extract_first('')
        if not src_ref.replace('\n', '').replace(' ', ''):
            src_ref = selector.xpath(
                '//div[@id="artical_sth"]/p/text()').extract()
            src_ref = ''.join(src_ref).replace('\n', '').replace(
                u'来源:', '').replace(' ', '')
        return src_ref

    def _extractPostDate(self, selector):
        # Read og:time, translate the Chinese date markers, and try the two
        # timestamp formats the site emits; on failure keep the raw string.
        post_date = selector.xpath(
            '//meta[@name="og:time"]/@content').extract_first('')
        post_date = post_date.replace(u'年', '-').replace(
            u'月', '-').replace(u'日', u' ').replace(u'\xa0', u' ')
        lastError = None
        for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"):
            try:
                return time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.strptime(post_date, fmt))
            except Exception as e:
                lastError = e
        # str() instead of the Python-2-only e.message attribute.
        self.logDao.warn(str(lastError))
        return post_date

    def _absolutizeImageUrls(self, content_html):
        # Make protocol-relative img src URLs absolute and collect the
        # absolute URLs for later download.  Returns (html, image_urls).
        selector = Selector(text=content_html)
        image_urls = []
        for img in selector.xpath('descendant::img'):
            # 图片可能放在src 或者data-src
            image_url_base = img.xpath('@src').extract_first('')
            if image_url_base.startswith('//'):
                image_url = 'http:' + image_url_base
            else:
                image_url = image_url_base
            if image_url and image_url.startswith('http'):
                self.logDao.info(u'得到图片:' + image_url)
                image_urls.append({
                    'url': image_url,
                })
                content_html = content_html.replace(
                    image_url_base, image_url)
        return content_html, image_urls

    def _stripImgAltTitle(self, content_html):
        # Remove non-blank img alt/title texts so they cannot leak into
        # the rendered article.
        selector = Selector(text=content_html)
        imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
        for imgAltTitle in imgAltTitles:
            if imgAltTitle.strip(' '):
                content_html = content_html.replace(imgAltTitle, '')
        return content_html