コード例 #1
0
    def content_parse3(self, response):
        """Yield one item per entry of the ``data`` array in a JSON response.

        Args:
            response: Response whose ``text`` is a JSON document; its ``url``
                is stored on every item.

        Yields:
            CctvOpinionmonitor2Item: One item for each ``data`` entry. Nothing
            is yielded when the document is empty or has no ``data`` entries.
        """
        jsonbd = json.loads(response.text)
        if not jsonbd:
            return

        # Yield instead of returning from inside the loop so that every entry
        # is emitted rather than only the first one.
        # NOTE(review): assumes this is used as a Scrapy callback (which
        # accepts generators) -- confirm no caller expects a single item.
        for item in jsonbd.get('data') or []:
            pipleitem = CctvOpinionmonitor2Item()
            page_pic = item.get('page_pic')

            # Entries without a period number fall back to a fixed date.
            pipleitem['date'] = (helper.formatTime(item['period_num'])
                                 if 'period_num' in item else '2019-01-01')
            pipleitem['id'] = item.get('page_num')
            pipleitem['url'] = response.url
            pipleitem['title'] = item.get('page_name')
            pipleitem['source'] = '人民日报布版画'
            pipleitem['editor'] = None
            # The page picture is stored as both the content and the image URL.
            pipleitem['content'] = page_pic
            pipleitem['image_urls'] = page_pic
            pipleitem['video_urls'] = None
            pipleitem['share'] = None
            pipleitem['like'] = None
            pipleitem['dislike'] = None
            pipleitem['views'] = None
            pipleitem['comment'] = None
            pipleitem['crawl_time'] = helper.get_localtimestamp()

            yield pipleitem
コード例 #2
0
    def content_parse(self, response):
        """Build an article item from an HTML page.

        The publish time is read from the element with id ``pubtime_baidu``.

        Args:
            response: Response exposing ``xpath``, ``css`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when the publish time is missing,
            when ``helper.compare_time`` fails on it, or when it compares
            below ``self.limittime``.
        """
        from urllib.parse import urljoin

        date = response.xpath('//*[@id="pubtime_baidu"]/text()').extract_first()
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the page.
            return

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = response.xpath('//meta[@name="contentid"]/@content').extract_first()
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath('//title/text()').extract_first()
        pipleitem['source'] = response.xpath('//meta[@name="source"]/@content').extract_first()
        pipleitem['editor'] = response.xpath('//meta[@name="author"]/@content').extract_first()
        pipleitem['content'] = helper.list2str(response.xpath('string(//div[contains(@id,"zw")])').extract())

        # Resolve each src against the site root, e.g.
        # 'http://news.eastday.com/images/thumbnailimg/month_1906/9979ded068194f2299bc158df90deb61.png'.
        # urljoin leaves already-absolute URLs intact and inserts the slash
        # for paths that lack a leading one.
        image_urls = [urljoin('http://news.eastday.com', src)
                      for src in response.css('#zw img::attr(src)').extract()]

        pipleitem['image_urls'] = helper.list2str(image_urls)
        pipleitem['video_urls'] = helper.list2str(response.css('#zw source::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #3
0
    def content_parse(self, response):
        """Build an article item from a JSON detail response.

        Args:
            response: Response whose ``text`` is a JSON object describing one
                article. ``response.meta`` may carry ``readCount`` and
                ``commentNum``.

        Returns:
            CctvOpinionmonitor2Item, or None when the article has no ``time``
            value, when ``helper.compare_time`` fails on it, or when it
            compares below ``self.limittime``.
        """
        jsonbd = json.loads(response.text)
        date = jsonbd.get('time')
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the article.
            return

        pipleitem = CctvOpinionmonitor2Item()

        content = jsonbd.get('content')
        # Optional sections are treated as empty so that one missing key
        # does not discard the whole article.
        media = jsonbd.get('media') or {}
        photos = jsonbd.get('photos') or []

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = jsonbd.get('newsId')
        pipleitem['url'] = response.url
        pipleitem['title'] = jsonbd.get('title')
        pipleitem['source'] = media.get('mediaName')
        pipleitem['editor'] = None
        # Keep only the text found between markup tags.
        pipleitem['content'] = (helper.list2str(re.findall('>(.*?)<', content))
                                if content is not None else None)

        pipleitem['image_urls'] = helper.list2str(
            [photo['pic'] for photo in photos if photo.get('pic')])
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = response.meta.get('readCount')
        pipleitem['comment'] = response.meta.get('commentNum')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #4
0
    def content_parse(self, response):
        """Build an article item from an HTML page with a ``span.date`` element.

        Args:
            response: Response exposing ``xpath``, ``css`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when the date is missing or
            compares below ``self.limittime``.
        """
        date = response.xpath('//span[@class="date"]/text()').extract_first()
        if not date:
            return
        if helper.compare_time(date, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitor2Item()

        # search() rather than findall()[0]: a URL without a 'doc-<id>.' part
        # yields a None id instead of raising IndexError.
        doc_id = re.search(r'doc-(.*)\.', response.url)

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = doc_id.group(1) if doc_id else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath('//title/text()').extract_first()
        pipleitem['source'] = response.css('.source::text').extract_first()
        pipleitem['editor'] = None
        pipleitem['content'] = helper.list2str(
            response.xpath('string(//div[@id="article"])').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('.article img::attr(src)').extract())
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #5
0
    def content_parse(self, response):
        """Build a minimal item (date, url, title) from any page containing a date.

        The first date-looking substring of the page text (for example
        ``2019-06-01`` or ``2019年6月1日``) is used as the publish date.

        Args:
            response: Response exposing ``text``, ``xpath`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when no date is found, when
            ``helper.compare_time`` fails on it, or when it compares below
            ``self.limittime``.
        """
        datelist = re.findall(r'\d{4}[-年.]+\d+[-月.]+\d+[日]*', response.text)
        if not datelist:
            return
        date = datelist[0]
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the page.
            return

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath('//title/text()').extract_first()
        # Nothing else is extracted for this page type.
        for field in ('source', 'editor', 'content', 'image_urls',
                      'video_urls', 'share', 'like', 'dislike', 'views',
                      'comment'):
            pipleitem[field] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #6
0
    def content_parse(self, response):
        """Build an article item, preferring values passed in ``response.meta``.

        ``date``, ``id`` and ``source`` are taken from ``response.meta`` when
        present; otherwise they are derived from the page or its URL.

        Args:
            response: Response exposing ``meta``, ``css``, ``xpath`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when no date is available, when
            ``helper.compare_time`` fails on it, or when it compares below
            ``self.limittime``.
        """
        meta = response.meta

        if 'date' in meta:
            date = meta['date']
        else:
            date = response.css('.article-sub span:last-child::text').extract_first()
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the page.
            return

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        if 'id' in meta:
            pipleitem['id'] = meta['id']
        else:
            # Use an alternation group for the domain suffix; a character
            # class such as '[com|cn]' would match any single one of those
            # characters before the slash. No match yields a None id.
            url_path = re.search(r'(?:com|cn)/(.*)', response.url)
            pipleitem['id'] = url_path.group(1) if url_path else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath('//title/text()').extract_first()
        if 'source' in meta:
            pipleitem['source'] = meta['source']
        else:
            pipleitem['source'] = response.css(
                '.article-sub span:first-child::text').extract_first()
        pipleitem['editor'] = None
        pipleitem['content'] = helper.list2str(response.xpath('string(//body)').extract())
        pipleitem['image_urls'] = helper.list2str(response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(response.css('video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = response.css('.share-count span::text').extract_first()
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #7
0
    def content_parse4(self, response):
        """Yield one item per entry of the ``data`` array in a JSON response.

        Args:
            response: Response whose ``text`` is a JSON object with a ``data``
                list; its ``url`` is stored on every item.

        Yields:
            CctvOpinionmonitor2Item: One item for each ``data`` entry. Nothing
            is yielded when ``data`` is missing or empty.
        """
        jsonbd = json.loads(response.text)

        # Yield instead of returning from inside the loop so that every entry
        # is emitted rather than only the first one.
        # NOTE(review): assumes this is used as a Scrapy callback (which
        # accepts generators) -- confirm no caller expects a single item.
        for item in jsonbd.get('data') or []:
            pipleitem = CctvOpinionmonitor2Item()

            # Entries without a news time fall back to a fixed date.
            pipleitem['date'] = (helper.formatTime(item['news_time'])
                                 if 'news_time' in item else '2019-01-01')
            pipleitem['id'] = item.get('id')
            pipleitem['url'] = response.url
            pipleitem['title'] = item.get('page_name')
            pipleitem['source'] = item.get('copyfrom')
            pipleitem['editor'] = None
            pipleitem['content'] = item.get('content')

            # Entries without an image list simply produce no image URLs.
            pipleitem['image_urls'] = helper.list2str(
                [image['url'] for image in item.get('image') or []
                 if image.get('url')])
            pipleitem['video_urls'] = item.get('video_url')
            pipleitem['share'] = item.get('share_count')
            pipleitem['like'] = item.get('likes_count')
            pipleitem['dislike'] = None
            pipleitem['views'] = item.get('read_count')
            pipleitem['comment'] = item.get('comment_count')
            pipleitem['crawl_time'] = helper.get_localtimestamp()

            yield pipleitem
コード例 #8
0
    def content_parse2(self, response):
        """Build an item from a JSON response holding a ``data.question`` object.

        Args:
            response: Response whose ``text`` is a JSON object; the question
                date, author and content are read from ``data['question']``.

        Returns:
            CctvOpinionmonitor2Item, or None when the document is empty, the
            question has no date, ``helper.compare_time`` fails on it, or it
            compares below ``self.limittime``.
        """
        jsonbd = json.loads(response.text)
        if not jsonbd:
            return
        # Missing sections are treated as empty so the date check below
        # rejects the response instead of raising KeyError.
        data = jsonbd.get('data') or {}
        question = data.get('question') or {}

        date = question.get('date')
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the entry.
            return

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = data.get('question_id')
        pipleitem['url'] = response.url
        pipleitem['title'] = data.get('title')
        pipleitem['source'] = '人民日报APP'
        pipleitem['editor'] = question.get('user_name')
        pipleitem['content'] = question.get('content')

        pipleitem['image_urls'] = None
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = data.get('like_num')
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = data.get('comment_num')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build an article item from an HTML page with a ``#newsdate`` field.

        Args:
            response: Response exposing ``css``, ``xpath`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when the date is missing or
            compares below ``self.limittime``.
        """
        date = response.css('#newsdate::attr(value)').extract_first()
        if not date:
            return
        if helper.compare_time(date, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        # The id is the file name following an 'NN-NN/' path segment; a URL
        # of another shape yields None instead of raising IndexError.
        id_match = re.search(r'\d{2}-\d{2}/(.*)\.', response.url)
        pipleitem['id'] = id_match.group(1) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath('//title/text()').extract_first()

        # string(...) evaluates to '' when the node is absent, so the label
        # may legitimately be missing from the text; fall back to None.
        source = response.xpath('string(//div[@class="left-t"])').extract_first()
        if source is not None:
            source_match = re.search(r'来源:(\S*)', source)
            source = source_match.group(1) if source_match else None
        pipleitem['source'] = source

        # Strip the '【编辑:…】' wrapper when present; otherwise keep raw text.
        editor = response.css('.left_name .left_name::text').extract_first()
        if editor is not None:
            editor_match = re.search('【编辑:(.*)】', editor)
            if editor_match:
                editor = editor_match.group(1)
        pipleitem['editor'] = editor

        pipleitem['content'] = helper.list2str(
            response.xpath('string(//div[@class="left_zw"])').extract()
        ).replace('\u3000', '')  # drop ideographic (full-width) spaces
        pipleitem['image_urls'] = helper.list2str(response.css('.left_zw img::attr(src)').extract())
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #10
0
    def content_parse(self, response):
        """Build an item from a news page or, when ``meta['kw']`` is '视频', a video page.

        Video pages use a different set of selectors for the date, source,
        editor, content, like count and comment count.

        Args:
            response: Response exposing ``meta``, ``css``, ``xpath`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when no usable date is found or
            the date compares below ``self.limittime``.
        """
        is_video = response.meta.get('kw') == '视频'

        date_selector = ('.video_info_left span:first-child::text' if is_video
                         else '.news_about p:nth-child(2)::text')
        date = response.css(date_selector).extract_first()
        if not date:
            return
        # Keep only the 'digits-and-dashes [time]' part of the text; text
        # without such a part is treated as having no date.
        date_match = re.search(r'[\d-]+\s*[\d:]*', date)
        if date_match is None:
            return
        date = date_match.group(0)
        if helper.compare_time(date, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        id_match = re.search(r'forward_(.*)', response.url)
        pipleitem['id'] = id_match.group(1) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath('//meta[@name="Description"]/@content').extract_first()

        source = response.css('.news_about span:first-child::text').extract_first()
        if is_video:
            # The first selector that yields more than one character wins.
            for pt in ['.oriBox::text', '.video_info_second span:first-child::text']:
                tmp = response.css(pt).extract_first()
                if tmp is not None and len(tmp) > 1:
                    source = tmp
                    break
        # The source text may be missing altogether; only search real text.
        source_match = re.search(r'来源[\s:]*(.*)', source) if source else None
        pipleitem['source'] = source_match.group(1) if source_match else None

        if is_video:
            editor = response.css(
                '.video_info_second span:last-child::text').extract_first()
        else:
            editor = response.css(
                '.news_infor_extra .infor_item:first-child::text').extract_first()
        if editor is not None:
            editor = editor.replace('责任编辑:', '')
        pipleitem['editor'] = editor

        if is_video:
            content = response.xpath('string(//div[@class="video_txt_l"])').extract_first()
        else:
            content = helper.list2str(response.xpath('string(//div[@class="news_txt"])').extract())
        pipleitem['content'] = content

        pipleitem['image_urls'] = helper.list2str(response.css('.news_txt img::attr(src)').extract())
        pipleitem['video_urls'] = None
        pipleitem['share'] = None

        if is_video:
            like = response.css('.zanBox a::text').extract_first()
        else:
            like = response.css('[class~=news_love] .nbgbox:first-child a::text').extract_first()
            if like is None:
                like = response.xpath('//div[@class="news_love detail_gov"]/div[1]/a/text()').extract_first()

        # Leading whitespace is skipped and the digits kept; a page without a
        # like counter stores None instead of raising TypeError.
        pipleitem['like'] = (re.findall(r'[\n\s]*(\d*)', like)[0]
                             if like is not None else None)
        pipleitem['dislike'] = None
        pipleitem['views'] = None

        # Only video pages expose a comment count.
        pipleitem['comment'] = (response.css('.reply::text').extract_first()
                                if is_video else None)
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #11
0
    def content_parse(self, response):
        """Build an item from a JSON detail response plus list-page metadata.

        Date, id, title, source and the like/read/comment counters come from
        ``response.meta``; editor, text and media URLs come from the JSON body.

        Args:
            response: Response exposing ``meta``, ``text`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when ``meta`` has no date, when
            ``helper.compare_time`` fails on it, when it compares below
            ``self.limittime``, or when the JSON document is empty.
        """
        meta = response.meta

        date = meta.get('date')
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the entry.
            return

        jsonbd = json.loads(response.text)
        if not jsonbd:
            return

        # A document without a 'content' section still yields an item built
        # from the metadata rather than raising KeyError.
        body = jsonbd.get('content') or {}

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = meta.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = meta.get('title')
        pipleitem['source'] = meta.get('source')
        pipleitem['editor'] = body.get('cms_editor')
        # Keep only the text found between markup tags.
        pipleitem['content'] = (
            helper.list2str(re.findall('>(.*?)<', body['text']))
            if 'text' in body else None)

        # Attribute keys containing 'VIDEO' / 'IMG' identify the media kind.
        imglist = []
        videolist = []
        for name, entry in (jsonbd.get('attribute') or {}).items():
            if 'VIDEO' in name:
                videolist.append(entry['playurl'])
            if 'IMG' in name:
                imglist.append(entry['url'])

        pipleitem['image_urls'] = helper.list2str(imglist)
        pipleitem['video_urls'] = helper.list2str(videolist)
        pipleitem['share'] = None
        pipleitem['like'] = meta.get('likes_count')
        pipleitem['dislike'] = None
        pipleitem['views'] = meta.get('read_count')
        pipleitem['comment'] = meta.get('comment_count')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #12
0
    def content_parse(self, response):
        """Build an item from a forum post page.

        The date, view count and reply count are read from the second, third
        and fourth ``span`` of ``#post_head .atl-info``, each behind a label.

        Args:
            response: Response exposing ``css``, ``xpath`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when no labelled date is found,
            when ``helper.compare_time`` fails on it, or when it compares
            below ``self.limittime``.
        """
        date = response.css(
            '#post_head .atl-info span:nth-child(2)::text').extract_first()
        if not date:
            return
        date_match = re.search(r'时间:(\S*)', date)
        if date_match is None:
            return
        date = date_match.group(1)

        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the post.
            return

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        id_match = re.search(r'post-([a-z\d-]*).', response.url)
        pipleitem['id'] = id_match.group(1) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = '天涯社区'
        pipleitem['editor'] = response.xpath(
            '//meta[@name="author"]/@content').extract_first()

        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="bbs-content clearfix"])').extract())

        pipleitem['image_urls'] = helper.list2str(
            response.xpath(
                '//div[@class="bbs-content clearfix"]/img/@src').extract())
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = response.css(
            '.shang_zan::attr(data-number)').extract_first()
        pipleitem['dislike'] = None

        # Only search text that is actually present: a missing span gives
        # None, and a span without the label gives no match. Both store None.
        views = response.css(
            '#post_head .atl-info span:nth-child(3)::text').extract_first()
        views_match = re.search('点击:(.*)', views) if views else None
        pipleitem['views'] = views_match.group(1) if views_match else None

        comment = response.css(
            '#post_head .atl-info span:nth-child(4)::text').extract_first()
        comment_match = re.search('回复:(.*)', comment) if comment else None
        pipleitem['comment'] = (comment_match.group(1)
                                if comment_match else None)
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #13
0
    def content_parse(self, response):
        """Build an article item and fetch its like count over HTTP.

        Metadata is read from ``<meta>`` tags; the like count is requested
        from ``https://front-web.rednet.cn/content/star/<id>``.

        Args:
            response: Response exposing ``xpath``, ``css``, ``text`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when the publish date is missing,
            when ``helper.compare_time`` fails on it, or when it compares
            below ``self.limittime``.
        """
        date = response.xpath(
            '//meta[@name="publishdate"]/@content').extract_first()
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the page.
            return

        pipleitem = CctvOpinionmonitor2Item()

        content_id = response.xpath(
            '//meta[@name="contentid"]/@content').extract_first()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = content_id
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath('//title/text()').extract_first()
        pipleitem['source'] = response.xpath(
            '//meta[@name="source"]/@content').extract_first()

        # Fall back to an '编辑:' label in the page text when the author meta
        # tag is missing or empty; no label leaves the editor as None.
        editor = response.xpath(
            '//meta[@name="author"]/@content').extract_first()
        if not editor:
            editor_match = re.search(r'编辑:(\S*)', response.text)
            editor = editor_match.group(1) if editor_match else None
        pipleitem['editor'] = editor

        # Very short text from the primary container is treated as a miss and
        # the alternative container is tried instead.
        content = helper.list2str(
            response.xpath(
                'string(//section[contains(@class,"detail_article_content")])'
            ).extract())
        if content is None or len(content) < 10:
            content = helper.list2str(
                response.xpath('string(//*[@id="articlecontent"])').extract())
        pipleitem['content'] = content

        pipleitem['image_urls'] = helper.list2str(
            response.css(
                '[class~=detail_article_content]\t img::attr(src)').extract())
        pipleitem['video_urls'] = None
        pipleitem['share'] = None

        # The like count needs the content id; skip the request when the page
        # has none. The timeout keeps this blocking call from hanging
        # indefinitely, and a failed request leaves the count as None.
        like = None
        if content_id:
            try:
                like = requests.get(
                    'https://front-web.rednet.cn/content/star/' + content_id,
                    timeout=10).text
            except requests.RequestException:
                like = None
        pipleitem['like'] = like
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #14
0
    def content2_parse(self, response):
        """Build an item from an answer page carrying embedded JSON state.

        The creation date comes from a ``<meta itemprop="dateCreated">`` tag
        and the answer data from the JSON text of ``#js-initialData``.

        Args:
            response: Response exposing ``xpath``, ``css`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when the date or the embedded
            JSON is missing, when the JSON holds no answers, when
            ``helper.compare_time`` fails on the date, or when the date
            compares below ``self.limittime``.
        """
        date = response.xpath(
            '//meta[@itemprop="dateCreated"]/@content').extract_first()
        if not date:
            return
        # Keep only the YYYY-M-D part of the timestamp.
        date_match = re.search(r'\d{4}-\d+-\d+', date)
        if date_match is None:
            return
        date = date_match.group(0)
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the page.
            return

        extor = response.css('#js-initialData::text').extract_first()
        if not extor:
            return
        jsonbd = json.loads(extor)

        answers = jsonbd['initialState']['entities']['answers']
        if not answers:
            return
        # ``answers`` maps answer ids to answer objects; use the first one.
        # NOTE(review): assumes the page describes a single answer -- confirm
        # which entry is wanted if several can be present.
        questions_body = next(iter(answers.values()))

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = questions_body['id']
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = '知乎'
        pipleitem['editor'] = questions_body['author']['name']
        # Keep only the text found between markup tags of the answer body.
        pipleitem['content'] = helper.list2str(
            re.findall('>(.*?)<', questions_body['content']))
        pipleitem['image_urls'] = helper.list2str(
            response.css('.RichContent-inner img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('.RichContent-inner video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = questions_body['voteupCount']
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = questions_body['commentCount']
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #15
0
    def shiping_content_parse(self, response):
        """Build an item from a page whose title line holds the date and source.

        The text of ``.content_title .left p`` is searched for a
        ``YYYY年MM月DD日 [time]`` date and a ``来源:`` label.

        Args:
            response: Response exposing ``css``, ``xpath`` and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when the title line or its date
            is missing, when ``helper.compare_time`` fails on the date, or
            when the date compares below ``self.limittime``.
        """
        titleleft = response.css(
            '.content_title .left p::text').extract_first()
        # Without the title line there is nothing to search for a date.
        if not titleleft:
            return
        date_match = re.search(r'\d{4}年\d{2}月\d{2}日\s*[\d:]*', titleleft)
        if date_match is None:
            return
        date = date_match.group(0)
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the page.
            return

        pipleitem = CctvOpinionmonitor2Item()

        pipleitem['date'] = helper.formatTime(date)
        id_match = re.search(r'\d{2}-\d{2}/(.*)\.', response.url)
        pipleitem['id'] = id_match.group(1) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath('//title/text()').extract_first()

        # Store None, not an empty list, when the source label is absent.
        source_match = re.search('来源:(.*)', titleleft)
        pipleitem['source'] = source_match.group(1) if source_match else None

        # Strip the '责任编辑:【…】' wrapper when present; otherwise keep raw text.
        editor = response.css('.content_desc span::text').extract_first()
        if editor is not None:
            editor_match = re.search('责任编辑:【(.*)】', editor)
            if editor_match:
                editor = editor_match.group(1)
        pipleitem['editor'] = editor

        pipleitem['content'] = helper.list2str(
            response.xpath('string(//div[@class="content_desc"])').extract())
        pipleitem['image_urls'] = None
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #16
0
    def content_parse(self, response):
        """Build an article item from a JSON response with a ``data`` object.

        Args:
            response: Response whose ``text`` is a JSON object.
                ``response.meta`` may carry ``id`` and ``share_count``.

        Returns:
            CctvOpinionmonitor2Item, or None when the document is empty, has
            no ``news_datetime``, when ``helper.compare_time`` fails on it, or
            when it compares below ``self.limittime``.
        """
        jsonbd = json.loads(response.text)
        if not jsonbd:
            return
        # A missing 'data' section is treated as empty so the date check
        # below rejects the response instead of raising KeyError.
        data = jsonbd.get('data') or {}

        date = data.get('news_datetime')
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the article.
            return

        pipleitem = CctvOpinionmonitor2Item()
        content = data.get('contents') or ''
        images = data.get('image') or []

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = response.meta.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = data.get('title')
        pipleitem['source'] = data.get('copyfrom')
        pipleitem['editor'] = data.get('admin_name')
        # Keep only the text found between markup tags.
        pipleitem['content'] = helper.list2str(re.findall('>(.*?)<', content))

        pipleitem['image_urls'] = helper.list2str(
            [image['url'] for image in images if image.get('url')])
        pipleitem['video_urls'] = None
        pipleitem['share'] = response.meta.get('share_count')
        pipleitem['like'] = data.get('likes_count')
        pipleitem['dislike'] = None
        pipleitem['views'] = data.get('read_count')
        pipleitem['comment'] = data.get('comment_count')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
コード例 #17
0
    def content_parse(self, response):
        """Build an article item, taking date, id, editor and likes from raw text.

        The first ``YYYY-M-D [time]`` string in the page text is used as the
        publish date; the content id, editor and like count are likewise
        pulled from the raw text with regular expressions.

        Args:
            response: Response exposing ``text``, ``css``, ``xpath``, ``meta``
                and ``url``.

        Returns:
            CctvOpinionmonitor2Item, or None when no date is found, when
            ``helper.compare_time`` fails on it, or when it compares below
            ``self.limittime``.
        """
        dates = re.findall(r'(\d{4}-\d+-\d+[\s\d:]+)', response.text)
        if not dates:
            return
        date = dates[0]
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            # Best effort: a date that cannot be compared skips the page.
            return
        pipleitem = CctvOpinionmonitor2Item()

        content_ids = re.findall(r'"contId":"(\d+)"', response.text)
        editors = re.findall('责任编辑:(.*?)<', response.text)
        likes = re.findall(r'<em></em>\s*?(\d*)</a>', response.text)

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = content_ids[0] if content_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = response.css(
            '.gg-gmcont a::text').extract_first()
        pipleitem['editor'] = editors[0] if editors else None
        pipleitem['content'] = helper.list2str(
            response.xpath('string(//div[@class="news_content"])').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('.news_content img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('.news_content video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = likes[0] if likes else None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        # The comment count is 0 unless one was passed in ``meta``.
        pipleitem['comment'] = response.meta.get('commentNum', 0)
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem