Example #1
0
    def content_parse(self, response):
        """Build a ``YoutubeItem`` from a rendered YouTube watch page.

        The page is skipped (``None`` is returned) when no publish date is
        found, when the ``helper`` date routines raise, or when
        ``helper.compare_time`` returns a negative value against
        ``self.limittime``.

        Args:
            response: page response exposing ``xpath``, ``css`` and ``url``.

        Returns:
            The populated ``YoutubeItem``, or ``None`` when the page is skipped.
        """
        date = response.xpath(
            '//span[@class="date style-scope ytd-video-secondary-info-renderer"]/text()'
        ).extract_first()
        if not date:
            return None
        try:
            published = helper.formatTime(date)
            if helper.compare_time(published, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: a date the helpers cannot handle drops the page.
            return None

        pipleitem = YoutubeItem()

        pipleitem['date'] = published
        # A URL without a "v=" parameter yields no id instead of an IndexError.
        video_id = re.search(r'v=(\S*)', response.url)
        pipleitem['id'] = video_id.group(1) if video_id else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = 'Youtube'

        pipleitem['editor'] = response.xpath(
            '//yt-formatted-string[@id="owner-name"]/a/text()').extract_first()
        pipleitem['content'] = helper.list2str(
            response.css('#description').xpath('string(.)').extract()).strip()

        # extract_first() returns None when the node is absent.
        views = response.xpath(
            '//span[@class="view-count style-scope yt-view-count-renderer"]/text()'
        ).extract_first()
        pipleitem['views'] = re.sub(r'\D', '', views) if views else '0'

        pipleitem['image_urls'] = helper.list2str(
            response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('video::attr(src)').extract())
        pipleitem['share'] = None

        # Defaults guarantee both fields exist even when no label is found.
        pipleitem['like'] = '0'
        pipleitem['dislike'] = '0'
        labels = response.xpath(
            '//yt-formatted-string[@id="text"]/@aria-label').extract()
        for label in labels:
            # "No likes" / "No dislikes" carry no digits; map them to 0.
            label = label.replace('No', '0')
            # 'dislikes' contains 'likes', so it has to be tested first;
            # otherwise the dislike label would overwrite the like count.
            if 'dislikes' in label:
                pipleitem['dislike'] = re.sub(r'\D', '', label)
            elif 'likes' in label:
                pipleitem['like'] = re.sub(r'\D', '', label)

        comment = response.xpath(
            '//h2[@id="count"]/yt-formatted-string/text()').extract_first()
        # The integer 0 fallback is kept from the original implementation.
        pipleitem['comment'] = re.sub(r'\D', '', comment) if comment else 0

        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Assemble a ``CctvOpinionmonitor3Item`` from a detail page.

        Date, id, title, editor and picture values are read from
        ``response.meta``; source, body text and video URLs are extracted
        from the page itself. Nothing is returned when the date is missing or
        empty, when ``helper.compare_time`` raises, or when it returns a
        negative value against ``self.limittime``.
        """
        date = response.meta['date']
        if date is None or len(date) == 0:
            return
        try:
            expired = helper.compare_time(date, self.limittime) < 0
        except:  # kept bare on purpose: any failure skips the page
            return
        if expired:
            return

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = response.meta['id']
        pipleitem['url'] = response.url
        pipleitem['title'] = response.meta['title']

        source_nodes = response.css(
            '#detail_Info_Tab_cout4_1 p:nth-last-child(2)::text')
        pipleitem['source'] = source_nodes.extract_first()
        pipleitem['editor'] = response.meta['editor']

        body_text = response.xpath(
            'string(//div[@id="detail_infotab_cont_1"])').extract()
        pipleitem['content'] = helper.list2str(body_text)
        pipleitem['image_urls'] = response.meta['pic']

        video_paths = response.css('#videoPath::attr(value)').extract()
        pipleitem['video_urls'] = helper.list2str(video_paths)

        # Counters are not collected by this callback.
        for field in ('share', 'like', 'dislike', 'views', 'comment'):
            pipleitem[field] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Turn a diary page into a ``CctvOpinionmonitor3Item``.

        The date is read from the ``h3.subheader time`` text node. Nothing is
        returned when that date is missing or empty, when
        ``helper.compare_time`` raises, or when it returns a negative value
        against ``self.limittime``.
        """
        date = response.css('h3.subheader time::text').extract_first()
        if date is None or len(date) == 0:
            return
        try:
            expired = helper.compare_time(date, self.limittime) < 0
        except:  # kept bare on purpose: any failure skips the page
            return
        if expired:
            return

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(date)

        # The id is the path segment captured between "diary/" and a later "/".
        diary_ids = re.findall('diary/(.*)/', response.url)
        pipleitem['id'] = diary_ids[0] if diary_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = '中日通'
        pipleitem['editor'] = None

        body_text = response.css('.markdown').xpath('string(.)').extract()
        pipleitem['content'] = helper.list2str(body_text)
        image_sources = response.css('.markdown img::attr(src)').extract()
        pipleitem['image_urls'] = helper.list2str(image_sources)
        video_sources = response.css('video::attr(src)').extract()
        pipleitem['video_urls'] = helper.list2str(video_sources)

        pipleitem['share'] = None
        pipleitem['like'] = response.css('.likes_count::text').extract_first()
        for field in ('dislike', 'views', 'comment'):
            pipleitem[field] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor3Item`` from a JSON drama-info reply.

        The reply body is expected to be a JSON object with a ``dramaInfo``
        object inside it. The first run of digits and dashes found in
        ``dramaInfo['updateDesc']`` is used as the date.

        Args:
            response: response exposing ``text`` (JSON), ``url`` and ``meta``.

        Returns:
            The populated item, or ``None`` when ``dramaInfo`` or a usable
            date is missing, when ``helper.compare_time`` raises, or when it
            returns a negative value against ``self.limittime``.

        Raises:
            ValueError: if ``response.text`` is not valid JSON.
        """
        jsonbd = json.loads(response.text)
        info = jsonbd.get('dramaInfo') if isinstance(jsonbd, dict) else None
        if not isinstance(info, dict):
            # Previously crashed with AttributeError on ``None.keys()``.
            return None

        try:
            dates = re.findall(r'[\d-]+', info.get('updateDesc'))
        except TypeError:
            # 'updateDesc' is absent or not text, so there is no date to use.
            return None
        if not dates:
            return None

        try:
            if helper.compare_time(dates[0], self.limittime) < 0:
                return None
        except Exception:
            # Best effort: a date the helper cannot compare drops the page.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(dates[0])
        pipleitem['id'] = info.get('contId')
        pipleitem['url'] = response.url
        pipleitem['title'] = response.meta.get('name')
        pipleitem['source'] = info.get('type')
        pipleitem['editor'] = None
        pipleitem['content'] = info.get('description')
        pipleitem['image_urls'] = info.get('imageURL')
        pipleitem['video_urls'] = info.get('requestURL')
        # Counters are not part of this reply.
        for field in ('share', 'like', 'dislike', 'views', 'comment'):
            pipleitem[field] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Create a ``CctvOpinionmonitor3Item`` from page content plus meta.

        Date, id, source, picture list, view count and comment count come
        from ``response.meta`` (each falling back to ``None``, or to an empty
        list for the pictures, when its key is absent). Title, body text and
        the audio ``src`` are read from the page. Nothing is returned when the
        date is missing or empty, when ``helper.compare_time`` raises, or when
        it returns a negative value against ``self.limittime``.
        """
        date = response.meta.get('newsTime')
        if date is None or len(date) == 0:
            return
        try:
            expired = helper.compare_time(date, self.limittime) < 0
        except:  # kept bare on purpose: any failure skips the page
            return
        if expired:
            return

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = response.meta.get('newsId')
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = response.meta.get('newsResource')
        pipleitem['editor'] = None

        body_text = response.xpath(
            'string(//div[@class="m_details-con"])').extract()
        pipleitem['content'] = helper.list2str(body_text)
        pipleitem['image_urls'] = helper.list2str(
            response.meta.get('picUrlList', []))
        # The audio source is stored in the video_urls field.
        pipleitem['video_urls'] = response.css(
            'audio::attr(src)').extract_first()

        for field in ('share', 'like', 'dislike'):
            pipleitem[field] = None
        pipleitem['views'] = response.meta.get('articleCount')
        pipleitem['comment'] = response.meta.get('commentNum')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def video_parse(self, response):
        """Build a ``CctvOpinionmonitor3Item`` from a CNN video page.

        The publish date comes from the ``pubdate`` meta tag and the author
        from the ``author`` meta tag.

        Args:
            response: page response exposing ``xpath``, ``css``, ``url`` and
                ``text``.

        Returns:
            The populated item, or ``None`` when the date is missing, when the
            ``helper`` date routines raise, or when ``helper.compare_time``
            returns a negative value against ``self.limittime``.
        """
        date = response.xpath(
            '//meta[@name="pubdate"]/@content').extract_first()
        if not date:
            return None
        date = date.strip()
        try:
            if helper.compare_time(helper.formatTime2(date),
                                   self.limittime) < 0:
                return None
        except Exception:
            # Best effort: a date the helpers cannot handle drops the page.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        # NOTE(review): the filter above uses formatTime2 while the stored
        # value uses formatTime, as in the original - confirm this is intended.
        pipleitem['date'] = helper.formatTime(date)
        page_id = re.search(r'com/(.*)/', response.url)
        pipleitem['id'] = page_id.group(1) if page_id else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = 'CNN'
        pipleitem['editor'] = response.xpath(
            '//meta[@name="author"]/@content').extract_first()
        # The original selector started with a bare '#', which is not valid
        # CSS (an id selector needs an identifier after '#') and made the
        # selector parser raise. The attribute selector alone expresses it.
        pipleitem['content'] = helper.list2str(
            response.css('[id~=js-video_description]::text').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            re.findall(r'"videoUrl":\s*"(.*?)"', response.text))
        # Counters are not collected by this callback.
        for field in ('share', 'like', 'dislike', 'views', 'comment'):
            pipleitem[field] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor3Item`` from a JSON article list.

        Only the first element of the JSON array is used. View and comment
        counts are fetched with a second, blocking HTTP request to
        ``self.url`` formatted with the article id. That request is best
        effort: if it fails, the item is still returned with both counts set
        to ``None``.

        Args:
            response: response exposing ``text`` (JSON array) and ``url``.

        Returns:
            The populated item, or ``None`` when the array is empty, the date
            is missing, ``helper.compare_time`` raises, or it returns a
            negative value against ``self.limittime``.

        Raises:
            ValueError: if ``response.text`` is not valid JSON.
        """
        import logging

        jsonbd = json.loads(response.text)
        if not isinstance(jsonbd, list) or not jsonbd:
            return None
        cards = jsonbd[0]
        if not isinstance(cards, dict):
            return None

        date = cards.get('time')
        if not date:
            return None
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: a date the helper cannot compare drops the page.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = cards.get('GlobalID')
        pipleitem['url'] = response.url
        pipleitem['title'] = cards.get('title')
        pipleitem['source'] = cards.get('docfrom')
        # Key spelling 'autor' is kept from the original code.
        pipleitem['editor'] = cards.get('autor')
        pipleitem['content'] = cards.get('summary')
        pipleitem['image_urls'] = cards.get('photo')
        pipleitem['video_urls'] = cards.get('videourl')

        # Sample stats request left by the original author:
        # 'http://apiapp.people.cn/apiv3.3.0/get_article_info.php?globalids=1061648&parents=0,0,0,0,0,0&juxian_liveid=0,0,0,0,0,0&juxian_companyid=0,0,0,0,0,0&deviceid=A000009114F247'
        stats = None
        if pipleitem['id'] is not None:
            try:
                # The timeout keeps a stalled server from blocking the crawl.
                reply = requests.get(url=self.url.format(pipleitem['id']),
                                     timeout=10)
                payload = json.loads(reply.text)
            except (requests.RequestException, ValueError) as err:
                # The counters are optional; keep the item without them.
                logging.getLogger(__name__).warning(
                    'stats request failed for id %s: %s', pipleitem['id'], err)
            else:
                results = payload.get('result') if isinstance(
                    payload, dict) else None
                if isinstance(results, list) and results and isinstance(
                        results[0], dict):
                    stats = results[0]

        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = stats.get('views') if stats else None
        pipleitem['comment'] = stats.get('commentnum') if stats else None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor3Item`` from a CGTN news page.

        When ``response.meta['key']`` equals ``'live'``, the video URLs are
        fetched with a second, blocking HTTP request to the CGTN mobile API.
        That request is best effort: if it fails or the reply has an
        unexpected shape, the item is still returned with no video URLs.

        Args:
            response: page response exposing ``css``, ``xpath``, ``url`` and
                ``meta``.

        Returns:
            The populated item, or ``None`` when the date is missing, when the
            ``helper`` date routines raise, or when ``helper.compare_time``
            returns a negative value against ``self.limittime``.
        """
        import logging

        date = response.css('span.date::text').extract_first()
        if not date:
            return None
        date = date.strip()
        try:
            published = helper.formatTime(date)
            if helper.compare_time(published, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: a date the helpers cannot handle drops the page.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = published
        news_id = re.search(r'news/([a-z\d]*)', response.url)
        pipleitem['id'] = news_id.group(1) if news_id else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = 'CGTN'
        pipleitem['editor'] = response.xpath(
            '//div[@class="news-author news-text"]/text()').extract_first()
        content = helper.list2str(
            response.xpath('string(//div[@id="cmsMainContent"])').extract())
        # When the visible text is 10 characters or fewer, use the node's
        # data-json attribute instead.
        if len(content) <= 10:
            content = helper.list2str(
                response.css('#cmsMainContent::attr(data-json)').extract())
        pipleitem['content'] = content
        pipleitem['image_urls'] = helper.list2str(
            response.css('.cg-padding img::attr(src)').extract())

        video_urls = []
        # .get() avoids a KeyError when the request carried no 'key' meta.
        if response.meta.get('key') == 'live' and pipleitem['id'] is not None:
            try:
                # The timeout keeps a stalled server from blocking the crawl.
                reply = requests.get(
                    'https://mapi.cgtn.com/mobileapp/v2/live/event/info?id={}'.format(pipleitem['id']),
                    timeout=10)
                videos = json.loads(reply.text)['response']['videos']
                video_urls = [video['url'] for video in videos]
            except (requests.RequestException, ValueError, KeyError,
                    TypeError) as err:
                # The video list is optional; keep the item without it.
                logging.getLogger(__name__).warning(
                    'live video request failed for id %s: %s',
                    pipleitem['id'], err)
                video_urls = []

        pipleitem['video_urls'] = helper.list2str(video_urls)
        # Counters are not collected by this callback.
        for field in ('share', 'like', 'dislike', 'views', 'comment'):
            pipleitem[field] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Example #9
0
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor3Item`` from a card-group JSON reply.

        Only the first card of the first entry in ``cardgroups`` is used.

        Args:
            response: response exposing ``text`` (JSON object), ``url`` and
                ``meta``.

        Returns:
            The populated item, or ``None`` when there is no card or no date,
            when the ``helper`` date routines raise, or when
            ``helper.compare_time`` returns a negative value against
            ``self.limittime``.

        Raises:
            ValueError: if ``response.text`` is not valid JSON.
        """
        jsonbd = json.loads(response.text)
        groups = jsonbd.get('cardgroups') if isinstance(jsonbd, dict) else None
        if not isinstance(groups, list) or not groups:
            return None

        # A group without a 'cards' list used to raise KeyError.
        first_group = groups[0]
        group_cards = first_group.get('cards') if isinstance(
            first_group, dict) else None
        cards = group_cards[0] if isinstance(group_cards,
                                             list) and group_cards else {}
        if not isinstance(cards, dict):
            return None

        raw_date = cards.get('date')
        if not raw_date:
            return None
        try:
            # formatTime now runs inside the try so a bad date skips the page
            # instead of raising out of the callback.
            date = helper.formatTime(raw_date)
            if not date or helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = date
        pipleitem['id'] = response.meta.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = cards.get('title')
        pipleitem['source'] = cards.get('source')
        pipleitem['editor'] = None
        # Keep only the text found between a '>' and the next '<'.
        pipleitem['content'] = helper.list2str(
            re.findall('>(.*?)<', cards.get('content') or ''))
        pipleitem['image_urls'] = helper.list2str(cards.get('photoList') or [])
        video = cards.get('video')
        pipleitem['video_urls'] = video.get('url') if isinstance(
            video, dict) else None
        # Counters are not part of this reply.
        for field in ('share', 'like', 'dislike', 'views', 'comment'):
            pipleitem[field] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem