Esempio n. 1
0
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor5Item`` from an article page.

        The publish date is taken from the ``datetime`` attribute of the
        ``.atc-MetaTime`` element.  The page is skipped (``None`` is returned)
        when that attribute is missing or empty, when
        ``helper.compare_time`` returns a negative value for the date against
        ``self.limittime``, or when that comparison raises.

        Args:
            response: Response object exposing ``css()`` and ``url``.

        Returns:
            The populated item, or ``None`` when the page is skipped.
        """
        raw_date = response.css('.atc-MetaTime::attr(datetime)').extract_first()
        if not raw_date:
            return None
        # Keep only the digit groups joined by '-' or ':' from the attribute.
        date_parts = re.findall(r'\d+[-:]\d+[-:]*\d*', raw_date)

        try:
            date_text = helper.list2str(date_parts)
            if helper.compare_time(date_text, self.limittime) < 0:
                return None
        except Exception:
            # An unparsable date means the article cannot be placed relative
            # to the limit, so it is skipped.  KeyboardInterrupt/SystemExit
            # are deliberately not caught here.
            return None

        # First run of eight digits in the URL is used as the article id.
        article_ids = re.findall(r'\d{8}', response.url)

        pipleitem = CctvOpinionmonitor5Item()
        pipleitem['date'] = date_text
        pipleitem['id'] = article_ids[0] if article_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('.atc-HeadlineText::text').extract_first()
        pipleitem['source'] = 'FAZ.NET - Nachrichten'
        pipleitem['content'] = helper.list2str(
            response.css('div[class*="atc-Text"]').xpath('string(.)').extract())
        pipleitem['editor'] = response.css('.atc-MetaAuthorLink::text').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.atc-ImageContainer img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('.atc a::attr(href)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor5Item`` from an article page.

        The publish date is read from the ``article:published_time`` meta
        tag.  The page is skipped (``None`` is returned) when the tag is
        missing or empty, when ``helper.compare_time`` returns a negative
        value for the date against ``self.limittime``, or when that
        comparison raises.

        Args:
            response: Response object exposing ``css()``, ``xpath()`` and
                ``url``.

        Returns:
            The populated item, or ``None`` when the page is skipped.
        """
        raw_date = response.xpath(
            '//meta[@property="article:published_time"]/@content'
        ).extract_first()
        if not raw_date:
            return None
        # Keep only the digit groups joined by '-' or ':' from the meta value.
        date_parts = re.findall(r'\d+[-:]\d+[-:]*\d*', raw_date)

        try:
            date_text = helper.list2str(date_parts)
            if helper.compare_time(date_text, self.limittime) < 0:
                return None
        except Exception:
            # Dates that cannot be compared cause the page to be skipped;
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            return None

        # First run of at least seven digits in the URL is the article id.
        article_ids = re.findall(r'\d{7,}', response.url)
        # NOTE(review): 'source' and 'editor' are both filled from this same
        # selector, exactly as before -- confirm that is intended.
        byline = response.xpath(
            '//span[@class="margin_top_sm ui_bold"]/text()').extract_first()

        pipleitem = CctvOpinionmonitor5Item()
        pipleitem['date'] = date_text
        pipleitem['id'] = article_ids[0] if article_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = byline
        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="article-section margin_bottom_article"])'
            ).extract())
        pipleitem['editor'] = byline
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            input=response.css('article.grid img::attr(src)').extract(),
            prefix='http://www.leparisien.fr')
        pipleitem['video_urls'] = helper.list2str(
            response.xpath('//iframe[@allow="autoplay"]/@src').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor5Item`` from a news detail page.

        The publish date is the text of ``div.fl.times``.  The page is
        skipped (``None`` is returned) when that text is missing or empty,
        when ``helper.compare_time`` returns a negative value for it against
        ``self.limittime``, or when that comparison raises.

        The comment count is fetched with a blocking POST to the comment
        API.  The request reuses ``self.commentpar`` and
        ``self.commentheaders``, updating their ``articleId`` and ``Referer``
        entries in place.  If ``response.meta`` has no ``id``, or the request
        or its JSON payload is unusable, ``comment`` is left as ``None``
        instead of discarding the whole item.

        Args:
            response: Response object exposing ``css()``, ``xpath()``,
                ``url`` and ``meta``.

        Returns:
            The populated item, or ``None`` when the page is skipped.
        """
        date = response.xpath(
            '//div[@class="fl times"]/text()').extract_first()
        if not date:
            return None
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            # Dates that cannot be compared cause the page to be skipped;
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            return None

        article_id = response.meta.get('id')
        raw_source = response.xpath(
            '//div[@class="fl origin"]/text()').extract_first()

        pipleitem = CctvOpinionmonitor5Item()

        pipleitem['date'] = date
        pipleitem['id'] = article_id
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        # The origin node may be absent; re.sub() would raise on None.
        pipleitem['source'] = (
            re.sub('来源:', '', raw_source) if raw_source is not None else None)
        pipleitem['content'] = helper.list2str(
            response.css('.news-detail-cont').xpath('string(.)').extract())
        pipleitem['editor'] = None
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.news-detail-cont img::attr(src)').extract())
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = response.css(
            '#support .num-total::text').extract_first()
        pipleitem['dislike'] = response.css(
            '#against .num-total::text').extract_first()

        comment_count = None
        if article_id is not None:
            self.commentheaders['Referer'] = response.url
            self.commentpar['articleId'] = article_id
            try:
                # A timeout keeps a stalled comment server from blocking the
                # callback indefinitely.
                reply = requests.post(
                    url='https://comment.yorkbbs.ca/api/comment/getComment',
                    data=self.commentpar,
                    headers=self.commentheaders,
                    timeout=10)
                comment_count = json.loads(reply.text)['totalCount']
            except (requests.RequestException, ValueError, KeyError,
                    TypeError):
                # Network failure, non-JSON body, or a payload without
                # 'totalCount': keep the article, leave the count unknown.
                comment_count = None
        pipleitem['comment'] = comment_count
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Esempio n. 4
0
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor5Item`` from an article page.

        The publish date is the ``datePublished`` value of the first
        ``application/ld+json`` script on the page.  The page is skipped
        (``None`` is returned) when the script is missing or empty, is not
        valid JSON, is not a JSON object, or has no ``datePublished`` value;
        also when ``helper.compare_time`` returns a negative value for the
        date against ``self.limittime`` or that comparison raises.

        Args:
            response: Response object exposing ``css()``, ``xpath()`` and
                ``url``.

        Returns:
            The populated item, or ``None`` when the page is skipped.
        """
        jsbd = response.xpath(
            '//script[@type="application/ld+json"]/text()').extract_first()
        if not jsbd:
            return None
        try:
            metadata = json.loads(jsbd)
        except ValueError:
            # json.JSONDecodeError is a ValueError: malformed script content.
            return None
        if not isinstance(metadata, dict):
            # e.g. a top-level JSON array; only a plain object is handled.
            return None
        published = metadata.get('datePublished')
        if not published:
            return None
        # Keep the 'a-b-c' / 'a:b:c' shaped fragments (date and time).
        date_parts = re.findall(r'\d+[-:]\d+[-:]\d+', published)

        try:
            date_text = helper.list2str(date_parts)
            if helper.compare_time(date_text, self.limittime) < 0:
                return None
        except Exception:
            # Dates that cannot be compared cause the page to be skipped;
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            return None

        # Everything after the first 'com/' in the URL is used as the id.
        article_ids = re.findall('com/(.*)', response.url)

        pipleitem = CctvOpinionmonitor5Item()
        pipleitem['date'] = date_text
        pipleitem['id'] = article_ids[0] if article_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = response.css(
            '.article-source::text').extract_first()
        pipleitem['content'] = helper.list2str(
            response.css('.article-body').xpath('string(.)').extract())
        pipleitem['editor'] = response.css(
            '.author-byline span span a::text').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.article-body img::attr(src)').extract())
        # NOTE(review): 'video-container' is matched as an element name here;
        # confirm it was not meant to be the class '.video-container'.
        pipleitem['video_urls'] = helper.list2str(
            response.css(
                '.article-body video-container a::attr(href)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor5Item`` from a story page.

        The publish time is read from the
        ``cXenseParse:cbc-publishedTime`` meta tag and converted with
        ``helper.get_makedtime``.  The page is skipped (``None`` is
        returned) when the tag is missing or empty, when
        ``helper.compare_time`` returns a negative value for the date
        against ``self.limittime``, or when that comparison raises.

        ``id``, ``editor`` and ``views`` come from ``response.meta`` (keys
        ``id``, ``author`` and ``numViewers``) and default to ``None``.

        Args:
            response: Response object exposing ``css()``, ``xpath()``,
                ``url`` and ``meta``.

        Returns:
            The populated item, or ``None`` when the page is skipped.
        """
        timestamp = response.xpath(
            '//meta[@name="cXenseParse:cbc-publishedTime"]/@content'
        ).extract_first()
        if not timestamp:
            return None
        date = helper.get_makedtime(format='%Y-%m-%d %H:%M:%S',
                                    timestamp=timestamp)

        try:
            if helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            # Dates that cannot be compared cause the page to be skipped;
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            return None

        meta = response.meta

        pipleitem = CctvOpinionmonitor5Item()
        # NOTE(review): ``date`` is passed to compare_time() as a single
        # value above but run through list2str() here; kept as-is -- confirm
        # list2str() accepts a non-list argument.
        pipleitem['date'] = helper.list2str(date)
        pipleitem['id'] = meta.get('id', None)
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = response.xpath(
            '//span[@class="detail-link-label sclt-storySectionLink"]/a/text()'
        ).extract_first()
        pipleitem['content'] = helper.list2str(
            response.css('.story').xpath('string(.)').extract())
        pipleitem['editor'] = meta.get('author', None)
        pipleitem['views'] = meta.get('numViewers', None)
        pipleitem['image_urls'] = helper.list2str(
            response.css('.detailBodyContainer img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css(
                '.detailBodyContainer video-container a::attr(href)'
            ).extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Esempio n. 6
0
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor5Item`` from an article page.

        The publish date is the ``datetime`` attribute of
        ``time.LastUpdated``, normalised with ``helper.formatTime``.  The
        page is skipped (``None`` is returned) when the attribute is missing
        or empty, when ``helper.compare_time`` returns a negative value for
        the formatted date against ``self.limittime``, or when formatting or
        comparing raises.

        Args:
            response: Response object exposing ``css()``, ``xpath()`` and
                ``url``.

        Returns:
            The populated item, or ``None`` when the page is skipped.
        """
        raw_date = response.css(
            'time.LastUpdated::attr(datetime)').extract_first()
        if not raw_date:
            return None
        try:
            # Format once and reuse the result for both the check and the item.
            date_text = helper.formatTime(raw_date)
            if helper.compare_time(date_text, self.limittime) < 0:
                return None
        except Exception:
            # Dates that cannot be formatted/compared cause the page to be
            # skipped; KeyboardInterrupt/SystemExit are deliberately not
            # caught.
            return None

        # First run of 12 upper-case letters/digits in the URL is the id.
        article_ids = re.findall(r'[A-Z\d]{12}', response.url)
        source = response.xpath(
            '//div[@class="TagMemberSilver"]/a/text()').extract_first()

        pipleitem = CctvOpinionmonitor5Item()
        pipleitem['date'] = date_text
        pipleitem['id'] = article_ids[0] if article_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        # Fall back to the fixed site name when no source label is present.
        pipleitem['source'] = source if source else '朝日新聞デジタル'
        pipleitem['content'] = helper.list2str(
            response.css('.ArticleText').xpath('string(.)').extract())
        pipleitem['editor'] = response.css(
            '.TagUnderTitle .Sub::text').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.ArticleBody img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('.ArticleBody video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor5Item`` from an article page.

        The publish date is read from the ``article:published_time`` meta
        tag.  The page is skipped (``None`` is returned) when the tag is
        missing or empty, when ``helper.compare_time`` returns a negative
        value for the date against ``self.limittime``, or when that
        comparison raises.

        Video URLs are collected from the ``data-sharebuttons`` attribute of
        the share-button ``div`` elements: the first
        ``https://media.gedidigital.it/...mp4`` link found in each attribute
        value is kept.

        Args:
            response: Response object exposing ``css()``, ``xpath()`` and
                ``url``.

        Returns:
            The populated item, or ``None`` when the page is skipped.
        """
        raw_date = response.xpath(
            '//meta[@property="article:published_time"]/@content'
        ).extract_first()
        if not raw_date:
            return None
        # Keep only the digit groups joined by '-' or ':' from the meta value.
        date_parts = re.findall(r'\d+[-:]\d+[-:]*\d*', raw_date)

        try:
            date_text = helper.list2str(date_parts)
            if helper.compare_time(date_text, self.limittime) < 0:
                return None
        except Exception:
            # Dates that cannot be compared cause the page to be skipped;
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            return None

        # First run of at least eight digits in the URL is the article id.
        article_ids = re.findall(r'\d{8,}', response.url)

        pipleitem = CctvOpinionmonitor5Item()
        pipleitem['date'] = date_text
        pipleitem['id'] = article_ids[0] if article_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = response.xpath(
            '//meta[@property="article:publisher"]/@content').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath('string(//span[@itemprop="articleBody"])').extract())
        pipleitem['editor'] = response.css('.author span::text').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.body-text img::attr(src)').extract())

        # Dots are escaped so they match only a literal '.', not any character.
        mp4_pattern = re.compile(r'https://media\.gedidigital\.it/[^.]+\.mp4')
        share_payloads = response.xpath(
            '//div[@class="share-button gs-social-popup-trigger"]'
            '/@data-sharebuttons').extract()
        video_links = []
        for payload in share_payloads or []:
            found = mp4_pattern.search(payload)
            if found:
                video_links.append(found.group(0))

        pipleitem['video_urls'] = helper.list2str(video_links)
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem