def content_parse(self, response):
    # The Washington Post article page
    date = re.findall(r'\d{4}-\d+-\d+',
                      response.css('.author-timestamp::attr(content)').extract_first())
    if len(date) > 0:
        if helper.compare_time(date[0], self.limittime) < 0:
            return
    else:
        return
    pipleitem = CctvOpinionmonitorItem()
    pipleitem['date'] = date[0]
    pipleitem['id'] = re.findall(r'\d{2}/(\S*)', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
    pipleitem['source'] = 'The Washington Post'
    pipleitem['editor'] = response.css('.author-byline .author-name::text').extract_first()
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@id="article-body"])').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.css('#article-body img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(
        response.xpath('//div[@class="news-content info-content"]').css('video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
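# --- assumed shared utilities (sketch) ---
# Every content_parse() in these spiders calls a project helper module and the
# CctvOpinionmonitorItem item class, both imported at module level in the original
# files (roughly: import re / from ...items import CctvOpinionmonitorItem / import helper).
# The functions below only sketch the behaviour assumed from how they are called;
# the project's real helper module may differ.
import time
from datetime import datetime


def compare_time(date_a, date_b):
    # assumed: returns < 0 when date_a is earlier than date_b
    # (both 'YYYY-MM-DD' strings, optionally followed by a time part)
    a = datetime.strptime(date_a.split()[0], '%Y-%m-%d')
    b = datetime.strptime(date_b.split()[0], '%Y-%m-%d')
    return (a - b).days


def list2str(fragments):
    # assumed: join the extracted text fragments into one trimmed string
    return ''.join(fragments).strip() if fragments else None


def get_localtimestamp():
    # assumed: current local time as an integer Unix timestamp
    return int(time.time())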
def content_parse(self, response): date = re.findall( '\d{4}-\d+-\d+', response.xpath( '//meta[@name="analyticsAttributes.articleDate"]/@content'). extract_first()) if len(date) > 0: if helper.compare_time(date[0], self.limittime) < 0: return else: return pipleitem = CctvOpinionmonitorItem() pipleitem['date'] = date[0] pipleitem['id'] = re.findall('id\S*', response.url)[0] pipleitem['url'] = response.url pipleitem['title'] = response.xpath( '//h1[@class="ArticleHeader_headline"]/text()').extract_first() pipleitem['source'] = 'Reuters' # pipleitem['editor'] = response.css('.BylineBar_byline::text').extract_first() pipleitem['editor'] = response.xpath( '//meta[@name="Author"]/@content').extract_first() pipleitem['content'] = helper.list2str( response.xpath( 'string(//div[@class="StandardArticleBody_body"])').extract()) pipleitem['image_urls'] = helper.list2str( response.css('.StandardArticleBody_body img::attr(src)').extract()) pipleitem['video_urls'] = None pipleitem['share'] = None pipleitem['like'] = None pipleitem['dislike'] = None pipleitem['comment'] = None pipleitem['crawl_time'] = helper.get_localtimestamp() return pipleitem
def content_parse(self, response):
    date = response.meta['date']
    if len(date) == 0:
        return
    if helper.compare_time(date, self.limittime) < 0:
        return
    pipleitem = CctvOpinionmonitorItem()
    pipleitem['date'] = date
    pipleitem['id'] = re.findall(r'doc-([a-z\d]*).', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//h1[@id="artibodyTitle"]/text()').extract_first()
    pipleitem['source'] = response.css('p.info *:nth-last-child(1)::text').extract_first()
    pipleitem['editor'] = None
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@id="artibody"])').extract()).replace(u'\u3000', u'')
    pipleitem['image_urls'] = helper.list2str(
        response.css('#artibody img::attr(src)').extract())
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response): date = re.findall( '\d{4}-\d+-\d+', response.xpath( '//meta[@name="article.published"]/@content').extract_first()) if len(date) > 0: if helper.compare_time(date[0], self.limittime) < 0: return else: return pipleitem = CctvOpinionmonitorItem() pipleitem['date'] = date[0] pipleitem['id'] = re.findall('-\d{5,}', response.url)[0] pipleitem['url'] = response.url pipleitem['title'] = response.css('head title::text').extract_first() # pipleitem['title'] = response.xpath('//h1[@class="wsj-article-headline"]/text()').extract_first() pipleitem['source'] = response.xpath( '//meta[@name="page.content.source"]/@content').extract_first() pipleitem['editor'] = response.xpath( '//meta[@name="author"]/@content').extract_first() pipleitem['content'] = helper.list2str( response.xpath( 'string(//div[@class="wsj-snippet-body"])').extract()) pipleitem['image_urls'] = None pipleitem['video_urls'] = None pipleitem['share'] = None pipleitem['like'] = None pipleitem['dislike'] = None pipleitem['comment'] = None pipleitem['crawl_time'] = helper.get_localtimestamp() return pipleitem
def content_parse(self, response):
    date = re.findall(r'\d{4}-\d+-\d+', response.css('head').extract_first())
    if len(date) > 0:
        date = date[0]
        if helper.compare_time(date, self.limittime) < 0:
            return
    else:
        return
    pipleitem = CctvOpinionmonitorItem()
    pipleitem['date'] = date
    pipleitem['id'] = None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('head title::text').extract_first()
    pipleitem['source'] = None
    pipleitem['editor'] = None
    pipleitem['content'] = None
    pipleitem['image_urls'] = helper.list2str(response.css('img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(response.css('video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response): date = response.xpath( '//span[@class="date style-scope ytd-video-secondary-info-renderer"]/text()' ).extract_first() log.msg(message=date, level=log.WARNING) if len(date) > 0 and isinstance(date, str): if date == None or date.find('getSimpleString') > 0 or date.find( 'ago') > 0: return if helper.compare_time(helper.formatTime(date), self.limittime) < 0: return else: return pipleitem = CctvOpinionmonitorItem() pipleitem['date'] = helper.formatTime(date) pipleitem['id'] = re.findall('v=(\S*).', response.url)[0] pipleitem['url'] = response.url pipleitem['title'] = response.xpath( '//div[@id="container"]/h1/yt-formatted-string/text()' ).extract_first() pipleitem['source'] = 'Youtube' pipleitem['editor'] = response.css( '#owner-name a::text').extract_first() pipleitem['content'] = None views = re.findall( '\d*', response.xpath( '//span[@class="view-count style-scope yt-view-count-renderer"]/text()' ).extract_first()) if len(views) > 0: pipleitem['views'] = views[0] pipleitem['image_urls'] = None pipleitem['video_urls'] = response.css( 'video::attr(src)').extract_first() pipleitem['share'] = None tmp = response.xpath( '//yt-formatted-string[@id="text"]/@aria-label').extract() if len(tmp) < 2: tmp = ['0', '0'] for i in range(len(tmp)): if re.search('No', tmp[i]): tmp[i] = '0' pipleitem['like'] = re.findall('\d*', tmp[0])[0] pipleitem['dislike'] = re.findall('\d*', tmp[1])[0] pipleitem['comment'] = '0' comment = re.findall( '\d*', response.xpath('//h2[@id="count"]/yt-formatted-string/text()'). extract_first())[0] if len(comment) > 0: pipleitem['comment'] = comment pipleitem['crawl_time'] = helper.get_localtimestamp() return pipleitem
def content_parse(self, response):
    datestr = response.css('.horn-txt p::text').extract_first()
    if datestr is not None:
        datestr = re.findall(r'\d{4}-\d+-\d+\s*[\d:]*', datestr)[0]
        if helper.compare_time(datestr, self.limittime) < 0:
            return
    else:
        return
    pipleitem = CctvOpinionmonitorItem()
    pipleitem['date'] = datestr
    pipleitem['id'] = response.meta['id']
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('.new-headline::text').extract_first()
    pipleitem['source'] = '华人头条'
    pipleitem['editor'] = response.css('.horn-mark p::text').extract_first()
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@class="news-content info-content"])').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.xpath('//div[@class="news-content info-content"]').css('img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(
        response.xpath('//div[@class="news-content info-content"]').css('video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    comment_count = re.findall(
        r'\d+', response.css('.comment-num span::text').extract_first())
    # re.findall() returns a list (never None), so test for matches instead
    if len(comment_count) > 0:
        pipleitem['comment'] = comment_count[0]
    else:
        pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    pipleitem = CctvOpinionmonitorItem()
    pipleitem['date'] = None
    pipleitem['id'] = re.findall(r'no=([a-z\d]*)', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.meta['title']
    pipleitem['content'] = response.xpath(
        'string(//p[@style="margin-right:8px;margin-bottom:18px"])').extract_first()
    pipleitem['source'] = '今日华人网'
    pipleitem['editor'] = None
    pipleitem['image_urls'] = None
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
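# Several parsers above read fields from response.meta ('date', 'id', 'title'),
# which implies the list-page callback passes them along when it schedules the
# detail request. A minimal sketch of that hand-off, with made-up selectors:
def list_parse_sketch(self, response):
    import scrapy  # normally imported at the top of the spider module
    for entry in response.css('.news-list li'):  # hypothetical list selector
        url = entry.css('a::attr(href)').extract_first()
        yield scrapy.Request(
            response.urljoin(url),
            callback=self.content_parse,
            meta={
                'title': entry.css('a::text').extract_first(),
                'id': entry.css('::attr(data-id)').extract_first(),  # hypothetical attribute
                'date': entry.css('.time::text').extract_first(),    # hypothetical selector
            })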
# -*- coding: utf-8 -*-