def content_parse(self, response):
    """Build a ``CctvOpinionmonitor5Item`` from a FAZ.NET article page.

    Args:
        response: Scrapy response of the article page.

    Returns:
        The populated item, or ``None`` when the page has no
        ``.atc-MetaTime`` datetime attribute, when ``helper.compare_time``
        returns a negative value against ``self.limittime`` (presumably:
        the article is older than the crawl cutoff), or when the date
        handling raises.
    """
    raw_date = response.css('.atc-MetaTime::attr(datetime)').extract_first()
    if not raw_date:
        return None

    try:
        # Keep only the digit groups joined by '-' or ':' from the attribute.
        date = helper.list2str(re.findall(r'\d+[-:]\d+[-:]*\d*', raw_date))
        if helper.compare_time(date, self.limittime) < 0:
            return None
    except Exception:
        # Best effort: an unparsable date drops the page, but interpreter
        # exits (KeyboardInterrupt / SystemExit) are no longer swallowed.
        return None

    # The first run of eight digits in the URL serves as the article id.
    id_matches = re.findall(r'\d{8}', response.url)

    pipleitem = CctvOpinionmonitor5Item()
    pipleitem['date'] = date
    pipleitem['id'] = id_matches[0] if id_matches else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('.atc-HeadlineText::text').extract_first()
    pipleitem['source'] = 'FAZ.NET - Nachrichten'
    pipleitem['content'] = helper.list2str(
        response.css('div[class*="atc-Text"]').xpath('string(.)').extract())
    pipleitem['editor'] = response.css('.atc-MetaAuthorLink::text').extract_first()
    pipleitem['views'] = None
    pipleitem['image_urls'] = helper.list2str(
        response.css('.atc-ImageContainer img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(
        response.css('.atc a::attr(href)').extract())
    # Engagement counters are not collected by this parser.
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Build a ``CctvOpinionmonitor5Item`` from a leparisien.fr article page.

    Args:
        response: Scrapy response of the article page.

    Returns:
        The populated item, or ``None`` when the page has no
        ``article:published_time`` meta tag, when ``helper.compare_time``
        returns a negative value against ``self.limittime`` (presumably:
        the article is older than the crawl cutoff), or when the date
        handling raises.
    """
    raw_date = response.xpath(
        '//meta[@property="article:published_time"]/@content'
    ).extract_first()
    if not raw_date:
        return None

    try:
        # Keep only the digit groups joined by '-' or ':' from the meta value.
        date = helper.list2str(re.findall(r'\d+[-:]\d+[-:]*\d*', raw_date))
        if helper.compare_time(date, self.limittime) < 0:
            return None
    except Exception:
        # Best effort: an unparsable date drops the page, but interpreter
        # exits (KeyboardInterrupt / SystemExit) are no longer swallowed.
        return None

    # The first run of seven or more digits in the URL is the article id.
    id_matches = re.findall(r'\d{7,}', response.url)
    # NOTE(review): 'source' and 'editor' are read from the same node in the
    # original code; confirm that this is intended.
    byline = response.xpath(
        '//span[@class="margin_top_sm ui_bold"]/text()').extract_first()

    pipleitem = CctvOpinionmonitor5Item()
    pipleitem['date'] = date
    pipleitem['id'] = id_matches[0] if id_matches else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = byline
    pipleitem['content'] = helper.list2str(
        response.xpath(
            'string(//div[@class="article-section margin_bottom_article"])'
        ).extract())
    pipleitem['editor'] = byline
    pipleitem['views'] = None
    pipleitem['image_urls'] = helper.list2str(
        input=response.css('article.grid img::attr(src)').extract(),
        prefix='http://www.leparisien.fr')
    pipleitem['video_urls'] = helper.list2str(
        response.xpath('//iframe[@allow="autoplay"]/@src').extract())
    # Engagement counters are not collected by this parser.
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Build a ``CctvOpinionmonitor5Item`` from a yorkbbs.ca news page.

    Besides scraping the page, this issues a blocking POST to the comment
    API to read the comment total for the article.

    Args:
        response: Scrapy response of the article page; ``response.meta``
            may carry the article ``id``.

    Returns:
        The populated item, or ``None`` when the page has no date node,
        when ``helper.compare_time`` returns a negative value against
        ``self.limittime`` (presumably: the article is older than the crawl
        cutoff), or when the comparison raises.
    """
    date = response.xpath('//div[@class="fl times"]/text()').extract_first()
    if not date:
        return None

    try:
        if helper.compare_time(date, self.limittime) < 0:
            return None
    except Exception:
        # Best effort: an unparsable date drops the page, but interpreter
        # exits (KeyboardInterrupt / SystemExit) are no longer swallowed.
        return None

    article_id = response.meta.get('id')
    origin = response.xpath('//div[@class="fl origin"]/text()').extract_first()

    pipleitem = CctvOpinionmonitor5Item()
    pipleitem['date'] = date
    pipleitem['id'] = article_id
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    # The origin node may be absent; re.sub would raise TypeError on None.
    pipleitem['source'] = (
        re.sub('来源:', '', origin) if origin is not None else None)
    pipleitem['content'] = helper.list2str(
        response.css('.news-detail-cont').xpath('string(.)').extract())
    pipleitem['editor'] = None
    pipleitem['views'] = None
    pipleitem['image_urls'] = helper.list2str(
        response.css('.news-detail-cont img::attr(src)').extract())
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = response.css(
        '#support .num-total::text').extract_first()
    pipleitem['dislike'] = response.css(
        '#against .num-total::text').extract_first()

    # The comment total is auxiliary data: a missing id, a network error or
    # an unexpected payload must not discard the scraped article.
    pipleitem['comment'] = None
    if article_id is not None:
        self.commentheaders['Referer'] = response.url
        self.commentpar['articleId'] = article_id
        try:
            reply = requests.post(
                url='https://comment.yorkbbs.ca/api/comment/getComment',
                data=self.commentpar,
                headers=self.commentheaders,
                # Bound the blocking call so a stalled server cannot hang
                # the crawler indefinitely.
                timeout=10)
            pipleitem['comment'] = json.loads(reply.text)['totalCount']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            pipleitem['comment'] = None

    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Build a ``CctvOpinionmonitor5Item`` from a page carrying JSON-LD data.

    The publication date is read from the ``datePublished`` key of the first
    ``application/ld+json`` script on the page.

    Args:
        response: Scrapy response of the article page.

    Returns:
        The populated item, or ``None`` when the JSON-LD script is missing,
        malformed, not a JSON object, or lacks ``datePublished``; when
        ``helper.compare_time`` returns a negative value against
        ``self.limittime`` (presumably: the article is older than the crawl
        cutoff); or when the date handling raises.
    """
    jsbd = response.xpath(
        '//script[@type="application/ld+json"]/text()').extract_first()

    raw_date = None
    if jsbd:
        try:
            metadata = json.loads(jsbd)
        except ValueError:
            # Malformed JSON-LD: treat the page as having no date.
            metadata = None
        # JSON-LD may also be a list of objects; only a dict has .get().
        if isinstance(metadata, dict):
            raw_date = metadata.get('datePublished')
    if not raw_date:
        return None

    try:
        # Keep only the digit groups joined by '-' or ':' from the value.
        date = helper.list2str(re.findall(r'\d+[-:]\d+[-:]\d+', raw_date))
        if helper.compare_time(date, self.limittime) < 0:
            return None
    except Exception:
        # Best effort: an unparsable date drops the page, but interpreter
        # exits (KeyboardInterrupt / SystemExit) are no longer swallowed.
        return None

    # Everything after the first 'com/' in the URL serves as the article id.
    id_matches = re.findall('com/(.*)', response.url)

    pipleitem = CctvOpinionmonitor5Item()
    pipleitem['date'] = date
    pipleitem['id'] = id_matches[0] if id_matches else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = response.css('.article-source::text').extract_first()
    pipleitem['content'] = helper.list2str(
        response.css('.article-body').xpath('string(.)').extract())
    pipleitem['editor'] = response.css(
        '.author-byline span span a::text').extract_first()
    pipleitem['views'] = None
    pipleitem['image_urls'] = helper.list2str(
        response.css('.article-body img::attr(src)').extract())
    # NOTE(review): 'video-container' is matched as an element name here,
    # not as a class ('.video-container'); confirm against the page markup.
    pipleitem['video_urls'] = helper.list2str(
        response.css(
            '.article-body video-container a::attr(href)').extract())
    # Engagement counters are not collected by this parser.
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Build a ``CctvOpinionmonitor5Item`` from a page with a cXense timestamp.

    The publication time is read from the
    ``cXenseParse:cbc-publishedTime`` meta tag and converted with
    ``helper.get_makedtime``.

    Args:
        response: Scrapy response of the article page; ``response.meta``
            may carry ``id``, ``author`` and ``numViewers``.

    Returns:
        The populated item, or ``None`` when the meta tag is missing or
        empty, when ``helper.compare_time`` returns a negative value against
        ``self.limittime`` (presumably: the article is older than the crawl
        cutoff), or when the comparison raises.
    """
    timestamp = response.xpath(
        '//meta[@name="cXenseParse:cbc-publishedTime"]/@content'
    ).extract_first()
    if not timestamp:
        return None

    date = helper.get_makedtime(format='%Y-%m-%d %H:%M:%S',
                                timestamp=timestamp)
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return None
    except Exception:
        # Best effort: an unparsable date drops the page, but interpreter
        # exits (KeyboardInterrupt / SystemExit) are no longer swallowed.
        return None

    meta = response.meta

    pipleitem = CctvOpinionmonitor5Item()
    # NOTE(review): 'date' comes from helper.get_makedtime (presumably a
    # formatted string, not a list); confirm list2str is intended here.
    pipleitem['date'] = helper.list2str(date)
    pipleitem['id'] = meta.get('id', None)
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = response.xpath(
        '//span[@class="detail-link-label sclt-storySectionLink"]/a/text()'
    ).extract_first()
    pipleitem['content'] = helper.list2str(
        response.css('.story').xpath('string(.)').extract())
    pipleitem['editor'] = meta.get('author', None)
    pipleitem['views'] = meta.get('numViewers', None)
    pipleitem['image_urls'] = helper.list2str(
        response.css('.detailBodyContainer img::attr(src)').extract())
    # NOTE(review): 'video-container' is matched as an element name here,
    # not as a class ('.video-container'); confirm against the page markup.
    pipleitem['video_urls'] = helper.list2str(
        response.css(
            '.detailBodyContainer video-container a::attr(href)').extract())
    # Engagement counters are not collected by this parser.
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Build a ``CctvOpinionmonitor5Item`` from an Asahi Shimbun Digital page.

    Args:
        response: Scrapy response of the article page.

    Returns:
        The populated item, or ``None`` when the page has no
        ``time.LastUpdated`` datetime attribute, when
        ``helper.compare_time`` returns a negative value against
        ``self.limittime`` (presumably: the article is older than the crawl
        cutoff), or when the date handling raises.
    """
    raw_date = response.css('time.LastUpdated::attr(datetime)').extract_first()
    if not raw_date:
        return None

    try:
        # Format once and reuse the result for both the cutoff check and
        # the stored value.
        date = helper.formatTime(raw_date)
        if helper.compare_time(date, self.limittime) < 0:
            return None
    except Exception:
        # Best effort: an unparsable date drops the page, but interpreter
        # exits (KeyboardInterrupt / SystemExit) are no longer swallowed.
        return None

    # The first run of twelve uppercase letters/digits in the URL is the id.
    id_matches = re.findall(r'[A-Z\d]{12}', response.url)
    source = response.xpath(
        '//div[@class="TagMemberSilver"]/a/text()').extract_first()

    pipleitem = CctvOpinionmonitor5Item()
    pipleitem['date'] = date
    pipleitem['id'] = id_matches[0] if id_matches else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    # Fall back to the site's own name when no member tag is present.
    pipleitem['source'] = source if source else '朝日新聞デジタル'
    pipleitem['content'] = helper.list2str(
        response.css('.ArticleText').xpath('string(.)').extract())
    pipleitem['editor'] = response.css(
        '.TagUnderTitle .Sub::text').extract_first()
    pipleitem['views'] = None
    pipleitem['image_urls'] = helper.list2str(
        response.css('.ArticleBody img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(
        response.css('.ArticleBody video::attr(src)').extract())
    # Engagement counters are not collected by this parser.
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Build a ``CctvOpinionmonitor5Item`` from a page with GEDI-hosted media.

    Video links are taken from the ``data-sharebuttons`` attribute of the
    share-button elements: the first ``media.gedidigital.it`` mp4 URL found
    in each attribute value is kept.

    Args:
        response: Scrapy response of the article page.

    Returns:
        The populated item, or ``None`` when the page has no
        ``article:published_time`` meta tag, when ``helper.compare_time``
        returns a negative value against ``self.limittime`` (presumably:
        the article is older than the crawl cutoff), or when the date
        handling raises.
    """
    raw_date = response.xpath(
        '//meta[@property="article:published_time"]/@content'
    ).extract_first()
    if not raw_date:
        return None

    try:
        # Keep only the digit groups joined by '-' or ':' from the meta value.
        date = helper.list2str(re.findall(r'\d+[-:]\d+[-:]*\d*', raw_date))
        if helper.compare_time(date, self.limittime) < 0:
            return None
    except Exception:
        # Best effort: an unparsable date drops the page, but interpreter
        # exits (KeyboardInterrupt / SystemExit) are no longer swallowed.
        return None

    # The first run of eight or more digits in the URL is the article id.
    id_matches = re.findall(r'\d{8,}', response.url)

    pipleitem = CctvOpinionmonitor5Item()
    pipleitem['date'] = date
    pipleitem['id'] = id_matches[0] if id_matches else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = response.xpath(
        '//meta[@property="article:publisher"]/@content').extract_first()
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//span[@itemprop="articleBody"])').extract())
    pipleitem['editor'] = response.css('.author span::text').extract_first()
    pipleitem['views'] = None
    pipleitem['image_urls'] = helper.list2str(
        response.css('.body-text img::attr(src)').extract())

    share_payloads = response.xpath(
        '//div[@class="share-button gs-social-popup-trigger"]'
        '/@data-sharebuttons').extract()
    video_links = []
    for payload in share_payloads or []:
        # Dots are escaped so they match literal dots only, not any char.
        found = re.search(
            r'https://media\.gedidigital\.it/[^.]+\.mp4', payload)
        if found:
            video_links.append(found.group(0))
    pipleitem['video_urls'] = helper.list2str(video_links)

    # Engagement counters are not collected by this parser.
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem