def content_parse(self, response):
    """Parse an article detail page into a CctvOpinionmonitor4Item.

    The publish date is forwarded by the listing parser via ``response.meta``;
    the article is dropped when the date is missing, empty, or older than
    ``self.limittime``. Returns the populated item or None when dropped.
    """
    date = response.meta.get('date')
    if not date:  # None or empty string
        return
    try:
        # Drop articles published before the crawl's time limit.
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # Unparsable date string: best-effort skip rather than crash.
        return
    pipe_item = CctvOpinionmonitor4Item()
    pipe_item['date'] = date
    pipe_item['id'] = response.meta.get('id')
    pipe_item['url'] = response.url
    pipe_item['title'] = response.css('.title::text').extract_first()
    pipe_item['source'] = response.xpath(
        '//div[@name="source"]/@content').extract_first()
    pipe_item['content'] = helper.list2str(
        response.css('.story-body__inner p').xpath('string(.)').extract())
    pipe_item['editor'] = response.css('.author a::text').extract_first()
    pipe_item['views'] = None
    pipe_item['image_urls'] = helper.list2str(
        response.css('img::attr(src)').extract())
    pipe_item['video_urls'] = helper.list2str(
        response.css('video::attr(src)').extract())
    pipe_item['share'] = None
    pipe_item['like'] = None
    pipe_item['dislike'] = None
    pipe_item['comment'] = None
    pipe_item['crawl_time'] = helper.get_localtimestamp()
    return pipe_item
def content_parse(self, response):
    """Parse a JSON article payload into a CctvOpinionmonitor4Item.

    Skips empty payloads and articles published before ``self.limittime``.
    Returns the populated item, or None when the article is dropped.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    date = jsonbd.get('pub_time')
    if date is None:
        return
    try:
        # Drop articles older than the crawl's time limit.
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # Unparsable date: skip this article instead of crashing the spider.
        return
    pipe_item = CctvOpinionmonitor4Item()
    pipe_item['date'] = date
    pipe_item['id'] = jsonbd.get('id')
    pipe_item['url'] = response.url
    pipe_item['title'] = jsonbd.get('title')
    # Original indexed jsonbd['card']['chlname'] after checking only 'card';
    # guard both levels so a card without 'chlname' no longer raises.
    pipe_item['source'] = (jsonbd.get('card') or {}).get('chlname')
    content = jsonbd.get('content')
    # Strip HTML tags out of the embedded fragment; guard the 'text' key too.
    pipe_item['content'] = helper.list2str(
        re.findall(r'>(.*?)<', content.get('text', ''))) if content else ''
    pipe_item['editor'] = None
    # Original indexed jsonbd['count_info'] without checking it exists.
    count_info = jsonbd.get('count_info') or {}
    pipe_item['views'] = count_info.get('playcount')
    video_urls = []
    image_urls = []
    attribute = jsonbd.get('attribute')
    if isinstance(attribute, dict):
        # Keys embed the media type (e.g. contain "VIDEO" or "IMG").
        for key, value in attribute.items():
            if 'VIDEO' in key and 'playurl' in value:
                video_urls.append(value['playurl'])
            if 'IMG' in key and 'url' in value:
                image_urls.append(value['url'])
    if 'imgurl' in jsonbd:
        image_urls.append(jsonbd['imgurl'])
    pipe_item['image_urls'] = helper.list2str(image_urls)
    pipe_item['video_urls'] = helper.list2str(video_urls)
    pipe_item['share'] = count_info.get('share_count')
    pipe_item['like'] = count_info.get('like_info')
    pipe_item['dislike'] = None
    pipe_item['comment'] = count_info.get('comments')
    pipe_item['crawl_time'] = helper.get_localtimestamp()
    return pipe_item
def parse(self, response):
    """Parse a JSON listing page and yield one item per article.

    Comment counts arrive in a parallel ``comment_infos`` block keyed by
    ``context_id``. Once an article older than ``self.limittime`` is seen the
    whole listing is abandoned (assumes newest-first ordering — TODO confirm).
    """
    jsbd = json.loads(response.text)
    comment_counts = {
        entry['context_id']: entry['count']
        for entry in jsbd['comment_infos']['result']
    }
    for article in jsbd['items']['result']:
        date = article.get('published_at')
        if date is None or len(str(date)) == 0:
            continue
        try:
            # Stop the whole page once we cross the time limit.
            if helper.compare_time(
                    helper.get_makedtime('%Y-%m-%d %H:%M:%S', date),
                    self.limittime) < 0:
                return
        except Exception:
            # Unparsable timestamp: abandon the listing (matches original).
            return
        pipe_item = CctvOpinionmonitor4Item()
        pipe_item['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S', date)
        pipe_item['id'] = article.get('context_id')
        pipe_item['url'] = article.get('link')
        pipe_item['title'] = article.get('title')
        pipe_item['source'] = article.get('publisher')
        # 'content' is an embedded HTML fragment; guard against a None value,
        # which the original would have passed straight into re.findall.
        content = article.get('content')
        pipe_item['content'] = helper.list2str(
            re.findall(r'>(.*?)<', content)) if content is not None else ''
        pipe_item['editor'] = article.get('author_name')
        pipe_item['views'] = None
        pipe_item['image_urls'] = helper.list2str(
            re.findall(r'<img.*?src="(.*?)"',
                       content)) if content is not None else ''
        pipe_item['video_urls'] = helper.list2str(
            re.findall(r'<video.*?src="(.*?)"',
                       content)) if content is not None else ''
        pipe_item['share'] = None
        pipe_item['like'] = None
        pipe_item['dislike'] = None
        # Look up the pre-collected comment count for this article.
        pipe_item['comment'] = comment_counts.get(article.get('context_id'))
        pipe_item['crawl_time'] = helper.get_localtimestamp()
        yield pipe_item
def content_parse(self, response):
    """Parse a JSON article into a CctvOpinionmonitor4Item.

    The source name and comment count are forwarded by the listing parser
    through ``response.meta``. Returns the item, or None when dropped.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    date = jsonbd.get('publish_time')
    if date is None:
        return
    try:
        # Drop articles older than the crawl's time limit.
        if helper.compare_time(
                helper.get_makedtime('%Y-%m-%d %H:%M:%S', date),
                self.limittime) < 0:
            return
    except Exception:
        # Unparsable publish_time: skip instead of crashing.
        return
    pipe_item = CctvOpinionmonitor4Item()
    pipe_item['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S', date)
    pipe_item['id'] = jsonbd.get('article_id')
    pipe_item['url'] = response.url
    pipe_item['title'] = jsonbd.get('title')
    pipe_item['source'] = response.meta.get('mediaName')
    content = jsonbd.get('content')
    # Strip HTML tags from the embedded fragment.
    pipe_item['content'] = helper.list2str(
        re.findall(r'>(.*?)<', content)) if content else ''
    # Guard against 'media' being present but None as well as absent.
    media = jsonbd.get('media') or {}
    pipe_item['editor'] = helper.list2str(
        media['author']) if 'author' in media else None
    pipe_item['views'] = None
    images = jsonbd.get('images') or []
    pipe_item['image_urls'] = helper.list2str([img['url'] for img in images])
    # Normalized: the original stored a raw [] when 'videos' was absent but a
    # list2str() result otherwise — always go through list2str now.
    pipe_item['video_urls'] = helper.list2str(jsonbd.get('videos') or [])
    pipe_item['share'] = None
    pipe_item['like'] = None
    pipe_item['dislike'] = None
    pipe_item['comment'] = response.meta.get('comment')
    pipe_item['crawl_time'] = helper.get_localtimestamp()
    return pipe_item
def content_parse(self, response):
    """Parse a yorkbbs article page into a CctvOpinionmonitor4Item.

    Also fetches the article's comment count synchronously from the
    yorkbbs comment API. Returns the item, or None when dropped.
    """
    date = response.xpath('//div[@class="fl times"]/text()').extract_first()
    if not date:
        return
    try:
        # Drop articles older than the crawl's time limit.
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # Unparsable date text: skip this page.
        return
    pipe_item = CctvOpinionmonitor4Item()
    pipe_item['date'] = date
    pipe_item['id'] = response.meta.get('id')
    pipe_item['url'] = response.url
    pipe_item['title'] = response.css('title::text').extract_first()
    origin = response.xpath('//div[@class="fl origin"]/text()').extract_first()
    # extract_first() returns None when the node is missing; the original
    # passed that straight into re.sub(), raising TypeError.
    pipe_item['source'] = re.sub('来源:', '', origin) if origin is not None else None
    pipe_item['content'] = helper.list2str(
        response.css('.news-detail-cont').xpath('string(.)').extract())
    pipe_item['editor'] = None
    pipe_item['views'] = None
    pipe_item['image_urls'] = helper.list2str(
        response.css('.news-detail-cont img::attr(src)').extract())
    pipe_item['video_urls'] = None
    pipe_item['share'] = None
    pipe_item['like'] = response.css(
        '#support .num-total::text').extract_first()
    pipe_item['dislike'] = response.css(
        '#against .num-total::text').extract_first()
    # NOTE(review): a blocking requests.post inside a crawler callback stalls
    # the event loop; consider scheduling an asynchronous request instead.
    self.commentheaders['Referer'] = response.url
    self.commentpar['articleId'] = response.meta.get('id')
    html = requests.post(
        url='https://comment.yorkbbs.ca/api/comment/getComment',
        data=self.commentpar,
        headers=self.commentheaders)
    # .get() so a malformed API response yields None rather than KeyError.
    pipe_item['comment'] = json.loads(html.text).get('totalCount')
    pipe_item['crawl_time'] = helper.get_localtimestamp()
    return pipe_item
def content_parse(self, response):
    """Parse a Duowei News JSON article into a CctvOpinionmonitor4Item.

    Returns the populated item, or None when the payload is empty, lacks a
    'data' block, or the article is older than ``self.limittime``.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd or 'data' not in jsonbd:
        return
    jsonbd = jsonbd['data']
    date = jsonbd.get('datetime')
    if date is None:
        return
    try:
        # Drop articles older than the crawl's time limit.
        if helper.compare_time(
                helper.get_makedtime('%Y-%m-%d %H:%M:%S', date),
                self.limittime) < 0:
            return
    except Exception:
        # Unparsable datetime: skip the article.
        return
    pipe_item = CctvOpinionmonitor4Item()
    pipe_item['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S', date)
    pipe_item['id'] = jsonbd.get('ikey')
    pipe_item['url'] = response.url
    pipe_item['title'] = jsonbd.get('title')
    pipe_item['source'] = '多维新闻'
    content = jsonbd.get('content')
    # Strip HTML tags from the embedded fragment.
    pipe_item['content'] = helper.list2str(
        re.findall(r'>(.*?)<', content)) if content else ''
    pipe_item['editor'] = response.meta.get('author')
    pipe_item['views'] = None
    # Normalized: the original stored a raw [] when 'picurl' was absent but a
    # list2str() result otherwise — always go through list2str now.
    pipe_item['image_urls'] = helper.list2str(jsonbd.get('picurl') or [])
    pipe_item['video_urls'] = None
    pipe_item['share'] = None
    pipe_item['like'] = None
    pipe_item['dislike'] = None
    pipe_item['comment'] = None
    pipe_item['crawl_time'] = helper.get_localtimestamp()
    return pipe_item
def content_parse(self, response):
    """Parse a Yidian article page into a CctvOpinionmonitor4Item.

    date/id/like/comment_count are forwarded by the listing parser via
    ``response.meta``. Returns the item, or None when dropped.
    """
    date = response.meta.get('date')
    if not date:  # None or empty string
        return
    try:
        # Drop articles older than the crawl's time limit.
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # Unparsable date string: skip this page.
        return
    pipe_item = CctvOpinionmonitor4Item()
    pipe_item['date'] = date
    pipe_item['id'] = response.meta.get('id')
    pipe_item['url'] = response.url
    pipe_item['title'] = response.css('.container h3::text').extract_first()
    pipe_item['source'] = response.xpath(
        '//div[@class="source imedia"]/text()').extract_first()
    pipe_item['content'] = helper.list2str(
        response.css('#js-article p').xpath('string(.)').extract())
    editor = response.css('#yidian_editor::text').extract_first()
    # Strip the "责任编辑:" (editor-in-charge) prefix when present.
    pipe_item['editor'] = re.sub('责任编辑:', '',
                                 editor) if editor is not None else ''
    pipe_item['views'] = None
    pipe_item['image_urls'] = helper.list2str(
        response.css('#yidian_editor img::attr(src)').extract())
    pipe_item['video_urls'] = helper.list2str(
        response.css('#yidian_editor video::attr(src)').extract())
    pipe_item['share'] = None
    pipe_item['like'] = response.meta.get('like', 0)
    pipe_item['dislike'] = None
    pipe_item['comment'] = response.meta.get('comment_count', 0)
    pipe_item['crawl_time'] = helper.get_localtimestamp()
    return pipe_item
def content_parse(self, response):
    """Parse a 51.ca article page into a CctvOpinionmonitor4Item.

    post_date/id/source/author/comments_num are forwarded via
    ``response.meta``. Returns the item, or None when dropped.
    """
    date = response.meta.get('post_date')
    if not date:  # None or empty string
        return
    try:
        # Drop articles older than the crawl's time limit.
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # Unparsable date string: skip this page.
        return
    pipe_item = CctvOpinionmonitor4Item()
    pipe_item['date'] = helper.formatTime(date)
    pipe_item['id'] = response.meta.get('id')
    pipe_item['url'] = response.url
    pipe_item['title'] = response.css('#arcmaintitle::text').extract_first()
    pipe_item['source'] = response.meta.get('source')
    pipe_item['content'] = helper.list2str(
        response.css('#arcbody').xpath('string(.)').extract())
    pipe_item['editor'] = response.meta.get('author')
    pipe_item['views'] = None
    # Image srcs are site-relative; prefix the host to make them absolute.
    pipe_item['image_urls'] = helper.list2str([
        'https://info.51.ca{}'.format(src)
        for src in response.css('#arcbody img::attr(src)').extract()
    ])
    pipe_item['video_urls'] = None
    pipe_item['share'] = None
    pipe_item['like'] = None
    pipe_item['dislike'] = None
    pipe_item['comment'] = response.meta.get('comments_num')
    pipe_item['crawl_time'] = helper.get_localtimestamp()
    return pipe_item
def content_parse(self, response):
    """Parse a Washington Post article page into a CctvOpinionmonitor4Item.

    Returns the populated item, or None when the timestamp is missing,
    unparsable, or older than ``self.limittime``.
    """
    timestamp = response.css(
        '.author-timestamp::attr(content)').extract_first()
    # Guard BEFORE the regex: extract_first() returns None when the node is
    # missing, and the original passed that into re.findall (TypeError).
    if not timestamp:
        return
    date = helper.list2str(
        re.findall(r'(\d{4}-\d{2}-\d{2}|\d{2}:\d{2})', timestamp))
    if not date:
        return
    try:
        # Drop articles older than the crawl's time limit.
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # Unparsable date string: skip this page.
        return
    pipe_item = CctvOpinionmonitor4Item()
    pipe_item['date'] = date
    # Renamed from `id` to avoid shadowing the builtin.
    article_id = re.findall(r'(\d{2,4}/).*', response.url)
    pipe_item['id'] = article_id[0] if article_id else None
    pipe_item['url'] = response.url
    pipe_item['title'] = response.css('.title::text').extract_first()
    pipe_item['source'] = 'WashingtonPost'
    pipe_item['content'] = helper.list2str(
        response.css('.paywall p').xpath('string(.)').extract())
    pipe_item['editor'] = response.css('.author::text').extract_first()
    pipe_item['views'] = None
    pipe_item['image_urls'] = helper.list2str(
        response.css('.paywall img::attr(src)').extract())
    pipe_item['video_urls'] = helper.list2str(
        response.css('.paywall video::attr(src)').extract())
    pipe_item['share'] = None
    pipe_item['like'] = None
    pipe_item['dislike'] = None
    pipe_item['comment'] = None
    # Guard the [0]: the original raised IndexError when no "@id" was found.
    asset_ids = re.findall(r'"@id":(\S*)', response.text)
    if asset_ids:
        # NOTE(review): the "url" below is a GraphQL query string, not an HTTP
        # URL — this request looks broken; confirm the intended comment-count
        # endpoint before relying on it.
        rs = requests.get(
            url='{asset(url:' + asset_ids[0] + '){totalCommentCount}}',
            headers=self.headers).text
        # Was r'\d*', whose first match is always the empty string at position
        # 0, so the comment count came back as '' regardless of the response.
        counts = re.findall(r'\d+', rs)
        pipe_item['comment'] = counts[0] if counts else None
    pipe_item['crawl_time'] = helper.get_localtimestamp()
    return pipe_item