def content_parse4(self, response):
    """Parse a JSON news-list response into a populated item.

    Returns None when the ``data`` list is empty.

    NOTE(review): the loop returns on its first iteration, so only the
    first entry of ``data`` is ever emitted — confirm whether this was
    meant to yield every entry.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd['data']:
        return
    for item in jsonbd['data']:
        pipleitem = CctvOpinionmonitor2Item()
        # Fall back to a sentinel date when the feed omits news_time.
        pipleitem['date'] = helper.formatTime(item['news_time']) if 'news_time' in item else '2019-01-01'
        pipleitem['id'] = item.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = item.get('page_name')
        pipleitem['source'] = item.get('copyfrom')
        pipleitem['editor'] = None
        pipleitem['content'] = item.get('content')
        # Keep only non-empty image URLs.
        pipleitem['image_urls'] = helper.list2str([i['url'] for i in item['image'] if i['url']])
        pipleitem['video_urls'] = item.get('video_url')
        pipleitem['share'] = item.get('share_count')
        pipleitem['like'] = item.get('likes_count')
        pipleitem['dislike'] = None
        pipleitem['views'] = item.get('read_count')
        pipleitem['comment'] = item.get('comment_count')
        pipleitem['crawl_time'] = helper.get_localtimestamp()
        return pipleitem
def content_parse(self, response):
    """Parse a news.eastday.com article page into an item.

    Returns None when the publish date is missing, unparseable, or older
    than ``self.limittime``.
    """
    date = response.xpath('//*[@id="pubtime_baidu"]/text()').extract_first()
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = response.xpath('//meta[@name="contentid"]/@content').extract_first()
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//title/text()').extract_first()
    pipleitem['source'] = response.xpath('//meta[@name="source"]/@content').extract_first()
    pipleitem['editor'] = response.xpath('//meta[@name="author"]/@content').extract_first()
    pipleitem['content'] = helper.list2str(response.xpath('string(//div[contains(@id,"zw")])').extract())
    # Image srcs are site-relative (e.g. /images/thumbnailimg/...); prefix the host.
    pipleitem['image_urls'] = helper.list2str(
        ['http://news.eastday.com{}'.format(src) for src in response.css('#zw img::attr(src)').extract()])
    pipleitem['video_urls'] = helper.list2str(response.css('#zw source::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse an article page, preferring metadata carried in response.meta.

    Returns None when the date is missing, unparseable, or older than
    ``self.limittime``.
    """
    date = (response.meta['date'] if 'date' in response.meta
            else response.css('.article-sub span:last-child::text').extract_first())
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    # NOTE(review): '[com|cn]' is a character class (any of c,o,m,|,n), not
    # alternation; pattern preserved as-is to keep behavior identical.
    pipleitem['id'] = (response.meta['id'] if 'id' in response.meta
                       else re.findall(r'[com|cn]/(.*)', response.url)[0])
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//title/text()').extract_first()
    pipleitem['source'] = (response.meta['source'] if 'source' in response.meta
                           else response.css('.article-sub span:first-child::text').extract_first())
    pipleitem['editor'] = None
    pipleitem['content'] = helper.list2str(response.xpath('string(//body)').extract())
    pipleitem['image_urls'] = helper.list2str(response.css('img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(response.css('video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = response.css('.share-count span::text').extract_first()
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Extract the first date-like string from the raw page; emit a minimal item.

    Only date/id/url/title are populated; all content fields are None.
    """
    datelist = re.findall(r'\d{4}[-年.]+\d+[-月.]+\d+[日]*', response.text)
    if not datelist:
        return
    try:
        if helper.compare_time(datelist[0], self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(datelist[0])
    pipleitem['id'] = None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//title/text()').extract_first()
    pipleitem['source'] = None
    pipleitem['editor'] = None
    pipleitem['content'] = None
    pipleitem['image_urls'] = None
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse2(self, response):
    """Parse a People's Daily app Q&A JSON payload into an item.

    Returns None when the payload is empty or the question date is
    missing/out of range.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    data = jsonbd['data']
    date = data['question']['date'] if 'question' in data else None
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = data.get('question_id')
    pipleitem['url'] = response.url
    pipleitem['title'] = data.get('title')
    pipleitem['source'] = '人民日报APP'
    pipleitem['editor'] = data['question']['user_name'] if 'question' in data else None
    pipleitem['content'] = data['question']['content'] if 'question' in data else None
    pipleitem['image_urls'] = None
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = data.get('like_num')
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = data.get('comment_num')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a chinanews-style article page into an item."""
    date = response.css('#newsdate::attr(value)').extract_first()
    if not date:
        return
    if helper.compare_time(date, self.limittime) < 0:
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = re.findall(r'\d{2}-\d{2}/(.*)\.', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//title/text()').extract_first()
    source = response.xpath('string(//div[@class="left-t"])').extract_first()
    if source is not None:
        # Guard against pages without a 来源 (source) label; the original
        # indexed [0] unconditionally and could raise IndexError.
        matches = re.findall(r'来源:(\S*)', source)
        if matches:
            source = matches[0]
    pipleitem['source'] = source
    editor = response.css('.left_name .left_name::text').extract_first()
    if editor is not None:
        tmp = re.findall(r'【编辑:(.*)】', editor)
        if tmp:
            editor = tmp[0]
    pipleitem['editor'] = editor
    # \u3000 is the ideographic (full-width) space used for paragraph indents.
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@class="left_zw"])').extract()).replace('\u3000', '')
    pipleitem['image_urls'] = helper.list2str(response.css('.left_zw img::attr(src)').extract())
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse3(self, response):
    """Parse a People's Daily paper-layout JSON listing into a single item.

    NOTE(review): the loop returns on its first iteration, so only the
    first entry of ``data`` is ever emitted — confirm whether this was
    meant to yield every entry.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    for item in jsonbd['data']:
        pipleitem = CctvOpinionmonitor2Item()
        # Fall back to a sentinel date when the entry omits period_num.
        pipleitem['date'] = helper.formatTime(item['period_num']) if 'period_num' in item else '2019-01-01'
        pipleitem['id'] = item.get('page_num')
        pipleitem['url'] = response.url
        pipleitem['title'] = item.get('page_name')
        pipleitem['source'] = '人民日报布版画'
        pipleitem['editor'] = None
        # The layout image doubles as the "content" of the item.
        pipleitem['content'] = item.get('page_pic')
        pipleitem['image_urls'] = item.get('page_pic')
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()
        return pipleitem
def content_parse(self, response):
    """Parse an article JSON document (title/media/photos schema).

    Read/comment counts are supplied by the caller via response.meta.
    """
    jsonbd = json.loads(response.text)
    date = jsonbd['time'] if 'time' in jsonbd and len(jsonbd['time']) != 0 else None
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    content = jsonbd.get('content')
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = jsonbd.get('newsId')
    pipleitem['url'] = response.url
    pipleitem['title'] = jsonbd.get('title')
    pipleitem['source'] = jsonbd['media']['mediaName'] if 'media' in jsonbd else None
    pipleitem['editor'] = None
    # Strip markup by keeping only the text between '>' and '<'.
    pipleitem['content'] = helper.list2str(re.findall('>(.*?)<', content)) if content is not None else None
    # 'photos' is not guaranteed to be present; default to an empty list
    # (the original indexed jsonbd['photos'] and could raise KeyError).
    pipleitem['image_urls'] = helper.list2str([p['pic'] for p in jsonbd.get('photos', []) if p['pic']])
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = response.meta['readCount']
    pipleitem['comment'] = response.meta['commentNum']
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a Sina-style article page (doc-<id> URLs) into an item."""
    date = response.xpath('//span[@class="date"]/text()').extract_first()
    if not date:
        return
    if helper.compare_time(date, self.limittime) < 0:
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = re.findall(r'doc-(.*)\.', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//title/text()').extract_first()
    pipleitem['source'] = response.css('.source::text').extract_first()
    pipleitem['editor'] = None
    pipleitem['content'] = helper.list2str(response.xpath('string(//div[@id="article"])').extract())
    pipleitem['image_urls'] = helper.list2str(response.css('.article img::attr(src)').extract())
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a news or video page; ``response.meta['kw'] == '视频'``
    selects the video-page layout for every selector below.
    """
    is_video = response.meta['kw'] == '视频'
    date = response.css('.news_about p:nth-child(2)::text').extract_first()
    if is_video:
        date = response.css('.video_info_left span:first-child::text').extract_first()
    if not date:
        return
    # Normalize to the leading "YYYY-MM-DD hh:mm"-shaped fragment.
    date = re.findall(r'[\d-]+\s*[\d:]*', date)[0]
    if helper.compare_time(date, self.limittime) < 0:
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = re.findall('forward_(.*)', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//meta[@name="Description"]/@content').extract_first()
    source = response.css('.news_about span:first-child::text').extract_first()
    if is_video:
        # Try the two known locations of the source label on video pages.
        for pt in ['.oriBox::text', '.video_info_second span:first-child::text']:
            tmp = response.css(pt).extract_first()
            if tmp is not None and len(tmp) > 1:
                source = tmp
                break
    # Guard: source may be None when no selector matched; the original
    # passed it straight to re.findall and could raise TypeError.
    srcmatch = re.findall(r'来源[\s:]*(.*)', source) if source is not None else []
    pipleitem['source'] = srcmatch[0] if srcmatch else None
    editor = response.css('.news_infor_extra .infor_item:first-child::text').extract_first()
    if is_video:
        editor = response.css('.video_info_second span:last-child::text').extract_first()
    if editor is not None:
        editor = editor.replace('责任编辑:', '')
    pipleitem['editor'] = editor
    content = helper.list2str(response.xpath('string(//div[@class="news_txt"])').extract())
    if is_video:
        content = response.xpath('string(//div[@class="video_txt_l"])').extract_first()
    pipleitem['content'] = content
    pipleitem['image_urls'] = helper.list2str(response.css('.news_txt img::attr(src)').extract())
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    like = response.css('[class~=news_love] .nbgbox:first-child a::text').extract_first()
    if like is None:
        like = response.xpath('//div[@class="news_love detail_gov"]/div[1]/a/text()').extract_first()
    if is_video:
        like = response.css('.zanBox a::text').extract_first()
    # Guard: like may still be None; re.findall(None) raises TypeError.
    pipleitem['like'] = re.findall(r'[\n\s]*(\d*)', like)[0] if like is not None else None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    comment = None
    if is_video:
        comment = response.css('.reply::text').extract_first()
    pipleitem['comment'] = comment
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a JSON article body; list-page metadata arrives in response.meta."""
    date = response.meta.get('date')
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = response.meta.get('id')
    pipleitem['url'] = response.url
    pipleitem['title'] = response.meta.get('title')
    pipleitem['source'] = response.meta.get('source')
    content = jsonbd['content']
    pipleitem['editor'] = content.get('cms_editor')
    # Strip markup by keeping only the text between '>' and '<'.
    pipleitem['content'] = (helper.list2str(re.findall('>(.*?)<', content['text']))
                            if 'text' in content else None)
    imglist = []
    videolist = []
    # Attribute keys encode the asset type (…VIDEO…, …IMG…).
    for key, attr in jsonbd.get('attribute', {}).items():
        if re.search('VIDEO', key):
            videolist.append(attr['playurl'])
        if re.search('IMG', key):
            imglist.append(attr['url'])
    pipleitem['image_urls'] = helper.list2str(imglist)
    pipleitem['video_urls'] = helper.list2str(videolist)
    pipleitem['share'] = None
    pipleitem['like'] = response.meta.get('likes_count')
    pipleitem['dislike'] = None
    pipleitem['views'] = response.meta.get('read_count')
    pipleitem['comment'] = response.meta.get('comment_count')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a Tianya forum post page into an item."""
    date = response.css('#post_head .atl-info span:nth-child(2)::text').extract_first()
    if not date:
        return
    date = re.findall(r'时间:(\S*)', date)[0]
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = re.findall(r'post-([a-z\d-]*).', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = '天涯社区'
    pipleitem['editor'] = response.xpath('//meta[@name="author"]/@content').extract_first()
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@class="bbs-content clearfix"])').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.xpath('//div[@class="bbs-content clearfix"]/img/@src').extract())
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = response.css('.shang_zan::attr(data-number)').extract_first()
    pipleitem['dislike'] = None
    # BUG FIX: the original guard used `views != None or len(views) != 0`,
    # which raises TypeError when views is None and IndexError when the
    # 点击 label does not match; same for the 回复 (comment) field.
    views = response.css('#post_head .atl-info span:nth-child(3)::text').extract_first()
    found = re.findall(r'点击:(.*)', views) if views else []
    pipleitem['views'] = found[0] if found else None
    comment = response.css('#post_head .atl-info span:nth-child(4)::text').extract_first()
    found = re.findall(r'回复:(.*)', comment) if comment else []
    pipleitem['comment'] = found[0] if found else None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a rednet.cn article page; the like-count comes from a
    synchronous side request to the site's star API.
    """
    date = response.xpath('//meta[@name="publishdate"]/@content').extract_first()
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = response.xpath('//meta[@name="contentid"]/@content').extract_first()
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//title/text()').extract_first()
    pipleitem['source'] = response.xpath('//meta[@name="source"]/@content').extract_first()
    editor = response.xpath('//meta[@name="author"]/@content').extract_first()
    if not editor:
        # Fall back to scanning the raw HTML; guard against no match
        # (the original indexed [0] unconditionally and could IndexError).
        found = re.findall(r'编辑:(\S*)', response.text)
        editor = found[0] if found else None
    pipleitem['editor'] = editor
    content = helper.list2str(
        response.xpath('string(//section[contains(@class,"detail_article_content")])').extract())
    # Very short extractions mean the primary selector missed; retry the
    # legacy #articlecontent container.
    if content is None or len(content) < 10:
        content = helper.list2str(response.xpath('string(//*[@id="articlecontent"])').extract())
    pipleitem['content'] = content
    pipleitem['image_urls'] = helper.list2str(
        response.css('[class~=detail_article_content] img::attr(src)').extract())
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    # Timeout added so a stalled endpoint cannot hang the callback forever.
    pipleitem['like'] = requests.get(
        'https://front-web.rednet.cn/content/star/' + pipleitem['id'], timeout=10).text
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content2_parse(self, response):
    """Parse a Zhihu answer page via its embedded #js-initialData JSON blob."""
    date = response.xpath('//meta[@itemprop="dateCreated"]/@content').extract_first()
    if not date:
        return
    date = re.findall(r'\d{4}-\d+-\d+', date)[0]
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    extor = response.css('#js-initialData::text').extract_first()
    if not extor:
        return
    jsonbd = json.loads(extor)
    answers = jsonbd['initialState']['entities']['answers']
    if not answers:
        return
    # BUG FIX: the original looped over answers.keys() while rebinding the
    # variable to answers[key], which raises KeyError whenever more than
    # one answer is embedded; take the first answer deterministically.
    questions_body = answers[next(iter(answers))]
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = questions_body['id']
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = '知乎'
    pipleitem['editor'] = questions_body['author']['name']
    # Strip markup by keeping only the text between '>' and '<'.
    pipleitem['content'] = helper.list2str(re.findall('>(.*?)<', questions_body['content']))
    pipleitem['image_urls'] = helper.list2str(response.css('.RichContent-inner img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(response.css('.RichContent-inner video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = questions_body['voteupCount']
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = questions_body['commentCount']
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def shiping_content_parse(self, response):
    """Parse a video-commentary (视评) page into an item."""
    titleleft = response.css('.content_title .left p::text').extract_first()
    # Guard: the selector may not match; re.findall(None) raises TypeError.
    if titleleft is None:
        return
    date = re.findall(r'\d{4}年\d{2}月\d{2}日\s*[\d:]*', titleleft)
    if not date:
        return
    try:
        if helper.compare_time(date[0], self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.formatTime(date[0])
    pipleitem['id'] = re.findall(r'\d{2}-\d{2}/(.*)\.', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//title/text()').extract_first()
    source = re.findall(r'来源:(.*)', titleleft)
    if source:
        source = source[0]
    pipleitem['source'] = source
    editor = response.css('.content_desc span::text').extract_first()
    if editor is not None:
        tmp = re.findall(r'责任编辑:【(.*)】', editor)
        if tmp:
            editor = tmp[0]
    pipleitem['editor'] = editor
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@class="content_desc"])').extract())
    pipleitem['image_urls'] = None
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse an article JSON payload keyed under 'data'.

    The item id and share count are carried in response.meta by the caller.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    data = jsonbd['data']
    date = data.get('news_datetime')
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    content = data.get('contents', '')
    images = data.get('image', [])
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = response.meta['id']
    pipleitem['url'] = response.url
    pipleitem['title'] = data.get('title')
    pipleitem['source'] = data.get('copyfrom')
    pipleitem['editor'] = data.get('admin_name')
    # Strip markup by keeping only the text between '>' and '<'.
    pipleitem['content'] = helper.list2str(re.findall('>(.*?)<', content))
    # Keep only non-empty image URLs.
    pipleitem['image_urls'] = helper.list2str([i['url'] for i in images if i['url']])
    pipleitem['video_urls'] = None
    pipleitem['share'] = response.meta['share_count']
    pipleitem['like'] = data.get('likes_count')
    pipleitem['dislike'] = None
    pipleitem['views'] = data.get('read_count')
    pipleitem['comment'] = data.get('comment_count')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a contId-based news page by regex-scanning the raw HTML."""
    date = re.findall(r'(\d{4}-\d+-\d+[\s\d:]+)', response.text)
    if not date:
        return
    try:
        if helper.compare_time(date[0], self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    # Renamed from `id` to avoid shadowing the builtin.
    cont_id = re.findall(r'"contId":"(\d+)"', response.text)
    editor = re.findall(r'责任编辑:(.*?)<', response.text)
    like = re.findall(r'<em></em>\s*?(\d*)</a>', response.text)
    pipleitem['date'] = helper.formatTime(date[0])
    pipleitem['id'] = cont_id[0] if cont_id else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = response.css('.gg-gmcont a::text').extract_first()
    pipleitem['editor'] = editor[0] if editor else None
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@class="news_content"])').extract())
    pipleitem['image_urls'] = helper.list2str(response.css('.news_content img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(response.css('.news_content video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = like[0] if like else None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = response.meta.get('commentNum', 0)
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a toutiao-style page whose metadata is embedded as JSON in HTML."""
    # BUG FIX: the original did findall(...)[0] first, so its own
    # None/empty check was unreachable and a missing timestamp raised
    # IndexError; check the match list before indexing.
    dates = re.findall(r'"timestamp":"(\d*)"', response.text)
    if not dates or not dates[0]:
        return
    date = dates[0]
    try:
        if helper.compare_time(helper.get_makedtime('%Y-%m-%d', date), self.limittime) < 0:
            return
    except Exception:  # malformed timestamps are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S', date)
    pipleitem['id'] = re.findall(r'\d+/\d+', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = re.findall('<title>(.*?)</title>', response.text)[0]
    pipleitem['source'] = re.findall(r'"befrom":"(.*?)"', response.text)[0]
    pipleitem['editor'] = None
    content = re.findall('"content":"(.*)"}', response.text)[0]
    if content is None or len(content) < 10:
        content = None
    # BUG FIX: the original passed content to re.findall even after
    # setting it to None, which raises TypeError; branch on it instead.
    pipleitem['content'] = (helper.list2str(re.findall('>(.*?)<', content))
                            if content is not None else None)
    # e.g. 'https://imgcdn.toutiaoyule.com/20190610/...jpg'
    pipleitem['image_urls'] = helper.list2str(
        [src.replace('\\"', '').replace('//', 'https://')
         for src in re.findall('<img src=(.*?)>', response.text)])
    pipleitem['video_urls'] = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def video_content_parse(self, response):
    """Parse a dftoutiao video page; like/dislike counters come from the
    site's thumpupapi via synchronous side requests.
    """
    date = response.meta['date']
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:  # malformed dates are treated as out of range
        return
    pipleitem = CctvOpinionmonitor2Item()
    pipleitem['date'] = date
    pipleitem['id'] = re.findall(r'/(\d+)', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.xpath('//title/text()').extract_first()
    pipleitem['source'] = response.meta['source']
    pipleitem['editor'] = None
    pipleitem['content'] = None
    pipleitem['image_urls'] = helper.list2str(response.css('img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(response.css('video::attr(src)').extract())
    pipleitem['share'] = None
    # Timeouts added so a stalled endpoint cannot hang the callback forever.
    tmp = 'https://videoback.dftoutiao.com/thumpupapi/get_data_by_uk?uk={uk}&type={type}'
    like = requests.get(tmp.format(uk=pipleitem['id'], type='thumpup'), timeout=10).text
    dislike = requests.get(tmp.format(uk=pipleitem['id'], type='stepon'), timeout=10).text
    pipleitem['like'] = re.findall(r'"data":(\d*)', like)[0]
    pipleitem['dislike'] = re.findall(r'"data":(\d*)', dislike)[0]
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem