def content_parse(self, response):
    """Parse a YouTube watch page into a YoutubeItem.

    Returns None when the page carries no publish date or the video is
    older than ``self.limittime``; otherwise returns a populated item.
    """
    date = response.xpath(
        '//span[@class="date style-scope ytd-video-secondary-info-renderer"]/text()'
    ).extract_first()
    if not date:
        return
    try:
        # Skip videos published before the crawl cutoff.
        if helper.compare_time(helper.formatTime(date), self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = YoutubeItem()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = re.findall(r'v=(\S*)', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = 'Youtube'
    pipleitem['editor'] = response.xpath(
        '//yt-formatted-string[@id="owner-name"]/a/text()').extract_first()
    pipleitem['content'] = helper.list2str(
        response.css('#description').xpath('string(.)').extract()).strip()
    views = response.xpath(
        '//span[@class="view-count style-scope yt-view-count-renderer"]/text()'
    ).extract_first()
    # extract_first() may return None; guard before re.sub()/len().
    pipleitem['views'] = re.sub(r'\D', '', views) if views else '0'
    pipleitem['image_urls'] = helper.list2str(
        response.css('img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(
        response.css('video::attr(src)').extract())
    pipleitem['share'] = None
    # Default both counters so the keys exist even when no aria-labels
    # were found (the old loop left them unset in that case).
    pipleitem['like'] = '0'
    pipleitem['dislike'] = '0'
    labels = response.xpath(
        '//yt-formatted-string[@id="text"]/@aria-label').extract()
    for label in labels:
        label = label.replace('No', '0')
        # Test "dislikes" first: re.search('likes', ...) also matches
        # "dislikes", which previously let the dislike label clobber the
        # like count (and each iteration overwrote both fields).
        if re.search('dislikes', label):
            pipleitem['dislike'] = re.sub(r'\D', '', label)
        elif re.search('likes', label):
            pipleitem['like'] = re.sub(r'\D', '', label)
    comment = response.xpath(
        '//h2[@id="count"]/yt-formatted-string/text()').extract_first()
    # comment may be None when the comment section has not rendered.
    pipleitem['comment'] = re.sub(r'\D', '', comment) if comment else 0
    # pipleitem['subscriber'] = response.meta['subscriber']
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Build an item for a CCTV detail page.

    Most fields are forwarded by the listing callback via
    ``response.meta``; only source/content/video come from this page.
    Returns None when the date is missing or older than ``self.limittime``.
    """
    date = response.meta['date']
    if not date:
        return
    try:
        # Drop articles older than the crawl cutoff.
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = response.meta['id']
    pipleitem['url'] = response.url
    pipleitem['title'] = response.meta['title']
    pipleitem['source'] = response.css(
        '#detail_Info_Tab_cout4_1 p:nth-last-child(2)::text'
    ).extract_first()
    pipleitem['editor'] = response.meta['editor']
    pipleitem['content'] = helper.list2str(
        response.xpath(
            'string(//div[@id="detail_infotab_cont_1"])').extract())
    pipleitem['image_urls'] = response.meta['pic']
    pipleitem['video_urls'] = helper.list2str(
        response.css('#videoPath::attr(value)').extract())
    # Engagement metrics are not exposed on this page.
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a 中日通 diary page into an item.

    Returns None when the publish date is missing or older than
    ``self.limittime``.
    """
    date = response.css('h3.subheader time::text').extract_first()
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date)
    # re.findall() always returns a list (never None), so an emptiness
    # check suffices; also avoid shadowing the builtin `id`.
    doc_ids = re.findall(r'diary/(.*)/', response.url)
    pipleitem['id'] = doc_ids[0] if doc_ids else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = '中日通'
    pipleitem['editor'] = None
    pipleitem['content'] = helper.list2str(
        response.css('.markdown').xpath('string(.)').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.css('.markdown img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(
        response.css('video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = response.css('.likes_count::text').extract_first()
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a drama-info JSON response into an item.

    Returns None when the payload lacks a usable update date or the
    date is older than ``self.limittime``.
    """
    jsonbd = json.loads(response.text)
    # dict.get() with a {} fallback fixes the old crash path: when
    # 'dramaInfo' was absent, info was None and `'updateDesc' in
    # info.keys()` raised AttributeError.
    info = jsonbd.get('dramaInfo') or {}
    raw_date = info.get('updateDesc')
    # re.findall(..., None) raises TypeError, so guard first.
    date = re.findall(r'[\d-]+', raw_date) if raw_date else []
    if not date:
        return
    try:
        if helper.compare_time(date[0], self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date[0])
    pipleitem['id'] = info.get('contId')
    pipleitem['url'] = response.url
    pipleitem['title'] = response.meta.get('name')
    pipleitem['source'] = info.get('type')
    pipleitem['editor'] = None
    pipleitem['content'] = info.get('description')
    pipleitem['image_urls'] = info.get('imageURL')
    pipleitem['video_urls'] = info.get('requestURL')
    # Engagement metrics are not present in this API payload.
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Build an item for a news detail page.

    Most fields are carried in ``response.meta`` from the listing
    request; only content and the audio URL come from this page.
    Returns None when the date is missing or older than ``self.limittime``.
    """
    meta = response.meta
    # dict.get() replaces the verbose `x['k'] if 'k' in x.keys()` pattern.
    date = meta.get('newsTime')
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = meta.get('newsId')
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = meta.get('newsResource')
    pipleitem['editor'] = None
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@class="m_details-con"])').extract())
    pipleitem['image_urls'] = helper.list2str(meta.get('picUrlList', []))
    pipleitem['video_urls'] = response.css('audio::attr(src)').extract_first()
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = meta.get('articleCount')
    pipleitem['comment'] = meta.get('commentNum')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def video_parse(self, response):
    """Parse a CNN video page into an item.

    Returns None when the pubdate meta tag is missing or the video is
    older than ``self.limittime``.
    """
    date = response.xpath(
        '//meta[@name="pubdate"]/@content').extract_first()
    if not date:
        return
    try:
        if helper.compare_time(helper.formatTime2(date.strip()),
                               self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date.strip())
    # Avoid shadowing the builtin `id`; re.findall() never returns None.
    doc_ids = re.findall(r'com/(.*)/', response.url)
    pipleitem['id'] = doc_ids[0] if doc_ids else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = 'CNN'
    pipleitem['editor'] = response.xpath(
        '//meta[@name="author"]/@content').extract_first()
    # The old selector '#[id~=js-video_description]::text' is not valid
    # CSS ('#' must be followed by an id name), so it could never match;
    # an attribute selector expresses the intended lookup.
    pipleitem['content'] = helper.list2str(
        response.css('[id~=js-video_description]::text').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.css('img::attr(src)').extract())
    # Video URLs live in an embedded JSON blob, not in <video> tags.
    pipleitem['video_urls'] = helper.list2str(
        re.findall(r'"videoUrl":\s*"(.*?)"', response.text))
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a People's Daily app JSON card and fetch its stats.

    The primary response is a JSON list of cards; view/comment counts
    come from a second stats API call (self.url), e.g.
    'http://apiapp.people.cn/apiv3.3.0/get_article_info.php?globalids=1061648&parents=0,0,0,0,0,0&juxian_liveid=0,0,0,0,0,0&juxian_companyid=0,0,0,0,0,0&deviceid=A000009114F247'
    """
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    cards = jsonbd[0]
    # dict.get() replaces the verbose `x['k'] if 'k' in x.keys()` pattern.
    date = cards.get('time')
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = cards.get('GlobalID')
    pipleitem['url'] = response.url
    pipleitem['title'] = cards.get('title')
    pipleitem['source'] = cards.get('docfrom')
    # 'autor' is the upstream API's spelling — do not "fix" the key.
    pipleitem['editor'] = cards.get('autor')
    pipleitem['content'] = cards.get('summary')
    pipleitem['image_urls'] = cards.get('photo')
    pipleitem['video_urls'] = cards.get('videourl')
    # NOTE(review): blocking requests.get inside a Scrapy callback with
    # no timeout — consider yielding a follow-up Request instead.
    html = requests.get(url=self.url.format(pipleitem['id']))
    bd = json.loads(html.text)
    result = bd.get('result') or []
    stats = result[0] if result else {}
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = stats.get('views')
    pipleitem['comment'] = stats.get('commentnum')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a CGTN article or live page into an item.

    Live events ('key' == 'live' in response.meta) fetch their video
    list from the CGTN mobile API. Returns None when the date is missing
    or older than ``self.limittime``.
    """
    date = response.css('span.date::text').extract_first()
    if not date:
        return
    try:
        if helper.compare_time(helper.formatTime(date.strip()),
                               self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date.strip())
    # Avoid shadowing the builtin `id`; re.findall() never returns None.
    doc_ids = re.findall(r'news/([a-z\d]*)', response.url)
    pipleitem['id'] = doc_ids[0] if doc_ids else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = 'CGTN'
    pipleitem['editor'] = response.xpath(
        '//div[@class="news-author news-text"]/text()').extract_first()
    content = helper.list2str(
        response.xpath('string(//div[@id="cmsMainContent"])').extract())
    # Fall back to the data-json attribute when the rendered text is too
    # short to be a real article body.
    pipleitem['content'] = content if len(content) > 10 else helper.list2str(
        response.css('#cmsMainContent::attr(data-json)').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.css('.cg-padding img::attr(src)').extract())
    # Renamed from `list` (shadowed the builtin); meta.get() fixes the
    # KeyError the old code raised when 'key' was absent from meta.
    video_urls = []
    if response.meta.get('key') == 'live':
        # NOTE(review): blocking requests call with no timeout inside a
        # Scrapy callback — consider a follow-up Request instead.
        html = requests.get(
            'https://mapi.cgtn.com/mobileapp/v2/live/event/info?id={}'.format(
                pipleitem['id'])).text
        for video in json.loads(html)['response']['videos']:
            video_urls.append(video['url'])
    pipleitem['video_urls'] = helper.list2str(video_urls)
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a card-group JSON response into an item.

    Drills into cardgroups[0].cards[0]; returns None when the structure
    is empty or the card's date is missing/older than ``self.limittime``.
    """
    jsonbd = json.loads(response.text)
    cardgroups = jsonbd.get('cardgroups')
    if not cardgroups:
        return
    group = cardgroups[0]
    # .get() guards the 'cards' key itself (the old code assumed it
    # was always present).
    inner = group.get('cards') or []
    cards = inner[0] if inner else {}
    date = helper.formatTime(cards['date']) if 'date' in cards else None
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # An unparseable date is treated as out of range.
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = date
    pipleitem['id'] = response.meta['id']
    pipleitem['url'] = response.url
    pipleitem['title'] = cards.get('title')
    pipleitem['source'] = cards.get('source')
    pipleitem['editor'] = None
    # Crude HTML strip: keep only the text between tags.
    pipleitem['content'] = helper.list2str(
        re.findall('>(.*?)<', cards.get('content', '')))
    pipleitem['image_urls'] = helper.list2str(cards.get('photoList', []))
    video = cards.get('video')
    # Guard both the 'video' key and its 'url' sub-key: the old
    # cards['video']['url'] raised KeyError/TypeError when 'video'
    # existed without 'url' or was not a dict.
    pipleitem['video_urls'] = video.get('url') if isinstance(video, dict) else None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem