def video_about_parse(self, response):
    items = []
    try:
        show_id = response.xpath('//meta[@itemprop="channelId"]/@content').extract()
        user_name = response.xpath('//span[@class="qualified-channel-title-text"]/a/text()').extract()
        fans = response.xpath('//ul[@class="about-stats"]/li').re(
            re.compile(r'<li.*>.*<b>([\d|,]*)</b>.*subscribers.*</li>', re.S))
        played = response.xpath('//ul[@class="about-stats"]/li').re(
            re.compile(r'<li.*>.*<b>([\d|,]*)</b>.*views.*</li>', re.S))
        intro = response.xpath(
            '//div[@class="about-description branded-page-box-padding"]/descendant-or-self::*/text()'
        ).extract()

        if show_id:
            user_item = UserItem()
            user_item['show_id'] = show_id[0]
            if user_name:
                user_item['user_name'] = user_name[0]
            if fans:
                user_item['fans'] = Util.normalize_played(fans[0])
            if played:
                user_item['played'] = Util.normalize_played(played[0])
            if intro:
                user_item['intro'] = "".join(intro).strip()
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            # the crawled URL is the channel's /about page; strip the suffix to store the channel URL
            user_item['url'] = response.request.url[:-len('/about')]
            items.append(user_item)
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
    # return the collected items so the pipeline actually receives them
    return items
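# Util.normalize_played is referenced throughout this section but not shown here.
# The sketch below is only an assumption of what it does (strip thousands separators
# and trailing text such as "views" from a scraped count), not the project's
# actual implementation.
import re

def normalize_played(played):
    # Hypothetical stand-in for Util.normalize_played: keep digits only,
    # so u'1,234,567 views' -> '1234567'; fall back to '0' when nothing matches.
    digits = re.sub(r'\D', '', played)
    return digits if digits else '0'

# Example (assumed behaviour): normalize_played(u'12,345 subscribers') == '12345'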
def parse(self, response):
    try:
        logging.log(logging.INFO, "parse:%s" % response.request.url)
        audit = response.request.meta['audit']
        cat_name = response.request.meta['cat_name']
        kw_id = response.request.meta['kw_id']
        priority = response.request.meta['priority']
        items = []

        # video items
        yk_v = response.xpath('//div[@class="sk-vlist clearfix"]/div[@class="v"]')
        for v in yk_v:
            url = v.xpath('./div[@class="v-meta va"]/div[@class="v-meta-title"]/a/@href').extract()
            thumb_urls = v.xpath('./div[@class="v-link"]/a/@href').extract()
            if thumb_urls:
                thumb_url = thumb_urls[0]
                if thumb_url == 'http://g1.ykimg.com/':
                    thumb_url = None
            else:
                thumb_url = None
            pl = v.xpath(
                './div[@class="v-meta va"]/div[@class="v-meta-entry"]/div/label[text()="%s"]/../span/text()'
                % u'播放: ').extract()
            if pl:
                pld = Util.normalize_played(pl[0])
                played = int(pld)
            else:
                played = None
            if url:
                items.append(Request(url=url[0],
                                     callback=self.parse_episode,
                                     meta={'audit': audit,
                                           'thumb_url': thumb_url,
                                           'played': played,
                                           'cat_name': cat_name,
                                           'kw_id': kw_id,
                                           'priority': priority}))
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
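# parse() above expects 'audit', 'cat_name', 'kw_id' and 'priority' in request.meta.
# A minimal sketch of how a seed request could be wired into it; the URL and the
# meta values are placeholders, not taken from the project.
from scrapy.http import Request

def build_seed_request(spider, list_url):
    # Hypothetical example: hand the list page to parse() with the meta keys it reads.
    return Request(url=list_url,
                   callback=spider.parse,
                   meta={'audit': 1, 'cat_name': u'搞笑', 'kw_id': 1, 'priority': 0})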
def parse_page(self, response):
    try:
        logging.log(logging.INFO, 'page:%s' % response.request.url)
        cat = response.request.meta['cat']
        items = []
        qq_v = response.xpath('//div[@class="mod_cont"]/ul/li')
        for v in qq_v:
            urls = v.xpath('./h6/a/@href').extract()
            titles = v.xpath('./h6/a/text()').extract()  # was '@text', which never matches
            thumb_urls = v.xpath('./a/img/@src').extract()
            durations = v.xpath('./a/div/span[@class="mod_version"]/text()').extract()
            playeds = v.xpath('./p/span/text()').extract()

            title = titles[0] if titles else None
            thumb_url = thumb_urls[0] if thumb_urls else None
            duration = Util.get_qq_duration(durations[0]) if durations else None
            played = Util.normalize_played(Util.normalize_vp(playeds[0])) if playeds else None

            if urls:
                r = Request(url=urls[0], callback=self.parse_episode)
                d = {
                    'title': title,
                    'thumb_url': thumb_url,
                    'duration': duration,
                    'played': played,
                }
                # 'order' was undefined in the original; carry the category pulled
                # from request.meta along with the scraped fields instead
                d.update({'cat': cat})
                r.meta.update({'order': d})
                items.append(r)
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
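# Util.get_qq_duration is assumed to convert the "mm:ss" / "hh:mm:ss" text scraped
# from the list page into a number of seconds. A sketch of that assumption, not the
# project's implementation:

def get_qq_duration(text):
    # Hypothetical: '03:45' -> 225, '1:02:03' -> 3723.
    seconds = 0
    for part in text.strip().split(':'):
        seconds = seconds * 60 + int(part)
    return seconds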
def parse_owner(self, response):
    try:
        logging.log(logging.INFO, "owner:%s" % response.request.url)
        items = []
        user_item = UserItem()

        # owner id
        script = response.xpath('/html/head/script')
        owner_id = script.re(r'ownerId = "(\d+)"')
        show_id = script.re(r"ownerEncodeid = '(.+)'")
        if owner_id:
            user_item['owner_id'] = owner_id[0]
        if show_id:
            user_item['show_id'] = show_id[0]
        else:
            return

        # user profile
        up = response.xpath('//div[@class="profile"]')
        if up:
            user_name = up.xpath('./div[@class="info"]/div[@class="username"]/a[1]/@title').extract()
            played = up.xpath('./div[@class="state"]/ul/li[@class="vnum"]/em/text()').extract()
            fans = up.xpath('./div[@class="state"]/ul/li[@class="snum"]/em/text()').extract()
            if user_name:
                user_item['user_name'] = user_name[0]
            if played:
                #user_item['played'] = Util.normalize_vp(played[0])
                user_item['played'] = Util.normalize_played(Util.normalize_vp(played[0]))
            if fans:
                user_item['fans'] = Util.normalize_vp(fans[0])

        # youku profile
        yp = response.xpath('//div[@class="YK-profile"]')
        if yp:
            intro = yp.xpath('./div[@class="userintro"]/div[@class="desc"]/p[2]/text()').extract()
            if intro:
                user_item['intro'] = ''.join(intro)

        # video count
        yh = response.xpath('//div[@class="YK-home"]')
        vcount = None
        if yh:
            video_count = yh.xpath(
                'div[1]/div/div/div/div[@class="title"]/span/a/text()').re(u'\((\d+)\)')
            if video_count:
                vcount = video_count[0]
        user_item['vcount'] = vcount

        user_item['spider_id'] = self.spider_id
        user_item['site_id'] = self.site_id
        user_item['url'] = response.request.url
        items.append(user_item)
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
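# Util.normalize_vp is assumed to clean Youku-style view/fan counts such as u'1,234'
# or u'123.4万' before further normalization. A sketch of that assumption (the
# 万/亿 suffix handling is guessed, not the project's implementation):

def normalize_vp(vp):
    # Hypothetical stand-in for Util.normalize_vp.
    vp = vp.strip().replace(',', '')
    if vp.endswith(u'亿'):   # 亿 = 10^8
        return str(int(round(float(vp[:-1]) * 100000000)))
    if vp.endswith(u'万'):   # 万 = 10^4
        return str(int(round(float(vp[:-1]) * 10000)))
    return vp

# Example (assumed behaviour): normalize_vp(u'123.4万') == '1234000'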
def parse_vpaction(self, response):
    try:
        logging.log(logging.INFO, 'vpaction:%s' % response.request.url)
        item = response.request.meta['item']
        vp = response.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
        if vp:
            item['played'] = Util.normalize_played(
                Util.normalize_vp(vp[0].replace('总播放:', '')))
        show_id = item['show_id']
        item = Request(url=self.playlength_url + show_id,
                       callback=self.parse_playlength,
                       meta={'item': item})
        return item
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_media(self, response, **kwargs):
    items = []
    try:
        title = response.xpath('//div[@id="crumbsBar"]/div/div[@class="left"]/h2/text()').extract()
        tag = response.xpath('//meta[@name="keywords"]/@content').extract()
        #show_id = Util.get_sohu_showid(response.request.url)
        thumb = response.xpath('//script').re(',sCover: \'(.*)\'')
        upload = response.xpath('//script').re(',uploadTime: \'(.*)\'')
        description = response.xpath('//p[@class="rel cfix"]/@title').extract()
        played = response.xpath('//span[@class="vbtn vbtn-play"]/em/i/text()').extract()
        # was a bare debug print; log it instead
        log.msg('played:%s, upload:%s' % (played, upload), level=log.DEBUG)
        video_id = response.xpath('//script').re('vid = \'(\d+)\'')

        ep_item = EpisodeItem()
        if video_id:
            ep_item['video_id'] = video_id[0]
            ep_item['show_id'] = video_id[0]
        if title:
            ep_item['title'] = title[0]
        if tag:
            ep_item['tag'] = tag[0].strip().replace(',', '|')
        if upload:
            ep_item['upload_time'] = upload[0] + ":00"
        if description:
            ep_item['description'] = description[0].strip()
        if thumb:
            ep_item['thumb_url'] = thumb[0]
        if played:
            ep_item['played'] = Util.normalize_played(played[0])
        ep_item['category'] = u"搞笑"
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        items.append(ep_item)
        log.msg("spider success, title:%s" % (ep_item['title']), level=log.INFO)
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
    finally:
        return items
def parse_page(self, response):
    try:
        log.msg('%s: %s' % (response.request.url, response.request.meta['page']))
        cat_id = response.request.meta['cat_id']
        page = response.request.meta['page']
        if int(page) > int(self.max_search_page):
            return

        items = []
        sel = Selector(response)

        # video items
        yk_v = sel.xpath('//div[@class="yk-col4"]')
        for v in yk_v:
            url = v.xpath('./div/div[@class="v-link"]/a/@href').extract()
            pl = v.xpath(
                './div/div[@class="v-meta va"]/div[@class="v-meta-entry"]/span/text()').extract()
            if url and pl:
                pld = Util.normalize_played(pl[0])
                if int(pld) >= int(self.hottest_played_threshold):
                    items.append(Request(url=url[0],
                                         callback=self.parse_episode,
                                         meta={'cat_id': cat_id}))
                #else:
                #    log.msg('discard: %s' % url[0])

        # pages
        next_page = sel.xpath('//div[@class="yk-pager"]/ul/li[@class="next"]/a/@href').extract()
        if next_page:
            items.append(Request(url=self.url_prefix + next_page[0],
                                 callback=self.parse_page,
                                 meta={'page': int(page) + 1, 'cat_id': cat_id}))
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_page(self, response):
    try:
        logging.log(logging.INFO, 'page:%s' % response.request.url)
        order = response.request.meta['order']
        items = []
        qq_v = response.xpath('//ul[@id="videolst_cont"]/li[@class="list_item"]')
        for v in qq_v:
            urls = v.xpath('./strong/a/@href').extract()
            titles = v.xpath('./strong/a/text()').extract()
            thumb_urls = v.xpath('./a/img/@src').extract()
            durations = v.xpath('./a/span/em/text()').extract()
            playeds = v.xpath('./div/span[@class="figure_info_play"]/span/text()').extract()
            upload_times = v.xpath('./div/span[@class="figure_info_time"]/text()').extract()

            title = titles[0] if titles else None
            thumb_url = thumb_urls[0] if thumb_urls else None
            duration = Util.get_qq_duration(durations[0]) if durations else None
            played = Util.normalize_played(Util.normalize_vp(playeds[0])) if playeds else None
            upload_time = Util.get_qq_upload_time(upload_times[0]) if upload_times else None

            if urls:
                r = Request(url=urls[0], callback=self.parse_episode)
                d = {
                    'title': title,
                    'thumb_url': thumb_url,
                    'duration': duration,
                    'played': played,
                    'upload_time': upload_time,
                }
                d.update(order)
                r.meta.update({'order': d})
                items.append(r)
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
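# Util.get_qq_upload_time is assumed to turn the list-page date text into a storable
# timestamp string. A sketch of that assumption, not the project's implementation:

def get_qq_upload_time(text):
    # Hypothetical: pad a bare date such as '2015-06-01' into a full timestamp string.
    text = text.strip()
    if len(text) == len('2015-06-01'):
        return text + ' 00:00:00'
    return text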
def parse_vpaction(self, response):
    try:
        #log.msg('%s' % response.request.url)
        item = response.request.meta['item']
        sel = Selector(response)
        #vp = sel.xpath('//div[@id="videodetailInfo"]/ul/li').re(u'<label>总播放数:</label><span.*>(.+)</span>')
        #vp = sel.xpath('//div[@class="info_num"]/span/text()').extract()
        vp = sel.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
        if vp:
            item['played'] = Util.normalize_played(
                Util.normalize_vp(vp[0].replace('总播放:', '')))
        show_id = item['show_id']
        item = Request(url=self.playlength_url + show_id,
                       callback=self.parse_playlength,
                       meta={'item': item})
        return item
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('%s' % response.request.url)
        thumb_url = response.request.meta['thumb_url']
        upload_time = response.request.meta['upload_time']
        category = response.request.meta['category']
        kw_id = response.request.meta.get('kw_id', 1)
        items = []

        # owner
        owner = response.xpath('//div[@class="yt-user-info"]/a/@data-ytid').extract()
        owner_url = response.xpath('//div[@class="yt-user-info"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = owner[0]
            items.append(Request(url=self.url_prefix + owner_url[0] + "/about",
                                 callback=self.parse_about))

        # video info
        title = response.xpath('//span[@id="eow-title"]/text()').extract()
        #category = response.xpath('//p[@id="eow-category"]/a/text()').extract()
        tag = response.xpath('./head/meta[@name="keywords"]/@content').extract()
        #upload = response.xpath('//p[@id="watch-uploader-info"]/strong/text()').extract()
        description = response.xpath('//p[@id="eow-description"]/descendant-or-self::*/text()').extract()
        played = response.xpath('//div[@class="watch-view-count"]/text()').extract()

        # other info
        sts = re.search(r'\"sts\": ?(\d+)', response.body)

        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = title[0].strip()
        if tag:
            ep_item['tag'] = tag[0].replace(', ', '|')
        if category:
            #ep_item['category'] = category[0].replace('&', '|')
            ep_item['category'] = category
        # (disabled) publish time taken from the uploader info:
        #if upload:
        #    ptime = Util.get_youtube_publish(upload[0])
        #    if ptime:
        #        ep_item['upload_time'] = ptime
        if upload_time:
            t = Util.get_youtube_upload_time(upload_time[0].strip())
            if t:
                ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
        if description:
            ep_item['description'] = "\n".join(description)
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0]
        if played:
            pld = Util.normalize_played(played[0])
            ep_item['played'] = pld if pld else '0'
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = Util.normalize_youtube_url(response.request.url)
        ep_item['kw_id'] = kw_id

        query = Util.encode({'video_id': ep_item['show_id'],
                             'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                             'sts': sts.groups()[0] if sts else ''})
        items.append(Request(url='http://www.youtube.com/get_video_info?' + query,
                             callback=self.parse_other_info,
                             meta={'item': ep_item}))
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
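# Util.encode is assumed to URL-encode the get_video_info parameters. A sketch of
# that assumption using the Python 2 standard library (urllib.urlencode; in Python 3
# this lives in urllib.parse), not the project's implementation:
import urllib

def encode(params):
    # Hypothetical stand-in for Util.encode: build the query string for
    # get_video_info from a dict of ASCII parameters.
    return urllib.urlencode(params)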
def video_parse(self, response):
    items = []
    try:
        kw_id = response.request.meta.get('kw_id')
        pg_id = response.request.meta.get('pg_id')
        cat_id = response.request.meta.get('cat_id')
        subject_id = response.request.meta.get('subject_id')

        show_id = Util.get_youtube_showid(response.request.url)
        if not show_id:
            return items

        # owner
        owner = response.xpath('//div[@class="yt-user-info"]/a/@data-ytid').extract()
        owner_url = response.xpath('//div[@class="yt-user-info"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = owner[0]
            items.append(Request(url=self.youtube_url_prefix + owner_url[0] + "/about",
                                 callback=self.video_about_parse))

        # video info
        title = response.xpath('//span[@id="eow-title"]/text()').extract()
        tag = response.xpath('./head/meta[@name="keywords"]/@content').extract()
        description = response.xpath('//p[@id="eow-description"]/descendant-or-self::*/text()').extract()
        played = response.xpath('//div[@class="watch-view-count"]/text()').extract()
        category = response.xpath(
            '//div[@id="watch-description"]//ul[@class="content watch-info-tag-list"]/li/a/text()'
        ).extract()
        upload = response.xpath('//meta[@itemprop="datePublished"]/@content').extract()
        # thumbnail as exposed on the page itself
        thumb_url = response.xpath('//link[@itemprop="thumbnailUrl"]/@href').extract()

        # other info
        sts = re.search(r'\"sts\": ?(\d+)', response.body)

        ep_item = EpisodeItem()
        ep_item['show_id'] = show_id
        # the thumbnail could also be composed from the show_id plus one of
        # ['default', 'mqdefault', 'hqdefault', 'sddefault', 'maxresdefault']:
        #ep_item['thumb_url'] = self.thumb_url_prefix + '/' + show_id + '/default.jpg'
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = title[0].strip()
        if tag:
            ep_item['tag'] = tag[0].replace(', ', '|')
        if description:
            ep_item['description'] = "\n".join(description)
        if played:
            pld = Util.normalize_played(played[0])
            ep_item['played'] = pld if pld else '0'
        if kw_id:
            ep_item['kw_id'] = kw_id
        if pg_id:
            ep_item['pg_id'] = pg_id
        if cat_id:
            ep_item['cat_id'] = cat_id
        if subject_id:
            ep_item['subject_id'] = subject_id
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0]
        if category:
            # e.g. https://www.youtube.com/watch?v=lwy4qwaByVQ
            ep_item['category'] = category[0].strip().replace('&', '|')
        if upload:
            upload = upload[0].strip()
            struct_time = None
            # the publish date appears in either an English or a Chinese format;
            # strptime raises ValueError on a mismatch, so try them in turn
            try:
                struct_time = time.strptime(upload, '%b %d, %Y')
            except ValueError:
                try:
                    struct_time = time.strptime(upload, '%Y年%m月%d日')
                except ValueError:
                    pass
            if struct_time:
                ep_item['upload_time'] = time.strftime('%Y-%m-%d %H:%M:%S', struct_time)
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = Util.normalize_youtube_url(response.request.url)

        query = Util.encode({'video_id': ep_item['show_id'],
                             'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                             'sts': sts.groups()[0] if sts else ''})
        items.append(Request(url='http://www.youtube.com/get_video_info?' + query,
                             callback=self.video_other_info_parse,
                             meta={'item': ep_item}))
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
    # return the collected requests/items so Scrapy can process them
    return items
def parse_video_page(self, response):
    try:
        # the default listing order is "most recently published"
        logging.log(logging.INFO, 'page:%s' % response.request.url)
        audit = response.request.meta['audit']
        cat_name = response.request.meta['cat_name']
        show_id = response.request.meta['show_id']
        priority = response.request.meta['priority']
        page = 1
        items = []

        # get videos
        yk_v = response.xpath('//div[@class="yk-col4"]/div')
        for v in yk_v:
            url = v.xpath('./div[@class="v-link"]/a/@href').extract()
            thumb_urls = v.xpath('./div/div[@class="v-thumb"]/img/@src').extract()
            if thumb_urls:
                thumb_url = thumb_urls[0]
                if thumb_url == 'http://g1.ykimg.com/':
                    thumb_url = None
            else:
                thumb_url = None
            pl = v.xpath(
                './div[@class="v-meta va"]/div[@class="v-meta-entry"]/span[@class="v-num"]/text()'
            ).extract()
            if pl:
                pld = Util.normalize_played(pl[0])
                played = int(pld)
            else:
                played = None
            if url:
                items.append(Request(url=url[0],
                                     callback=self.parse_episode,
                                     meta={'audit': audit,
                                           'thumb_url': thumb_url,
                                           'played': played,
                                           'cat_name': cat_name,
                                           'show_id': show_id,
                                           'priority': priority}))

        # get last_str and ajax_url
        last_str = response.selector.re(u'\'last_str\':\'([^\']*)\'')
        ajax_url = response.selector.re(u'\'ajax_url\':\'([^\']*)\'')

        # request sibling pages
        if ajax_url:
            sibling_page = (3 * page - 1, 3 * page)
            for p in sibling_page:
                s = last_str[0] if last_str else u''
                para = {
                    "v_page": str(page),
                    "page_num": str(p),
                    "page_order": "1",
                    "last_str": s,
                }
                items.append(FormRequest(url=self.url_prefix + ajax_url[0] + "fun_ajaxload/",
                                         formdata=para,
                                         method='GET',
                                         callback=self.parse_video_page,
                                         meta={'audit': audit,
                                               'cat_name': cat_name,
                                               'show_id': show_id,
                                               'priority': priority}))

        # request next page (disabled)
        #next_page = response.xpath('//ul[@class="YK-pages"]/li[@class="next"]/a/@href').extract()
        #if next_page:
        #    items.append(Request(url=self.url_prefix + next_page[0],
        #                         callback=self.parse_video_page,
        #                         meta={'page': page + 1}))
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
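# FormRequest with method='GET' (as used above) serializes formdata into the URL's
# query string rather than the request body, which is how the sibling pages of the
# ajax listing are fetched. A standalone illustration with a placeholder URL:
from scrapy.http import FormRequest

def example_ajax_page_request():
    # Hypothetical URL; only the formdata-to-query-string behaviour is the point here.
    return FormRequest(url='http://i.youku.com/u/XXXX/videos/fun_ajaxload/',
                       formdata={'v_page': '1', 'page_num': '2',
                                 'page_order': '1', 'last_str': ''},
                       method='GET')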