def parse_owner(self, response):
    """Build a UserItem from an owner (channel) page.

    Reads the owner id and encoded show id from the scripts in the page
    head, then fills in whatever profile fields the page exposes
    (user name, play count, fan count, intro text, video count).

    Returns a one-element list holding the populated item, or None when
    the page carries no encoded show id or when any exception is raised
    (the traceback is logged at ERROR level).
    """
    try:
        page_url = response.request.url
        logging.log(logging.INFO, "owner:%s" % page_url)

        profile = UserItem()

        # Owner identifiers are embedded in the <head> scripts.
        head_scripts = response.xpath('/html/head/script')
        owner_ids = head_scripts.re('ownerId = \"(\d+)\"')
        show_ids = head_scripts.re('ownerEncodeid = \'(.+)\'')
        if owner_ids:
            profile['owner_id'] = owner_ids[0]
        if not show_ids:
            # Without the encoded id nothing is emitted for this page.
            return None
        profile['show_id'] = show_ids[0]

        # Profile block: name and counters.
        profile_box = response.xpath('//div[@class="profile"]')
        if profile_box:
            names = profile_box.xpath('./div[@class="info"]/div[@class="username"]/a[1]/@title').extract()
            play_counts = profile_box.xpath('./div[@class="state"]/ul/li[@class="vnum"]/em/text()').extract()
            fan_counts = profile_box.xpath('./div[@class="state"]/ul/li[@class="snum"]/em/text()').extract()
            if names:
                profile['user_name'] = names[0]
            if play_counts:
                normalized = Util.normalize_vp(play_counts[0])
                profile['played'] = Util.normalize_played(normalized)
            if fan_counts:
                profile['fans'] = Util.normalize_vp(fan_counts[0])

        # Free-text introduction; all matched text nodes are concatenated.
        yk_profile = response.xpath('//div[@class="YK-profile"]')
        if yk_profile:
            intro_parts = yk_profile.xpath('./div[@class="userintro"]/div[@class="desc"]/p[2]/text()').extract()
            if intro_parts:
                profile['intro'] = ''.join(intro_parts)

        # Video count is the number in parentheses in the home title link;
        # it stays None when the section or the number is missing.
        yk_home = response.xpath('//div[@class="YK-home"]')
        count_matches = []
        if yk_home:
            count_matches = yk_home.xpath('div[1]/div/div/div/div[@class="title"]/span/a/text()').re(u'\((\d+)\)')
        profile['vcount'] = count_matches[0] if count_matches else None

        profile['spider_id'] = self.spider_id
        profile['site_id'] = self.site_id
        profile['url'] = page_url

        return [profile]
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_page(self, response):
    """Turn one listing page into follow-up requests for each entry.

    For every ``<li>`` under ``div.mod_cont`` that carries a link, a
    Request for ``self.parse_episode`` is created. The fields scraped from
    the listing (title, thumbnail, duration, play count) are merged with
    the ``cat`` value received in this request's meta and passed on under
    the ``'order'`` meta key.

    Returns the list of requests (possibly empty), or None when an
    exception is raised (the traceback is logged at ERROR level).
    """
    try:
        logging.log(logging.INFO, 'page:%s' % response.request.url)
        # Context handed over by the request that scheduled this page.
        cat = response.request.meta['cat']

        items = []
        for entry in response.xpath('//div[@class="mod_cont"]/ul/li'):
            urls = entry.xpath('./h6/a/@href').extract()
            if not urls:
                # Nothing to follow, so skip before doing any normalization.
                continue

            # NOTE(review): '@text' selects an attribute named "text"; a
            # sibling spider uses 'text()' for the title. Left unchanged
            # because the target markup is not visible here -- confirm.
            titles = entry.xpath('./h6/a/@text').extract()
            thumb_urls = entry.xpath('./a/img/@src').extract()
            durations = entry.xpath(
                './a/div/span[@class="mod_version"]/text()').extract()
            playeds = entry.xpath('./p/span/text()').extract()

            d = {
                'title': titles[0] if titles else None,
                'thumb_url': thumb_urls[0] if thumb_urls else None,
                'duration': (Util.get_qq_duration(durations[0])
                             if durations else None),
                'played': (Util.normalize_played(Util.normalize_vp(playeds[0]))
                           if playeds else None),
            }
            # The original merged an undefined name `order` here, which
            # raised NameError and discarded the whole page. The only
            # context read from meta is `cat`, so that is what gets merged.
            # NOTE(review): assumes meta['cat'] is a dict -- confirm
            # against the code that schedules this callback.
            d.update(cat)

            r = Request(url=urls[0], callback=self.parse_episode)
            r.meta.update({'order': d})
            items.append(r)
        return items
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_vpaction(self, response):
    """Record the play count on the carried item and request play length.

    The item travels in ``response.request.meta['item']``. When the page
    exposes the "sum" entry of the player info list, its text (with the
    label prefix stripped) is normalized and stored as ``item['played']``.

    Returns a Request for ``self.playlength_url + show_id`` carrying the
    item, or None when an exception is raised (the traceback is logged).
    """
    try:
        logging.log(logging.INFO, 'vpaction:%s' % response.request.url)

        episode = response.request.meta['item']

        sum_texts = response.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
        if sum_texts:
            # Strip the label so only the count text is normalized.
            raw_count = sum_texts[0].replace('总播放:', '')
            episode['played'] = Util.normalize_played(Util.normalize_vp(raw_count))

        show_id = episode['show_id']
        return Request(
            url=self.playlength_url + show_id,
            callback=self.parse_playlength,
            meta={'item': episode},
        )
    except Exception:
        log.msg(traceback.format_exc(), level=log.ERROR)
def vpaction_parse(self, response):
    """Record the play count on the episode item and queue the next request.

    The episode item travels in ``response.request.meta['episode_item']``.
    When the video detail list contains a total-plays entry, its value is
    passed through ``Util.normalize_vp`` and stored as ``played``. If the
    item has a non-empty ``show_id``, a Request for
    ``self.playlength_url + show_id`` (callback ``self.playlength_parse``)
    is queued with the item attached.

    Returns the list of queued requests. The list is empty when there is
    no ``show_id`` or when an exception was raised (the traceback is
    logged at ERROR level).
    """
    items = []
    try:
        episode_item = response.request.meta['episode_item']

        vp = response.xpath('//div[@id="videodetailInfo"]/ul/li').re(
            u'<label>总播放数:</label><span.*>(.+)</span>')
        if vp:
            episode_item['played'] = Util.normalize_vp(vp[0])

        show_id = episode_item['show_id']
        if show_id:
            items.append(
                Request(url=self.playlength_url + show_id,
                        callback=self.playlength_parse,
                        meta={'episode_item': episode_item}))
    except Exception:
        # `except Exception, e` was Python-2-only syntax; this form is
        # accepted by both Python 2.6+ and Python 3.
        log.msg(traceback.format_exc(), level=log.ERROR)
    # The original built `items` but never returned it, so the follow-up
    # request was silently dropped.
    return items
def parse_vpaction(self, response):
    """Record the play count on the carried item and request play length.

    The item travels in ``response.request.meta['item']``. When the video
    detail list contains a total-plays entry, the captured value is passed
    through ``Util.normalize_vp`` and stored as ``item['played']``.

    Returns a Request for ``self.playlength_url + show_id`` carrying the
    item, or None when an exception is raised (the traceback is logged at
    ERROR level).
    """
    try:
        episode = response.request.meta['item']

        detail_rows = response.xpath('//div[@id="videodetailInfo"]/ul/li')
        play_matches = detail_rows.re(u'<label>总播放数:</label><span.*>(.+)</span>')
        if play_matches:
            episode['played'] = Util.normalize_vp(play_matches[0])

        show_id = episode['show_id']
        return Request(
            url=self.playlength_url + show_id,
            callback=self.parse_playlength,
            meta={'item': episode},
        )
    except Exception:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_page(self, response):
    """Turn one video-list page into follow-up requests for each entry.

    Every ``li.list_item`` under ``ul#videolst_cont`` is scraped for its
    link, title, thumbnail, duration, play count and upload time. Entries
    that have a link yield a Request for ``self.parse_episode`` whose
    ``'order'`` meta is the scraped fields overlaid with the ``'order'``
    dict received by this request.

    Returns the list of requests (possibly empty), or None when an
    exception is raised (the traceback is logged at ERROR level).
    """
    try:
        logging.log(logging.INFO, 'page:%s' % response.request.url)
        inherited_order = response.request.meta['order']

        requests = []
        rows = response.xpath(
            '//ul[@id="videolst_cont"]/li[@class="list_item"]')
        for row in rows:
            links = row.xpath('./strong/a/@href').extract()
            title_texts = row.xpath('./strong/a/text()').extract()
            thumb_srcs = row.xpath('./a/img/@src').extract()
            duration_texts = row.xpath('./a/span/em/text()').extract()
            play_texts = row.xpath(
                './div/span[@class="figure_info_play"]/span/text()'
            ).extract()
            time_texts = row.xpath(
                './div/span[@class="figure_info_time"]/text()').extract()

            # Each field defaults to None when the listing omits it.
            title = None
            if title_texts:
                title = title_texts[0]
            thumb_url = None
            if thumb_srcs:
                thumb_url = thumb_srcs[0]
            duration = None
            if duration_texts:
                duration = Util.get_qq_duration(duration_texts[0])
            played = None
            if play_texts:
                played = Util.normalize_played(
                    Util.normalize_vp(play_texts[0]))
            upload_time = None
            if time_texts:
                upload_time = Util.get_qq_upload_time(time_texts[0])

            if not links:
                continue

            follow_up = Request(url=links[0], callback=self.parse_episode)
            episode_order = {
                'title': title,
                'thumb_url': thumb_url,
                'duration': duration,
                'played': played,
                'upload_time': upload_time
            }
            # Values inherited from the parent request win over scraped ones.
            episode_order.update(inherited_order)
            follow_up.meta.update({'order': episode_order})
            requests.append(follow_up)
        return requests
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_vpaction(self, response):
    """Record the play count on the carried item and request play length.

    The item travels in ``response.request.meta['item']``. The response is
    wrapped in a Selector; when the "sum" entry of the player info list is
    present, its text (with the label prefix stripped) is normalized and
    stored as ``item['played']``.

    Returns a Request for ``self.playlength_url + show_id`` carrying the
    item, or None when an exception is raised (the traceback is logged at
    ERROR level).
    """
    try:
        episode = response.request.meta['item']

        selector = Selector(response)
        sum_texts = selector.xpath(
            '//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
        if sum_texts:
            # Strip the label so only the count text is normalized.
            count_text = sum_texts[0].replace('总播放:', '')
            episode['played'] = Util.normalize_played(
                Util.normalize_vp(count_text))

        show_id = episode['show_id']
        return Request(
            url=self.playlength_url + show_id,
            callback=self.parse_playlength,
            meta={'item': episode},
        )
    except Exception:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse(self, response):
    """Build a UserItem from a channel page and schedule its video list.

    Reads the owner id and encoded show id from the scripts in the page
    head, fills in the profile fields the page exposes (user name, play
    count, fan count, intro text, video count), and queues a request for
    the ``videos`` sub-page handled by ``self.parse_video_page``.

    Returns ``[user_item, videos_request]``, or None when the page carries
    no encoded show id or when any exception is raised (the traceback is
    logged at ERROR level).
    """
    try:
        page_url = response.request.url
        log.msg(page_url, level=log.INFO)
        cust_para = response.request.meta['cust_para']

        profile = UserItem()

        # Owner identifiers are embedded in the <head> scripts.
        head_scripts = response.xpath('/html/head/script')
        owner_ids = head_scripts.re('ownerId = \"(\d+)\"')
        show_ids = head_scripts.re('ownerEncodeid = \'(.+)\'')
        if owner_ids:
            profile['owner_id'] = owner_ids[0]
        if not show_ids:
            # Without the encoded id nothing is emitted for this page.
            return None
        profile['show_id'] = show_ids[0]

        # Profile block: name and counters.
        profile_box = response.xpath('//div[@class="profile"]')
        if profile_box:
            names = profile_box.xpath(
                './div[@class="info"]/div[@class="username"]/a[1]/@title'
            ).extract()
            play_counts = profile_box.xpath(
                './div[@class="state"]/ul/li[@class="vnum"]/em/text()'
            ).extract()
            fan_counts = profile_box.xpath(
                './div[@class="state"]/ul/li[@class="snum"]/em/text()'
            ).extract()
            if names:
                profile['user_name'] = names[0]
            if play_counts:
                profile['played'] = Util.normalize_vp(play_counts[0])
            if fan_counts:
                profile['fans'] = Util.normalize_vp(fan_counts[0])

        # Free-text introduction; all matched text nodes are concatenated.
        yk_profile = response.xpath('//div[@class="YK-profile"]')
        if yk_profile:
            intro_parts = yk_profile.xpath(
                './div[@class="userintro"]/div[@class="desc"]/p[2]/text()'
            ).extract()
            if intro_parts:
                profile['intro'] = ''.join(intro_parts)

        # Video count is the number in parentheses in the home title link;
        # it falls back to the string '0' when it cannot be found.
        yk_home = response.xpath('//div[@class="YK-home"]')
        count_matches = []
        if yk_home:
            count_matches = yk_home.xpath(
                'div[1]/div/div/div/div[@class="title"]/span/a/text()').re(
                    u'\((\d+)\)')
        profile['vcount'] = count_matches[0] if count_matches else '0'

        profile['spider_id'] = self.spider_id
        profile['site_id'] = self.site_id

        # Follow up with the first page of the channel's video list.
        videos_request = Request(
            url=page_url + "videos",
            callback=self.parse_video_page,
            meta={
                'page': 1,
                'cust_para': cust_para
            })
        return [profile, videos_request]
    except Exception:
        log.msg(traceback.format_exc(), level=log.ERROR)