def parse_episode_iqiyi(self, response):
    """Turn an iQiyi episode page into an EpisodeItem or a follow-up Request.

    ``pg_id``, ``cat_name``, ``site_id``, ``audit`` and ``priority`` are read
    from ``response.request.meta`` and copied onto the item; the item's
    category is taken from ``cat_name`` rather than scraped from the page.
    Title, tags and upload time are scraped with ordered fallback XPath
    queries, and each field is only set when something was found.

    Args:
        response: the downloaded episode page. It must expose ``request``
            (with ``url`` and ``meta``), ``selector`` and ``xpath``.

    Returns:
        A one-element list. When an ``albumId`` is found in the page source
        the element is a ``Request`` to ``self.playlength_url + albumid``
        with callback ``self.parse_playlength`` and the item carried in
        ``meta``; otherwise the element is the item itself.
        ``None`` when anything inside raised: the traceback is logged at
        ERROR level and the exception is not propagated.
    """

    def first_match(*queries):
        # Try each XPath in order and return the first non-empty extraction
        # (or the last, empty, result when none of them match).
        texts = []
        for query in queries:
            texts = response.xpath(query).extract()
            if texts:
                break
        return texts

    try:
        request = response.request
        # Log under this method's own name so entries can be attributed.
        logging.log(logging.INFO, "parse_episode_iqiyi:%s" % request.url)

        meta = request.meta
        pg_id = meta['pg_id']
        cat_name = meta['cat_name']
        site_id = meta['site_id']
        audit = meta['audit']
        priority = meta['priority']

        show_id = Util.get_iqiyi_showid(request.url)
        # The regex tolerates an optional space: "albumId:1" or "albumId: 1".
        albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

        # NOTE(review): the "mod-play-t**s" class name looks masked or
        # garbled; it is kept verbatim here -- confirm against the live page.
        title = first_match(
            '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()',
            '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()',
            '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()',
            '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()',
        )
        upload_time = first_match(
            '//div[@class="crumb_bar"]/span[3]/span/text()',
            '//div[@class="crumb_bar"]/span[2]/span/text()',
        )
        tag = first_match(
            '//span[@id="widget-videotag"]/descendant::*/text()',
            '//span[@class="mod-tags_item vl-block"]/descendant::*/text()',
            '//div[@class="crumb_bar"]/span[2]/a/text()',
        )

        ep_item = EpisodeItem()
        if title:
            # Title text may be split across nested nodes; glue the pieces.
            ep_item['title'] = "".join([t.strip() for t in title])
        if show_id:
            ep_item['show_id'] = show_id
        if tag:
            ep_item['tag'] = "|".join([t.strip() for t in tag])
        if upload_time:
            ep_item['upload_time'] = upload_time[0].strip()
        # Category comes from the request meta, not from the page breadcrumbs.
        ep_item['category'] = cat_name
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = site_id
        ep_item['pg_id'] = pg_id
        ep_item['audit'] = audit
        ep_item['url'] = request.url
        ep_item['format_id'] = self.format_id
        ep_item['priority'] = priority

        if albumid:
            # Hand the partially filled item to parse_playlength via meta.
            return [
                Request(url=self.playlength_url + albumid[0],
                        callback=self.parse_playlength,
                        meta={
                            'item': ep_item,
                            'albumid': albumid[0]
                        })
            ]
        return [ep_item]
    except Exception:
        # Best-effort callback: record the failure and yield nothing.
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode(self, response):
    """Scrape an episode page into an EpisodeItem or a follow-up Request.

    ``cat_id`` and ``thumb`` are read from ``response.request.meta``. Title,
    category, upload time and tags are looked up with ordered fallback XPath
    queries; each of those fields is only set on the item when found.

    Args:
        response: the downloaded episode page. It must expose ``request``
            (with ``url`` and ``meta``), ``selector`` and ``xpath``.

    Returns:
        A one-element list holding either a ``Request`` to
        ``self.playlength_url + albumid`` (callback ``self.parse_playlength``,
        item passed along in ``meta``) when the page contains an ``albumId``,
        or the item itself when it does not.
        ``None`` when anything inside raised; the traceback is then logged
        through ``log.msg`` at ERROR level.
    """

    def first_hit(*candidates):
        # Walk the XPath candidates in order, stopping at the first one that
        # extracts anything; an empty list comes back when none match.
        extracted = []
        for xpath_query in candidates:
            extracted = response.xpath(xpath_query).extract()
            if extracted:
                break
        return extracted

    try:
        log.msg('parse_episode %s' % response.request.url)
        request_meta = response.request.meta
        cat_id = request_meta['cat_id']
        thumb_url = request_meta['thumb']

        # show_id
        show_id = Util.get_iqiyi_showid(response.request.url)
        # A space may or may not follow the colon:
        # "albumId:326754200" or "albumId: 326754200".
        album_pattern = re.compile(r'albumId: ?(\d+)')
        albumid = response.selector.re(album_pattern)

        # video info
        # NOTE(review): the "mod-play-t**s" class name looks masked or
        # garbled; it is reproduced unchanged -- confirm against the page.
        title = first_hit(
            '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()',
            '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()',
            '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()',
            '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()',
        )
        category = first_hit(
            '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()',
            '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()',
            '//div[@class="crumb_bar"]/span[1]/a[2]/text()',
            '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()',
        )
        upload_time = first_hit(
            '//div[@class="crumb_bar"]/span[3]/span/text()',
            '//div[@class="crumb_bar"]/span[2]/span/text()',
        )
        tag = first_hit(
            '//span[@id="widget-videotag"]/descendant::*/text()',
            '//span[@class="mod-tags_item vl-block"]/descendant::*/text()',
            '//div[@class="crumb_bar"]/span[2]/a/text()',
        )

        ep_item = EpisodeItem()
        if title:
            # Title text can be spread over nested nodes; join the pieces.
            ep_item['title'] = "".join([piece.strip() for piece in title])
        if show_id:
            ep_item['show_id'] = show_id
        if tag:
            ep_item['tag'] = "|".join([piece.strip() for piece in tag])
        if upload_time:
            ep_item['upload_time'] = upload_time[0].strip()
        if category:
            ep_item['category'] = category[0].strip()
        if thumb_url:
            # Presumably meta['thumb'] is a list of strings; if it were a
            # plain string this would keep only its first character --
            # TODO confirm against the caller.
            ep_item['thumb_url'] = thumb_url[0].strip()
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        ep_item['cat_id'] = cat_id

        if not albumid:
            return [ep_item]
        # Pass the partially filled item on to parse_playlength via meta.
        album = albumid[0]
        follow_up = Request(url=self.playlength_url + album,
                            callback=self.parse_playlength,
                            meta={'item': ep_item, 'albumid': albumid[0]})
        return [follow_up]
    except Exception as e:
        # Best-effort callback: record the failure and yield nothing.
        log.msg(traceback.format_exc(), level=log.ERROR)