Example #1
0
 def parse_fontpage(self, response):
     """Scan the front-page zones and dispatch each video tile by category.

     The raw label of a zone (the ``title`` of its heading image) is mapped
     through ``self.font_table`` to a category.  Tiles of a '剧集' zone
     produce a request handled by ``parse_tv``, tiles of a '综艺' zone a
     request handled by ``parse_show``; any other tile is yielded directly
     as a ``YoukuItem``.

     Zones without a label, zones whose label is not in ``self.font_table``
     and tiles lacking a video id, thumbnail or title are skipped so that
     one malformed entry does not abort the whole callback.

     Args:
         response: The front-page response to scan.

     Yields:
         ``scrapy.Request`` objects for series/show pages and ``YoukuItem``
         objects for stand-alone videos.
     """
     for zone in response.xpath(
             "//div[contains(@name,'m_pos')]/div[contains(@class,'mod-new')]"
     ):
         raw_labels = zone.xpath("div/h2/img/@title").extract()
         if not raw_labels:
             continue
         label_ = raw_labels[0]
         # Titles in the '放剧场' zone are too irregular and would cause
         # conflicts, so that zone is ignored entirely.
         if label_ == '放剧场':
             continue
         try:
             label = self.font_table[label_]
         except KeyError:
             # Unknown zone: there is no category to file its videos under.
             continue

         # Tiles present in the markup, plus tiles whose HTML is stored as
         # text inside <textarea> elements and has to be parsed separately.
         zone_set = []
         zone_set += zone.xpath("div//div[@class='p-thumb']")
         for hide_eles in zone.xpath("div//textarea/text()").extract():
             zone_set += Selector(
                 text=hide_eles).xpath("//div[@class='p-thumb']")

         for ele in zone_set:
             hrefs = ele.xpath("./a/@href").extract()
             if not hrefs:
                 continue
             ids = re.findall(self.id_pattern, hrefs[0])
             if not ids:
                 continue
             vid = ids[0]

             # The thumbnail address is read from @src first and from @alt
             # second; both must contain 'ykimg' to be accepted.
             img_c = ele.xpath(
                 "./img[contains(@src,'ykimg')]/@src").extract()
             img_c += ele.xpath(
                 "./img[contains(@alt,'ykimg')]/@alt").extract()
             titles = ele.xpath('./a/@title').extract()
             if not img_c or not titles:
                 continue
             img = img_c[0]
             title = titles[0]

             if label == '剧集':
                 yield scrapy.Request(url=self.page.format(vid=vid),
                                      meta={
                                          'img': img,
                                          'series': title
                                      },
                                      callback=self.parse_tv)
             elif label == '综艺':
                 # Where the show name lives depends on the raw zone label.
                 if label_ == '综艺':
                     series_c = ele.xpath(
                         "./following-sibling::ul[@class='info-list']/li/span/text()"
                     ).extract()
                 elif label_ == '自频道精选':
                     series_c = ele.xpath(
                         "./preceding-sibling::div[@class='p-user']/@title"
                     ).extract()
                 else:
                     series_c = []
                 # Use an empty series name when none is found, instead of
                 # failing or silently reusing the previous tile's value.
                 series = series_c[0] if series_c else ''
                 yield scrapy.Request(url=self.page.format(vid=vid),
                                      meta={'series': series},
                                      callback=self.parse_show)
             else:
                 video = YoukuItem()
                 video['vid'] = vid
                 video['img'] = img
                 video['title'] = title
                 video['update_time'] = datetime.datetime.now().strftime(
                     "%Y-%m-%d %H:%M:%S")
                 video['category'] = label
                 video['series'] = ''
                 yield video
Example #2
0
 def get_tab1(self, response):
     """Collect one ``YoukuItem`` per row of the ranking table.

     Every item gets ``rank_category`` set to ``'dianying'`` and four list
     fields (``rank_name``, ``rank_actor``, ``rank_index``, ``rank_trend``)
     holding whatever the corresponding cell XPath extracts from the row.

     Args:
         response: Selector-like object queried with a relative XPath for
             the ranking rows.

     Returns:
         A list of ``YoukuItem`` objects, one per table row.
     """
     # Item field -> XPath evaluated relative to a table row.
     cell_paths = (
         ('rank_name', './td[@class="key"]/a/@title'),
         ('rank_actor', './td[@class="intro"]/a/text()'),
         ('rank_index', './td[@class="status"]/span/a/text()'),
         ('rank_trend', './td[@class="trend"]/span/@class'),
     )
     collected = []
     rows = response.xpath('./div[@class="rank"]/table/tbody/tr')
     for row in rows:
         entry = YoukuItem()
         entry['rank_category'] = 'dianying'
         for field, path in cell_paths:
             entry[field] = row.xpath(path).extract()
         collected.append(entry)
     return collected
Example #3
0
    def parse(self, response):
        """Yield one item per movie tile, then follow the "next" page link.

        Each tile under ``ul.panel`` becomes a ``YoukuItem`` whose ``name``,
        ``actor`` and ``playcounts`` fields hold the extracted text lists.
        When the pager contains a "next" link, a request for that page is
        yielded with this method as its callback; when it does not (for
        example on the last page) the generator simply ends.

        Args:
            response: The listing-page response.

        Yields:
            ``YoukuItem`` objects followed by at most one ``scrapy.Request``.
        """
        # One selector serves both the tiles and the pager.
        hxs = HtmlXPathSelector(response)
        for site in hxs.xpath(
                '//*/ul[@class="panel"]/li[@class="yk-col4 mr1"]'):
            movie = YoukuItem()
            movie['name'] = site.xpath(
                './/li[@class="title"]/a/text()').extract()
            movie['actor'] = site.xpath(
                './/li[@class="actor"]/a/text()').extract()
            movie['playcounts'] = site.xpath('.//li[3]/text()').extract()
            yield movie

        next_links = hxs.xpath(
            '//ul[@class="yk-pages"]/li[@class="next"]/a/@href').extract()
        if next_links:
            # The href is prefixed with "http:" to form the request URL.
            next_url = "http:" + next_links[0]
            yield scrapy.Request(next_url, callback=self.parse)
Example #4
0
 def parse_tv(self, response):
     """Yield one ``YoukuItem`` per episode listed on a TV-series page.

     The video id is taken from the episode node's ``id`` attribute via
     ``self.id_pattern``.  The item title is the series name from
     ``response.meta['series']`` followed by the node's ``title``
     attribute; the thumbnail comes from ``response.meta['img']``.
     Episode nodes without an id, without a matching id or without a
     title are skipped.

     Args:
         response: Series-page response carrying ``series`` and ``img``
             in its ``meta``.

     Yields:
         ``YoukuItem`` objects with category '剧集'.
     """
     for item in response.xpath(
             "//div[@class='tvlists']//div[contains(@class,'items')]/div[contains(@name,'tvlist')]"
     ):
         raw_ids = item.xpath('./@id').extract()
         episode_titles = item.xpath('./@title').extract()
         if not raw_ids or not episode_titles:
             continue
         vids = re.findall(self.id_pattern, raw_ids[0])
         if not vids:
             continue

         series = response.meta['series']
         video = YoukuItem()
         video['vid'] = vids[0]
         video['title'] = series + episode_titles[0]
         video['update_time'] = datetime.datetime.now().strftime(
             "%Y-%m-%d %H:%M:%S")
         video['category'] = '剧集'
         video['img'] = response.meta['img']
         video['series'] = series
         yield video
Example #5
0
 def parse_movie(self, response):
     """Yield one ``YoukuItem`` per movie thumbnail on a listing page.

     Thumbnails lacking an image, a link title, a link or a video id that
     matches ``self.id_pattern`` are skipped.  The ``rank`` of an emitted
     item is ``response.meta['rank'] * 30`` plus its position among the
     items emitted from this page, so skipped thumbnails do not consume a
     rank (presumably 30 is the page size -- confirm against the caller).

     Args:
         response: Listing-page response carrying ``rank`` in its ``meta``.

     Yields:
         ``YoukuItem`` objects with category '电影'.
     """
     position = 0
     for item in response.xpath("//div[contains(@class,'p-thumb')]"):
         img = item.xpath(".//img/@src").extract()
         if not img:
             continue
         titles = item.xpath(".//a/@title").extract()
         hrefs = item.xpath(".//a/@href").extract()
         if not titles or not hrefs:
             continue
         vids = re.findall(self.id_pattern, hrefs[0])
         if not vids:
             continue

         video = YoukuItem()
         video['img'] = img[0]
         video['category'] = '电影'
         video['title'] = titles[0]
         video['vid'] = vids[0]
         video['update_time'] = datetime.datetime.now().strftime("%Y-%m-%d")
         video['rank'] = response.meta['rank'] * 30 + position
         position += 1
         yield video
Example #6
0
 def parse_rank(self, response):
     """Yield one ``YoukuItem`` per entry of a JSON ranking response.

     The body is decoded and parsed as JSON; entries are read from
     ``data['result']['data']``.  An entry whose ``homepageurl`` does not
     match ``self.id_pattern`` is skipped.  The ``label`` field is set
     only when ``kind`` is a non-empty string (used as is) or a non-empty
     list (joined with commas).

     Args:
         response: JSON response carrying ``category`` in its ``meta``.

     Yields:
         ``YoukuItem`` objects for the ranked videos.
     """
     data = json.loads(response.body.decode())
     for video_info in data['result']['data']:
         # Check the id first so no item is built for entries that are
         # going to be discarded anyway.
         vid = re.findall(self.id_pattern, video_info['homepageurl'])
         if not vid:
             continue

         video = YoukuItem()
         video['title'] = video_info['title']
         video['update_time'] = datetime.datetime.now().strftime(
             "%Y-%m-%d %H:%M:%S")
         video['vid'] = vid[0]
         video['img'] = video_info['avatar']
         video['rank'] = video_info['order']
         kind = video_info['kind']
         if kind:
             if isinstance(kind, str):
                 video['label'] = kind
             elif isinstance(kind, list):
                 video['label'] = ','.join(kind)
         video['category'] = response.meta['category']
         yield video
Example #7
0
 def parse_show(self, response):
     """Yield one ``YoukuItem`` per episode listed on a variety-show page.

     For every episode container the id and title are read from the
     ``id`` / ``title`` attributes of the nested ``div`` whose id contains
     ``'item_'``, and the thumbnail from the ``img`` inside the "cover"
     ``div``.  Containers missing any of these, or whose id does not match
     ``self.id_pattern``, are skipped.

     Args:
         response: Show-page response carrying ``series`` in its ``meta``.

     Yields:
         ``YoukuItem`` objects with category '综艺'.
     """
     for item in response.xpath(
             "//div[@class='showlists']//div[contains(@class,'items')]/div[contains(@id,'child')]"
     ):
         raw_ids = item.xpath(
             ".//div[contains(@id,'item_')]/@id").extract()
         titles = item.xpath(
             ".//div[contains(@id,'item_')]/@title").extract()
         covers = item.xpath(
             ".//div[contains(@class,'cover')]/img/@src").extract()
         if not raw_ids or not titles or not covers:
             continue
         vids = re.findall(self.id_pattern, raw_ids[0])
         if not vids:
             continue

         video = YoukuItem()
         video['vid'] = vids[0]
         video['title'] = titles[0]
         video['update_time'] = datetime.datetime.now().strftime(
             "%Y-%m-%d %H:%M:%S")
         video['category'] = '综艺'
         video['img'] = covers[0]
         video['series'] = response.meta['series']
         yield video
Example #8
0
    def parse(self, response):
        """Build one ``YoukuItem`` per thumbnail of the page's play list.

        The video id is the text between ``id_`` and the first following
        dot of the thumbnail link, e.g. ``XMjc2NjE5NjU0OA==`` for
        ``//v.youku.com/v_show/id_XMjc2NjE5NjU0OA==.html?f=49412420&o=1``.
        Thumbnails without a link, or whose link carries no such id, are
        skipped.  A missing title or image is stored as ``None``.

        Args:
            response: The play-list page response.

        Returns:
            A list of ``YoukuItem`` objects.
        """
        # Every item is filed under this fixed album id.
        album_id = "49412420"

        items = []
        for play in response.xpath(
                '//div[@id="playList"]//div[@class="p-thumb"]'):
            href = play.xpath('a/@href').extract_first()
            if not href:
                continue
            # Non-greedy, so dots later in the URL (e.g. in the query
            # string) cannot leak into the id.
            found = re.search(r'id_(.*?)\.', href)
            if found is None:
                continue
            raw_title = play.xpath('a/@title').extract_first()

            item = YoukuItem()
            item['id'] = found.group(1)
            item['album_id'] = album_id
            item['title'] = (raw_title.encode('utf-8')
                             if raw_title is not None else None)
            # The link is prefixed with "http:" to form the stored URL.
            item['url'] = "http:" + href
            item['img_url'] = play.xpath('img/@src').extract_first()

            items.append(item)
        return items