def parse(self, response):
        """Queue one detail-page request per film found in the listing page.

        Each ``<li>`` matched by ``/html/body/div[2]/div/div[1]/div/ul/li``
        supplies a film title (``a/@title``) and a link (``a/@href``).  A
        ``RrysItem`` carrying the title, with its other fields set to empty
        strings, travels to ``self.parse_film`` through ``meta['item']``.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per entry that has both a title and
            a link.
        """
        print(f'in parse of hotdownload_spider, url:{response.url}')
        films = Selector(
            response=response).xpath('/html/body/div[2]/div/div[1]/div/ul/li')

        for li in films:
            film_name = li.xpath('./a/@title').extract_first()
            href = li.xpath('./a/@href').extract_first()
            if film_name is None or href is None:
                # An entry without an anchor is skipped so that it cannot
                # abort the whole listing with an IndexError.
                continue

            # Resolve the href against the page URL; gluing the two strings
            # together breaks as soon as the page URL carries a path or a
            # trailing slash.
            film_url = response.urljoin(href)
            print(f'{film_name} \t {film_url}')

            item = RrysItem()
            item['film_name'] = film_name
            item['film_rank'] = ''
            item['film_class'] = ''
            item['film_viewcount'] = ''
            item['film_cover'] = ''

            print(item)

            yield scrapy.Request(url=film_url,
                                 meta={'item': item},
                                 callback=self.parse_film)
Beispiel #2
0
    def parse(self, response):
        """Emit a follow-up request for every list entry on the page.

        Looks at each ``<li>`` nested under a ``<div>`` whose ``class``
        attribute is exactly ``"box clearfix"``.  The stripped anchor text
        becomes ``item['titles']``; the stripped ``href`` appended to
        ``self.start_urls[0]`` becomes ``item['links']`` and is also the URL
        that gets requested.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: A request handled by ``self.parse2`` with the
            partially filled ``RrysItem`` under ``meta['item']``.
        """
        entries = Selector(response=response).xpath(
            '//div[@class="box clearfix"]//li')

        for entry in entries:
            title_nodes = entry.xpath('./a/text()')
            href_nodes = entry.xpath('./a/@href')

            item = RrysItem()
            clean_title = title_nodes.extract_first().strip()
            site_root = self.start_urls[0]
            full_link = site_root + href_nodes.extract_first().strip()
            item['titles'] = clean_title
            item['links'] = full_link

            yield scrapy.Request(
                url=full_link,
                meta={'item': item},
                callback=self.parse2,
            )
Beispiel #3
0
 def parse(self, response):
     """Request the detail page of up to the first twelve list entries.

     The page is parsed with lxml and the ``href`` of the anchor inside the
     1st through 12th ``<li>`` of the list is appended to
     ``http://www.rrys2019.com``.  An empty ``RrysItem`` accompanies each
     request under ``meta['item']``.

     Args:
         response: The downloaded listing page.

     Yields:
         scrapy.Request: One request per position that has a link,
         handled by ``self.parse2``.
     """
     selector = lxml.etree.HTML(response.text)
     for i in range(1, 13):
         url = selector.xpath(
             f"/html/body/div[2]/div/div[1]/div/ul/li[{i}]/a/@href")
         if not url:
             # The list is shorter than twelve entries, or this entry has
             # no anchor: skip it instead of failing on url[0].
             continue

         item = RrysItem()
         link = f'http://www.rrys2019.com{url[0]}'
         print(f"link={link}")
         print(link.split('/')[-1])

         yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
Beispiel #4
0
 def parse(self, response):
     """Request the JSON resource endpoint for every entry tagged "电影".

     Anchors are selected as siblings of an ``<em>`` whose text contains
     "电影" inside the "top24" box.  For each anchor the item receives the
     anchor text as ``name`` and, as ``link``, ``self.start_urls[0]`` plus
     the ``href`` with its first character removed.  The last path segment
     of that link is used as the ``rid`` in the requested URL.

     Args:
         response: The downloaded listing page.

     Yields:
         scrapy.Request: A request handled by ``self.getViews`` with the
         item under ``meta['item']``.
     """
     anchor_query = '//div[@class="middle-box"]/div/div[@class="fl box top24"]/div[@class="box clearfix"]/ul/li/em[contains(text(),"电影")]/../a'
     page = Selector(text=response.text)
     for anchor in page.xpath(anchor_query):
         item = RrysItem()
         site_root = self.start_urls[0]
         href = anchor.xpath('@href').extract()[0]
         # Drop the first character of the href before appending it
         # (presumably its leading slash — confirm against start_urls).
         link = site_root + href[1:]
         name = anchor.xpath('text()').extract()[0]
         item['name'] = name
         item['link'] = link
         rid = link.split('/')[-1]
         url = f'http://www.rrys2019.com/resource/index_json/rid/{rid}/channel/movie'
         yield scrapy.Request(
             url=url,
             callback=self.getViews,
             meta={'item': item},
         )
Beispiel #5
0
    def parse(self, response):
        """Request the detail page of every entry in the "box clearfix" list.

        For each ``<li>`` the anchor's ``title`` attribute is stored as
        ``item['title']`` and the last path segment of its ``href`` as
        ``item['id']``.  The detail URL is the ``href`` placed under
        ``http://rrys2019.com/``.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per entry that has a link, handled
            by ``self.parse_detail`` with the item under ``meta['item']``.
        """
        movies = Selector(response=response).xpath(
            query='//div[@class="box clearfix"]/ul/li')
        for movie in movies:
            title = movie.xpath('./a/@title').get()
            link = movie.xpath('./a/@href').get()
            if not link:
                # No anchor in this entry; there is nothing to follow and
                # link.split() would fail on None.
                continue

            movie_item = RrysItem()
            movie_item['title'] = title
            movie_item['id'] = link.split("/")[-1]

            # Strip leading slashes so a root-relative href does not end up
            # as "http://rrys2019.com//...".
            path = link.lstrip('/')
            yield scrapy.Request(url=f'http://rrys2019.com/{path}',
                                 meta={'item': movie_item},
                                 callback=self.parse_detail)
    def parse(self, response):
        """Request the detail page of up to the first thirteen list entries.

        For positions 1 through 13 of the list, the item receives the lists
        of text nodes found in ``em`` (``category``), ``a`` (``title``) and
        ``span`` (``rank``), plus the absolute ``url`` built from the
        anchor's ``href`` and ``http://www.rrys2019.com``.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per position that has a link,
            handled by ``self.parse2`` with the item under ``meta['item']``.
        """
        # Build the selector once and reuse it for every lookup.
        page = Selector(response=response)
        list_entry = '/html/body/div[2]/div/div[1]/div/ul/li'

        for i in range(1, 14):
            entry = f'{list_entry}[{i}]'
            url = page.xpath(f'{entry}/a/@href').extract_first()
            if url is None:
                # The list is shorter than thirteen entries, or this entry
                # has no anchor: skip it instead of raising IndexError.
                continue

            item = RrysItem()
            item['title'] = page.xpath(f'{entry}/a/text()').extract()
            item['rank'] = page.xpath(f'{entry}/span/text()').extract()
            item['category'] = page.xpath(f'{entry}/em/text()').extract()
            item['url'] = f'http://www.rrys2019.com{url}'

            yield scrapy.Request(url=item['url'], meta={'item': item}, callback=self.parse2)
Beispiel #7
0
 def parse(self, response):
     """Emit a detail request for each entry of the "box clearfix" list.

     Every ``<li>`` yields an item whose ``movie`` field is the anchor's
     ``title`` attribute and whose ``movie_type`` field is the stripped
     text of the ``<em>`` element.  The request URL is
     ``self.start_urls[0]`` followed by the anchor's ``href``.

     Args:
         response: The downloaded listing page.

     Yields:
         scrapy.Request: A request handled by ``self.parse_moive_detail``
         with the item under ``meta['rrys_item']``.
     """
     rows = Selector(response=response).xpath(
         '//div[@class="box clearfix"]/ul/li')
     for row in rows:
         rrys_item = RrysItem()
         name = row.xpath('./a/@title').extract_first()
         href = row.xpath('./a/@href').extract_first()
         raw_type = row.xpath('./em/text()').extract_first()
         kind = raw_type.strip()
         rrys_item['movie'] = name
         rrys_item['movie_type'] = kind
         detail_url = self.start_urls[0] + href
         yield scrapy.Request(
             url=detail_url,
             meta={'rrys_item': rrys_item},
             callback=self.parse_moive_detail,
         )
 def parse(self, response):
     """Emit a detail request for each "top24" entry whose type is "电影".

     Entries whose ``<em>`` text is anything other than "电影" are ignored.
     Matching entries are numbered from 1 in page order (``m_num``) and
     also carry the ``<span>`` text (``m_index``), the third "/"-separated
     piece of the ``href`` (``m_resid``), the ``href`` appended to
     ``self.start_urls[0]`` (``m_url``) and the anchor's ``title``
     attribute (``m_name``).

     Args:
         response: The downloaded listing page.

     Yields:
         scrapy.Request: A request handled by ``self.get_movie_info`` with
         the item under ``meta['item']``.
     """
     entries = Selector(response=response).xpath('//div[@class="fl box top24"]//li')
     movie_count = 0
     for entry in entries:
         if entry.xpath('./em/text()').extract_first() != '电影':
             continue

         item = RrysItem()
         movie_count += 1
         item['m_num'] = movie_count
         item['m_index'] = entry.xpath('./span/text()').extract_first()
         href = entry.xpath('./a/@href').extract_first()
         item['m_resid'] = href.split("/")[2]
         item['m_url'] = self.start_urls[0] + href
         item['m_name'] = entry.xpath('./a/@title').extract_first()
         yield scrapy.Request(
             url=item['m_url'],
             callback=self.get_movie_info,
             meta={'item': item},
         )
Beispiel #9
0
    def parse(self, response):
        """Emit a detail request for each list entry whose type is "电影".

        The page is parsed with lxml.  For every ``<li>`` whose first
        ``<em>`` text equals "电影", the item receives the first ``<span>``
        text (``rank``), the first anchor text (``title``) and the last
        path segment of the anchor's ``href`` (``rid``).  The detail URL is
        the ``href`` appended to ``http://www.rrys2019.com``.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: A request handled by ``self.parse2`` with the
            item under ``meta['item']``.
        """
        tree = lxml.etree.HTML(response.text)
        for entry in tree.xpath('/html/body/div[2]/div/div[1]/div/ul/li'):
            category = entry.xpath('./em/text()')[0]
            if category != '电影':
                continue

            item = RrysItem()
            item['rank'] = entry.xpath('./span/text()')[0]
            item['title'] = entry.xpath('./a/text()')[0]
            href = entry.xpath('./a/@href')[0]
            item['rid'] = href.split('/')[-1]
            print(item)
            detail_url = "http://www.rrys2019.com" + href
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse2,
                meta={'item': item},
            )
    def parse(self, response):
        """Emit a detail request for every list entry in the top container.

        For each ``<li>`` below ``/html/body/div[2]/div/div[1]`` the item
        receives the anchor text (``title``), the ``<em>`` text (``rank``)
        and the anchor's ``href`` resolved against the page URL (``link``).

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per entry that has a link, handled
            by ``self.parse2`` with the item under ``meta['item']``.
        """
        top_24 = Selector(
            response=response).xpath('/html/body/div[2]/div/div[1]')
        # NOTE(review): descendant search, because the <li> elements are
        # presumably nested deeper than direct children of this container
        # — confirm against the page markup.
        movies = top_24.xpath('.//li')
        for movie in movies:
            # Extract plain strings: selector lists can be neither stored
            # meaningfully in the item nor passed as a request URL.
            title = movie.xpath('./a/text()').extract_first()
            # NOTE(review): rank is read from <em>; verify this is the
            # element that actually holds the rank.
            rank = movie.xpath('./em/text()').extract_first()
            link = movie.xpath('./a/@href').extract_first()
            if link is None:
                # Nothing to follow for an entry without an anchor.
                continue

            item = RrysItem()
            item['title'] = title
            item['rank'] = rank
            item['link'] = response.urljoin(link)

            yield scrapy.Request(url=item['link'],
                                 meta={'item': item},
                                 callback=self.parse2)
    def parse_items(self, response):
        """Fill an item from a resource page and follow its views script.

        Fields written:
            name: Text between "《" and "》" in the first text node of the
                ``<h2>`` inside ``div.resource-tit``.
            ranking: First text of a ``<p>`` carrying a ``class`` attribute
                inside the score box, with ``\\xa0`` removed and stripped.
            level: The single letter preceding ``-big`` in the level image
                ``src``.
            cover: ``href`` of the anchor inside ``div.imglink``.

        Args:
            response: The downloaded resource page.

        Yields:
            scrapy.Request: A request for the ``<script>`` whose ``src``
            contains ``/resource/index_json/rid``, handled by
            ``self.parse_views`` with the item under ``meta['item']``.
        """
        item = RrysItem()

        name_matches = response.xpath(
            '//div[@class="resource-tit"]//h2/text()[1]').re(r'[《](.*)[》]')
        item['name'] = name_matches[0]

        score_text = response.xpath(
            '//div[@class="box score-box"]//p[@class]/text()').extract_first()
        item['ranking'] = score_text.replace(u'\xa0', u'').strip()

        level_matches = response.xpath(
            '//div[@class="level-item"]//img/@src').re(r'/([a-zA-Z])-big')
        item['level'] = level_matches[0]

        cover_href = response.xpath(
            '//div[@class="imglink"]/a/@href').extract_first()
        item['cover'] = cover_href

        views_link = response.xpath(
            '//script[contains(@src,"/resource/index_json/rid")]/@src').get()

        yield response.follow(
            url=views_link,
            callback=self.parse_views,
            meta={'item': item},
        )
Beispiel #12
0
    def get_view_counts(self, response):
        """Record the view count for a resource and emit its finished item.

        The resource id is the third-from-last "/"-separated piece of the
        response URL.  The body, minus its first 15 characters, is decoded
        as JSON and its ``views`` value is stored in
        ``self.data[resid]['m_views']``.  All ``m_*`` fields collected in
        ``self.data[resid]`` are then copied into a new ``RrysItem``.

        Args:
            response: The downloaded view-count response.

        Yields:
            RrysItem: The item populated from ``self.data[resid]``.
        """
        resid = str(response.url).split('/')[-3]
        item = RrysItem()
        # Skip the first 15 characters before decoding (presumably a
        # non-JSON prefix such as a JavaScript assignment — confirm).
        payload = json.loads(response.text[15:])
        print(response.url)
        print(response.text)
        self.data[resid]['m_views'] = payload['views']

        record = self.data[resid]
        for field in ('m_num', 'm_index', 'm_resid', 'm_url', 'm_rank',
                      'm_name', 'm_img', 'm_level', 'm_views'):
            item[field] = record[field]

        yield item
Beispiel #13
0
    def parse(self, response):
        """Request the detail page of every entry in the "box clearfix" list.

        For each ``<li>`` the anchor text is stored as ``item['title']`` and
        the anchor's ``href`` minus its first ten characters as
        ``item['mid']``.  The detail URL is the ``href`` appended to
        ``http://www.rrys2019.com``.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per entry that has a link, handled
            by ``self.parse2`` with the item under ``meta['item']``.
        """
        hotmovies = Selector(
            response=response).xpath('//div[@class="box clearfix"]')
        for movie in hotmovies.xpath('./ul/li'):
            title = movie.xpath('./a/text()').extract_first()
            link = movie.xpath('./a/@href').extract_first()
            if link is None:
                # No anchor in this entry; slicing None would raise
                # TypeError and stop the remaining entries.
                continue

            item = RrysItem()
            item['title'] = title
            # NOTE(review): the slice drops the first ten characters of the
            # href, presumably a "/resource/" prefix — confirm.
            item['mid'] = link[10:]

            print(title)

            yield scrapy.Request(url=f'http://www.rrys2019.com{link}',
                                 meta={'item': item},
                                 callback=self.parse2)