Esempio n. 1
0
    def parse(self, response):
        """Turn a JSON movie-details response into a ``MovieItem``.

        The populated item is yielded first.  Afterwards ``self.movie_id`` is
        advanced one step at a time and a follow-up request (handled by this
        same callback) is yielded for every id up to the hard-coded upper
        bound.
        """
        poster_prefix = "http://image.tmdb.org/t/p/w185"
        last_movie_id = 140215

        item = MovieItem()
        payload = json.loads(response.body)
        details = payload['data']['movieDetails']
        movie = details['movie']

        item['movieid'] = details['movieId']
        item['moviename'] = movie['title']
        # People lists are flattened into comma-separated strings.
        for field in ('directors', 'actors'):
            item[field] = ",".join(movie[field])
        item['posterPath'] = poster_prefix + movie['posterPath']
        # Remaining fields are copied verbatim: (item field, JSON key).
        for field, key in (('plotSummary', 'plotSummary'),
                           ('averageratings', 'avgRating'),
                           ('numRatings', 'numRatings')):
            item[field] = movie[key]
        yield item

        # The counter lives on the spider instance, so the loop resumes from
        # wherever a previous invocation of this callback left it.
        while self.movie_id < last_movie_id:
            self.movie_id += 1
            next_url = self.start_urls[0] + str(self.movie_id)
            yield scrapy.Request(next_url,
                                 dont_filter=True,
                                 callback=self.parse)
Esempio n. 2
0
    def parse(self, response):
        """Yield one ``MovieItem`` (title + absolute link) per ``//ul/li/h5``.

        A fresh item is built for every entry.  Yielded items are handed to
        the caller while this generator keeps running, so reusing a single
        instance would let later iterations overwrite the fields of objects
        that were already yielded.

        Entries that have no anchor text or no ``href`` are skipped instead of
        aborting the whole page with an ``IndexError``.
        """
        sel = scrapy.selector.Selector(response)
        for site in sel.xpath('//ul/li/h5'):
            titles = site.xpath('a/text()').extract()
            hrefs = site.xpath('a/@href').extract()
            if not titles or not hrefs:
                # Heading without a usable link: nothing to record.
                continue

            item = MovieItem()
            item['title'] = titles[0]
            item['link'] = 'http://www.meijutt.com/' + hrefs[0]
            yield item
Esempio n. 3
0
 def parse(self, response):
     """Yield a ``MovieItem`` with name, link and score for each movie table.

     Tables that contain no ``<a>`` element are skipped.  The three fields of
     every yielded item are also printed on one line.
     """
     score_xpath = ".//span[re:test(@class,'rating_nums')]/text()"
     tables = Selector(response).xpath(
         '//*[@id="content"]/div/div[1]/div/div/table')
     for table in tables:
         item = MovieItem()
         if not table.xpath(".//a"):
             continue

         titles = table.xpath(".//a/@title").extract()
         item['name'] = titles[0].strip()
         hrefs = table.xpath(".//a/@href").extract()
         item['link'] = hrefs[0].strip()
         scores = table.xpath(score_xpath).extract()
         item['score'] = scores[0].strip()

         print("%s %s %s" % (item['name'], item['link'], item['score']))
         yield item
Esempio n. 4
0
    def parse_indetail(self, response):
        """Extract title, crew and popularity from a detail page.

        Returns a ``MovieItem`` whose ``title`` and ``crew`` are lists of text
        nodes with the last node dropped, and whose ``popularity`` is a
        substring of the third review-bar text node.  Writers and stars are
        not extracted here.
        """
        item = MovieItem()

        title_nodes = response.xpath(
            '//div[@class="title_wrapper"]/h1/text()').extract()
        item['title'] = title_nodes[:-1]

        crew_nodes = response.xpath(
            '//div[@class ="credit_summary_item"]/a/text()').extract()
        item['crew'] = crew_nodes[:-1]

        review_nodes = response.xpath(
            '//div[@class="titleReviewBarSubItem"]/div/span/text()').extract()
        # Keeps characters 21 .. -8 of the third node; presumably this trims
        # fixed surrounding text -- TODO confirm against the page markup.
        item['popularity'] = review_nodes[2][21:-8]

        return item
Esempio n. 5
0
 def parse(self, response):
     """Return a list holding one ``MovieItem`` per ``ul[@class="navUl"]/li``.

     Each item's ``title`` is the list of text nodes of the entry's direct
     ``<a>`` children.
     """

     def build_item(entry):
         # One item per navigation entry; only the title is filled in.
         item = MovieItem()
         item['title'] = entry.xpath('a/text()').extract()
         return item

     nav_entries = scrapy.selector.Selector(response).xpath(
         '//ul[@class="navUl"]/li')
     return [build_item(entry) for entry in nav_entries]
 def parse_indetail(self, response):
     """Fill a ``MovieItem`` from the ``title-overview-widget`` of a page.

     Every field is set to the list of text nodes matched by its XPath;
     fields are populated in the order listed below.
     """
     field_xpaths = (
         ('title',
          '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[2]/div[2]/h1/text()'),
         ('directors',
          '//*[@id="title-overview-widget"]/div[2]/div[1]/div[2]/a/text()'),
         ('writers',
          '//*[@id="title-overview-widget"]/div[2]/div[1]/div[3]/a[1]/text()'),
         ('stars',
          '//*[@id="title-overview-widget"]/div[2]/div[1]/div[4]/a[1]/text()'),
         ('popularity',
          '//*[@id="title-overview-widget"]/div[1]/div[2]/div/div[1]/div[1]/a/span/text()'),
     )

     item = MovieItem()
     for field, path in field_xpaths:
         item[field] = response.xpath(path).extract()
     return item
Esempio n. 7
0
 def parse(self, response):
     """Collect the cast listed on a page into a single ``MovieItem``.

     The returned item carries:

     * ``movie`` -- a dict with the response URL and the page ``<title>``
       text nodes,
     * ``cast``  -- a list of ``TutorialItem`` objects (name and link text
       nodes) built from every ``td[@class="nm"]`` cell,
     * ``rank``  -- the current value of ``self.rank``, which is then
       incremented for the next call.

     Each cast member's name and link are also printed.
     """
     hxs = HtmlXPathSelector(response)
     actors = hxs.select('//td[@class="nm"]')
     items = MovieItem()
     items["movie"] = {
         "url": response.url,
         "name": hxs.select("/html/head/title/text()").extract()
     }
     items["cast"] = []
     for actor in actors:
         item = TutorialItem()
         item["name"] = actor.select('a/text()').extract()
         item["link"] = actor.select('a/@href').extract()
         # A single pre-formatted argument in parentheses prints the same
         # text as the old ``print a, b`` statement and, unlike it, is also
         # valid syntax on Python 3.
         print("%s %s" % (item["name"], item["link"]))
         items["cast"].append(item)
     items["rank"] = self.rank
     self.rank += 1
     return items
Esempio n. 8
0
    def parse(self, response):
        """Turn a JSON movie-details response into a ``MovieItem``.

        Yields the populated item, then keeps incrementing ``self.movie_id``
        and yields one request per id (sent with the spider's headers and
        cookies, handled by this same callback) until the hard-coded upper
        bound is reached.
        """
        last_movie_id = 140215
        separator = ","

        item = MovieItem()
        payload = json.loads(response.body)
        details = payload['data']['movieDetails']
        movie = details['movie']

        item['movieid'] = details['movieId']
        item['moviename'] = movie['title']
        item['directors'] = separator.join(movie['directors'])
        item['actors'] = separator.join(movie['actors'])
        # Remaining fields are copied verbatim: (item field, JSON key).
        for field, key in (('posterPath', 'posterPath'),
                           ('plotSummary', 'plotSummary'),
                           ('averageratings', 'avgRating'),
                           ('numRatings', 'numRatings')):
            item[field] = movie[key]
        yield item

        # The counter is stored on the spider, so this loop continues from
        # wherever an earlier invocation of the callback stopped.
        while self.movie_id < last_movie_id:
            self.movie_id += 1
            next_url = self.start_urls[0] + str(self.movie_id)
            yield scrapy.Request(
                next_url,
                dont_filter=True,
                callback=self.parse,
                headers=self.headers,
                cookies=self.cookie,
            )
Esempio n. 9
0
    def parse(self, response):
        """Yield a detail-page request for each chart row up to a rank limit.

        For every row of the chart table a ``MovieItem`` is filled with title,
        rating, ranking, release date and main-page URL, and attached to a
        request for that URL (callback ``self.parseMovieDetails``) under
        ``meta['item']``.  Iteration stops at the first row whose ranking
        exceeds ``self.wanted_num``.
        """
        # Small limit for quick test runs; fetching everything can take a
        # very long time.
        self.wanted_num = 10

        # TODO(king): IMDB seems to have changed the HTML structure for
        # this information.
        rows = response.xpath(
            "//*[contains(@class,'chart full-width')]/tbody/tr")
        for row in rows:
            item = MovieItem()
            item['Title'] = row.xpath('td[2]/a/text()').extract()[0]
            item['Rating'] = row.xpath('td[3]/strong/text()').extract()[0]

            # The rank is the leading run of digits in the cell's own text;
            # XPath alone cannot isolate it, hence Python's ``re`` module.
            rank_text = str(row.xpath('td[2]/text()').extract()[0]).strip()
            item['Ranking'] = re.match(r'(^[0-9]+)', rank_text).group(1)

            item['ReleaseDate'] = row.xpath('td[2]/span/text()').extract()[0]
            href = row.xpath('td[2]/a/@href').extract()[0]
            item['MainPageUrl'] = "http://imdb.com" + href

            request = scrapy.Request(item['MainPageUrl'],
                                     callback=self.parseMovieDetails,
                                     meta={'item': item})
            if int(item['Ranking']) > self.wanted_num:
                return
            yield request