コード例 #1
0
ファイル: imdbspider.py プロジェクト: DoubleHok/Scrapy_Notes
 def parse_imdb(self, response):
     """Parse an IMDb detail page into an ImdbItem.

     Yields one item carrying the page URL and the title text joined
     from the ``fk-3``/``hdd`` header block.
     """
     item = ImdbItem()
     item['url'] = response.url
     item['title'] = "".join(
         response.xpath(
             '//*[@class="fk-3"]/div[@class="hdd"]/h3/text()').extract())
     # Bug fix: the original fell through with ``pass``, discarding the
     # freshly-built item; yield it so Scrapy's pipeline receives it.
     yield item
コード例 #2
0
    def parse(self, response):
        """Parse an IMDb chart listing page row by row.

        For each ``<tr>`` in the lister table, build an ``ImdbItem``
        with score, release year, numeric title id and title, then
        follow the title link to ``parse_2`` with the partially-filled
        item in ``meta['item']``.

        :param response: the listing-page response
        :return: generator of ``scrapy.Request`` for the detail pages
        """
        # Hoist the loop-invariant patterns out of the per-row loop and
        # use raw strings so ``\d`` is not an (invalid) string escape.
        year_pattern = re.compile(r'\d{4}')
        id_pattern = re.compile(r'(?<=tt)\d+(?=/?)')

        soup = BeautifulSoup(response.text)
        movies = soup.find("tbody", {"class": "lister-list"})
        for movie in movies.findAll("tr"):
            item = ImdbItem()
            poster = movie.find("td", {"class": "posterColumn"})
            item["score"] = poster.find("span", {"name": "ir"})["data-value"]
            movie_link = movie.find("td", {
                "class": "titleColumn"
            }).find('a')["href"]
            url = "http://www.imdb.com" + movie_link
            year_str = movie.find("td", {"class": "titleColumn"}).text
            item["year"] = int(year_pattern.search(year_str).group())
            item["movie_id"] = int(id_pattern.search(movie_link).group())
            item["movie_name"] = movie.select_one('.titleColumn').select_one(
                'a').string
            yield scrapy.Request(url,
                                 meta={'item': item},
                                 callback=self.parse_2)
コード例 #3
0
 def parse(self, response):
     """Parse one page of an IMDb list and follow pagination.

     Emits one loaded ``ImdbItem`` per ``.lister-item``; keeps
     requesting the next page until 500 items have been seen, as read
     from the ``.lister-current-last-item`` counter.
     """
     itens_number = response.css(
         '.lister-current-last-item::text').extract_first()
     next_page = response.css(
         '.lister-page-next::attr(href)').extract_first()
     for selector in response.css('.lister-item'):
         item = ItemLoader(item=ImdbItem(), selector=selector)
         item.add_value(
             'position',
             selector.css('.lister-item-index::text').extract_first(
                 default=''))
         item.add_value(
             'name',
             selector.css('.lister-item-header > a::text').extract_first(
                 default=''))
         item.add_value(
             'genre',
             selector.css('.genre::text').extract_first(default=''))
         item.add_value(
             'rating',
             selector.css('.ratings-imdb-rating > strong::text').
             extract_first(default=''))
         yield item.load_item()
     # Bug fix: ``extract_first()`` returns None when the counter or the
     # "next" link is missing (e.g. the last page); ``int(None)`` in the
     # original raised TypeError.  Paginate only when both are present.
     if itens_number is not None and next_page is not None \
             and int(itens_number) < 500:
         next_url = response.urljoin(next_page)
         request = Request(url=next_url, callback=self.parse)
         yield request
コード例 #4
0
ファイル: imdb_spider.py プロジェクト: gvivek2692/cinefy
 def parse(self, response):
     """Yield one ImdbItem per review paragraph on the page.

     The movie id is the second-to-last path segment of the page URL;
     each ``<p>`` under ``#tn15content`` becomes one review item.
     """
     movie_id = response.url.split("/")[-2]
     paragraphs = response.xpath('//div[@id="tn15content"]//p')
     for paragraph in paragraphs:
         review_item = ImdbItem()
         review_item['movie'] = movie_id
         review_item['review'] = paragraph.xpath('text()').extract()
         yield review_item
コード例 #5
0
    def parse_item(self, response):
        """Scrape core metadata from a single IMDb title page.

        Yields one ``ImdbItem`` holding the title, release, rating,
        genre, duration and the page URL.
        """
        # Bug fix: the original assignments each ended with a trailing
        # comma, wrapping every value in a 1-tuple (e.g. ``('Title',)``)
        # instead of storing the bare string returned by ``.get()``.
        title = response.xpath(
            "normalize-space(//div[@class='title_wrapper']/h1/text())").get()
        realease = response.xpath(
            "//div[@class='title_wrapper']/h1/span/a/text()").get()
        rating = response.xpath(
            "//span[@itemprop='ratingValue']/text()").get()
        genre = response.xpath("//div[@class='subtext']/a[1]/text()").get()
        duration = response.xpath(
            "normalize-space(//div[@class='subtext']/time/text())").get()
        url = response.url

        imdb_item = ImdbItem(title=title,
                             realease=realease,
                             rating=rating,
                             genre=genre,
                             duration=duration,
                             url=url)

        yield imdb_item
コード例 #6
0
    def parse_movie_page(self, response):
        """Extract movie facts via Parse_Imdb, then fetch company credits.

        Fills an ``ImdbItem`` from the helper's getters and forwards it
        in ``meta['imdb_item']`` to ``parse_companies``.
        """
        page = Parse_Imdb(response)

        item = ImdbItem()
        item['title'] = page.get_title()
        item['imdb_score'] = page.get_imdb_score()
        item['metascore'] = page.get_metascore()
        item['genres'] = page.get_genres()
        item['country'] = page.get_country()
        item['release_date'] = page.get_release_date()
        item['budget'] = page.get_budget()
        item['opening_usa'] = page.get_opening_usa()
        item['usa_gross'] = page.get_usa_gross()
        item['worldwide_gross'] = page.get_worldwide_gross()

        # Build the company-credits URL by replacing the last path
        # segment of the current page URL.
        path_parts = response.request.url.split('/')[:-1]
        path_parts.append('companycredits?ref_=tt_dt_co')
        url_companies = '/'.join(path_parts)

        yield Request(url_companies,
                      callback=self.parse_companies,
                      meta={'imdb_item': item})
コード例 #7
0
    def parse(self, response):
        """Parse an IMDb search-results table into ImdbItems.

        One item per ``td.title`` cell: title, link, year/type, vote
        count, rating value, outline, credits, genre and runtime.
        Missing values are normalised via ``self.filterNA``.
        """
        # (Removed the unused ``bad_chars`` local from the original.)
        for sel in response.xpath('//td[@class="title"]'):
            item = ImdbItem()
            title = ''.join(sel.xpath('a/text()').extract())
            title_href = ''.join(sel.xpath('a/@href').extract())
            year_type = ''.join(
                sel.xpath('span[@class="year_type"]/text()').extract())
            year_type = year_type.strip("()")
            # Votes: take the number between "(" and " votes" from the
            # rating tooltip text, with thousands separators removed.
            user_rating = ''.join(
                sel.xpath('div[@class="user_rating"]/div/@title').extract())
            user_rating = user_rating.replace(',', '')
            user_rating = user_rating[user_rating.find('(') +
                                      1:user_rating.find(' votes')]
            user_rating = self.filterNA(user_rating)
            rating_rating = sel.xpath(
                'div[@class="user_rating"]/div/span[@class="rating-rating"]/span/text()'
            ).extract()
            # The rating widget renders three spans (value, "/", max);
            # anything else means no rating is shown.
            if len(rating_rating) == 3:
                rating_rating = rating_rating[0]
            else:
                rating_rating = "NA"
            outline = self.filterNA(
                ''.join(sel.xpath('span[@class="outline"]/text()').extract()))

            # NOTE(review): director and cast use the identical XPath, so
            # both fields receive the same data — the second probably
            # should target the "With:" credit span; confirm against the
            # page markup before changing.
            credit_dir = sel.xpath('span[@class="credit"]/a/text()').extract()
            credit_dir = self.filterNA(credit_dir)

            credit_with = sel.xpath('span[@class="credit"]/a/text()').extract()
            credit_with = self.filterNA(credit_with)

            genre = sel.xpath('span[@class="genre"]/a/text()').extract()
            genre = self.filterNA(genre)

            # "123 mins." -> "123"; empty -> "NA".
            mins = ''.join(
                sel.xpath('span[@class="runtime"]/text()').extract())
            mins = mins.split(' ')[0] if mins else "NA"

            item['title'] = title
            item['title_href'] = title_href
            item['year_type'] = year_type
            item['user_rating'] = user_rating
            item['rating_rating'] = rating_rating
            item['outline'] = outline
            item['credit_dir'] = credit_dir
            item['credit_with'] = credit_with
            item['genre'] = genre
            item['mins'] = mins
            yield item
コード例 #8
0
ファイル: spider.py プロジェクト: August1996/Scrapy-Projects
	def parse_detail(self, response):
		"""Collect header and infobar fields from a title page into one item."""
		css_by_field = {
			'name': '#prometer_container+.header .itemprop::text',
			'publish_date': '#prometer_container+.header .nobr a::text',
			'last_time': '.infobar time::text',
			'classifications': '.infobar>a>span::text',
			'score': '.star-box .titlePageSprite::text',
		}
		items = ImdbItem()
		for field, css in css_by_field.items():
			items[field] = response.css(css).extract()
		items['link'] = response.url
		yield items
コード例 #9
0
 def parse(self, response):
     """Scrape title, cast, synopsis, release date and production company
     from a single IMDb title page and return one ImdbItem.
     """
     # (Removed the unused ``sites`` query and the unused ``items`` list
     # from the original.)
     sel = Selector(response)
     item = ImdbItem()
     item['title'] = sel.xpath('//h1[@class="header"]/span[@class="itemprop"]/text()').extract()
     item['cast_information'] = sel.xpath('//div[@itemprop="actors"]/a/span/text()').extract()
     item['sypnosis'] = sel.xpath('//p[@itemprop="description"]/text()').extract()
     item['broadcast_date'] = sel.xpath('//div[@id="titleDetails"]/div/h4[text()="Release Date:"]/../text()').extract()
     item['production_company'] = sel.xpath('//div[@id="titleDetails"]/div/h4[text()="Production Co:"]/../span/a/span/text()').extract()
     return item
コード例 #10
0
 def parse(self, response):
     """For each cast-table row, request the actor's own page.

     The partially-filled item (actor name, character) travels in
     ``meta['imdb_item']`` to ``parse_actor_page``.
     """
     rows = response.xpath("//tr[@class='odd'] | //tr[@class='even']")
     for row in rows:
         imdb_item = ImdbItem()
         imdb_item['actor'] = row.xpath(
             "./td[@itemprop='actor']//span/text()").extract()
         imdb_item['character'] = row.xpath(
             "./td[@class='character']/div/a/text()").extract()
         actor_url = row.xpath(
             "./td[@itemprop='actor']/a/@href").extract_first()
         yield scrapy.Request(response.urljoin(actor_url),
                              callback=self.parse_actor_page,
                              meta={'imdb_item': imdb_item})
コード例 #11
0
ファイル: imdbspider.py プロジェクト: zyzeng1412/cs839
    def parse_imdb(self, response):
        """Extract URL, title, year, level and genres from a title page.

        Extraction errors are caught and handed to ``log`` so a single
        bad page does not abort the crawl.
        """
        item = ImdbItem()
        try:
            item['video_url'] = response.url
            xpath_by_field = {
                'video_title': '//*[@class="title_wrapper"]/h1/text()',
                'video_year': '//*[@id="titleYear"]/a/text()',
                'video_level': '//*[@class="subtext"]/text()',
                'video_genres': '//*[@class="subtext"]/a/text()',
            }
            for field, xp in xpath_by_field.items():
                item[field] = "".join(response.xpath(xp).extract())
            yield item
        except Exception as error:
            log(error)
コード例 #12
0
ファイル: imdbspider.py プロジェクト: jtlqh/imdb
    def parse_result_page(self, response):
        """Collect title ids from one search-results page, then queue the
        next page (50 results per page) until pagination runs out.
        """
        # Advance the running offset first; ``self.start`` is the
        # ``start=`` query parameter of the NEXT page to request.
        self.start += 50
        titles = response.xpath(
            '//div[@class="lister-list"]/div/div[2]/a/@href').extract()
        # Reduce each href to its bare "tt1234567" title id.
        titles = [re.findall(r'(tt\d+)', x)[0] for x in titles]
        item = ImdbItem()
        item['title'] = titles
        yield item

        # check_current_page presumably parses the "X-Y of Z" counter on
        # the page — confirm its exact return semantics at its definition.
        start, end, total = self.check_current_page(response)
        print(self.start, start)
        if self.start <= start + 50:

            url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie,tv_series,tv_episode,tv_special,tv_miniseries,documentary,video_game,short,video,tv_short&release_date=1900-01-01,2019-12-31&start={}&ref_=adv_nxt'.format(
                self.start)

            yield Request(url=url, callback=self.parse_result_page)
        else:
            # NOTE(review): exit(1) kills the whole process from inside a
            # spider callback; raising scrapy's CloseSpider would stop the
            # crawl cleanly instead — confirm intent before changing.
            exit(1)
コード例 #13
0
 def parse(self, response):
     """Parse the IMDb Top chart table.

     For each row, build an item (title, IMDb score, ranking, release
     year, absolute link) and follow the link to
     ``parsearInfoPelicula`` with the item attached in ``meta``.
     """
     for sel in response.xpath(
             "//*[contains(@class,'chart full-width')]/tbody/tr"):
         item = ImdbItem()
         item['titulo'] = sel.xpath('td[2]/a/text()').extract()[0].strip()
         item['puntuacionIMDB'] = sel.xpath(
             'td[3]/strong/text()').extract()[0].strip()
         # The cell text is like "1.\n  " — keep the leading digits.
         # (Dropped the redundant ``.__str__()`` call on a value that is
         # already a str.)
         item['ranking'] = re.match(
             r'(^[0-9]+)',
             sel.xpath('td[2]/text()').extract()[0].strip()).group(1)
         item['anyoEstreno'] = sel.xpath(
             'normalize-space(td[2]/span/text())').extract()[0].strip()
         item['enlace'] = "http://imdb.com" + sel.xpath(
             'td[2]/a/@href').extract()[0]
         request = scrapy.Request(item['enlace'],
                                  callback=self.parsearInfoPelicula)
         request.meta['item'] = item
         yield request
コード例 #14
0
    def parse_movie_page(self, response):
        """Parse a movie page into an ImdbItem.

        Extracts the numeric id, URL, title, year, description, rating,
        vote count and the cover image URL (for the ImagePipeline).
        """
        hxs = HtmlXPathSelector(response)

        title_h1 = hxs.select('//h1[@class="header"]')
        year = title_h1.select('span/a/text()').extract().pop(0)

        ratings = hxs.select('//div[@class="star-box-details"]')

        item = ImdbItem()

        url = response.url
        if url[0:4] != 'http':
            url = 'http://www.imdb.com' + url

        # Raw string avoids the invalid "\d" escape; ``movie_id`` avoids
        # shadowing the ``id`` builtin.
        movie_id = re.search(r'(\d+)', url).group()
        item['id'] = int(movie_id) if movie_id else 0

        item['url'] = url
        item['title'] = title_h1.select('text()').re('.*[^<]').pop(1)

        item['year'] = int(year)

        description = hxs.select(
            '//p[@itemprop="description"]/text()').extract()
        item['description'] = description.pop(0).strip() if description else ''

        cover = hxs.select('//td[@id="img_primary"]/a/img/@src').extract()

        rating = ratings.select('.//span[@itemprop="ratingValue"]/text()'
                                ).extract().pop(0).strip()
        # Bug fix: the fallback tested the ``ratings`` selector list
        # rather than the extracted ``rating`` string.
        item['rating'] = float(rating) if rating else 0.00

        votes = ratings.select('.//span[@itemprop="ratingCount"]/text()'
                               ).extract().pop(0).replace(',', '').strip()
        item['votes'] = int(votes) if votes else 0

        # for ImagePipeline
        item['image_urls'] = cover if cover else []

        return item
コード例 #15
0
 def parse_movies(self, response):
     """Return a list of ImdbItems, one per "detailed" table row."""
     xpath_by_field = {
         'url': "td[@class='title']/a/@href",
         'title': "td[@class='title']/a/text()",
         'year': "td[@class='title']/span[@class='year_type']/text()",
         'rating':
         "td[@class='title']/div[@class='user_rating']/div/span[@class='rating-rating']/span[@class='value']/text()",
         'votes': "td[@class='title']/div[@class='user_rating']/div/@title",
         'genre': "td[@class='title']/span[@class='genre']/a/text()",
         'length': "td[@class='title']/span[@class='runtime']/text()",
     }
     movies_list = []
     rows = response.xpath(
         "//tr[@class='odd detailed' or @class='even detailed']")
     for row in rows:
         movie = ImdbItem()
         for field, xp in xpath_by_field.items():
             movie[field] = row.xpath(xp).extract()
         movies_list.append(movie)
     return movies_list
コード例 #16
0
ファイル: imdb_spider.py プロジェクト: fx86/Scrapers
    def parse(self, response):
        """Emit one item per detailed row, then follow the "Next" link.

        ``dont_filter=True`` on the pagination request so the dupe
        filter does not drop it.
        """
        for row in response.css('tr.detailed'):
            item = ImdbItem()
            image_link = row.xpath('td[@class="image"]').xpath('a')
            item['uri'] = image_link.xpath('@href').extract_first()
            item['name'] = image_link.xpath('@title').extract_first()
            item['gross'] = row.xpath(
                'td[@class="sort_col"]/text()').extract_first()
            title_cell = row.xpath('td[@class="title"]')
            item['rating'] = title_cell.xpath(
                'div/div/@title').extract_first()
            item['desc'] = title_cell.xpath('span[3]/text()').extract_first()
            item['duration'] = title_cell.xpath(
                'span[7]/text()').extract_first()
            item['credit'] = remove_html(
                title_cell.xpath('span[4]').extract_first())
            item['genre'] = remove_html(
                title_cell.xpath('span[5]').extract_first())
            yield item

        next_path = '//div[@class="leftright"]/div[2]/span/a[contains(text(), "Next")]/@href'
        next_page = response.xpath(next_path)
        if next_page:
            url = response.urljoin(next_page.extract()[0])
            logging.info("Going for %s" % url)
            yield scrapy.Request(url, self.parse, dont_filter=True)
コード例 #17
0
    def parse2(self, response):
        """Scrape a full IMDb title page into an ImdbItem and also append
        the same record as one row to the CSV file at ``fileout``.

        Expects ``response.meta`` to carry 'idx', 'Id' and 'Link' set by
        the requesting callback.  Every field defaults to '' when the
        corresponding element is absent.
        """
        movie = []
        # Top-level page regions the later selectors are scoped to.
        main = response.css('div#content-2-wide')
        maintop = main.css(
            'div#main_top div.title-overview div#title-overview-widget')
        mainbottom = main.css('div#main_bottom')
        titlecast = main.css('div#titleCast')
        storyline = main.css('div#titleStoryLine')
        detail = main.css('div#titleDetails')

        titlebarwrapper = maintop.css(
            'div.vital div.title_block div.title_bar_wrapper')
        slatewrapper = maintop.css('div.vital div.slate_wrapper')

        # Poster URL: strip everything after the '_V1_' size marker to
        # get the full-resolution image; fall back to the plot-summary
        # layout's poster when the slate layout has none.
        poster = slatewrapper.css('div.poster a img::attr(src)').extract()
        poster = (poster[0].split('_V1_')[0] +
                  '_V1_.jpg') if len(poster) else ''
        if poster == '':
            poster = maintop.css(
                'div.minPosterWithPlotSummaryHeight div.poster a img[itemprop="image"]::attr(src)'
            ).extract()
            poster = (poster[0].split('_V1_')[0] +
                      '_V1_.jpg') if len(poster) else ''

        # Trailer (slate) link, made absolute.
        slate = slatewrapper.css(
            'div.slate a.slate_button.prevent-ad-overlay.video-modal::attr(href)'
        ).extract()
        slate = 'http://www.imdb.com' + slate[0].strip() if len(slate) else ''

        title = titlebarwrapper.css(
            'div.titleBar div.title_wrapper h1[itemprop="name"]::text'
        ).extract()
        title = title[0].strip() if len(title) else ''

        year = titlebarwrapper.css(
            'div.titleBar div.title_wrapper h1[itemprop="name"] span#titleYear a::text'
        ).extract()
        year = year[0].strip() if len(year) else ''

        # Release date is captured three ways: the visible subtext link,
        # its datePublished meta attribute, and (below) the details box.
        releasedate = titlebarwrapper.css(
            'div.titleBar div.title_wrapper div.subtext a[title="See more release dates"]::text'
        ).extract()
        releasedate = releasedate[0].strip() if len(releasedate) else ''

        releasedate1 = titlebarwrapper.css(
            'div.titleBar div.title_wrapper div.subtext a[title="See more release dates"] meta[itemprop="datePublished"]::attr(content)'
        ).extract()
        releasedate1 = releasedate1[0].strip() if len(releasedate1) else ''

        ratingbar = titlebarwrapper.css('div.ratings_wrapper div.imdbRating')
        rating = ratingbar.css(
            'div.ratingValue strong span[itemprop="ratingValue"]::text'
        ).extract()
        rating = rating[0].strip() if len(rating) else ''

        ratingcount = ratingbar.css(
            'a span[itemprop="ratingCount"]::text').extract()
        ratingcount = ratingcount[0].strip() if len(ratingcount) else ''

        plotsummary = maintop.css('div.plot_summary_wrapper div.plot_summary')
        description = plotsummary.css('div.summary_text::text').extract()
        description = description[0].strip() if len(description) else ''

        directorslist = plotsummary.css(
            'div.credit_summary_item span[itemprop="director"] a span[itemprop="name"]::text'
        ).extract()
        directors = ','.join(directorslist)

        writerslist = plotsummary.css(
            'div.credit_summary_item span[itemprop="creator"] a span[itemprop="name"]::text'
        ).extract()
        writers = ','.join(writerslist)

        titlereviewbar = maintop.css(
            'div.plot_summary_wrapper div.titleReviewBar ')

        metascore = titlereviewbar.css(
            'div.titleReviewBarItem a div.titleReviewBarSubItem span::text'
        ).extract()
        metascore = metascore[0].strip() if len(metascore) else ''

        # Popularity: scan the sub-items for the first one containing a
        # number; if none parses as an int, blank it out.
        popularity = titlereviewbar.css(
            'div.titleReviewBarItem div.titleReviewBarSubItem div span.subText::text'
        ).extract()
        for pop in popularity:
            number = re.findall(r'\d+', pop)
            if len(number) > 0:
                popularity = number[0]
                break
        try:
            test = int(popularity)
            pass
        except Exception as e:
            popularity = ''

        # "People also liked" recommendations: title ids from the first
        # recommendation page only.
        peoplealsolikelist = mainbottom.css(
            'div#titleRecs div#title_recs div.rec_const_picker div.rec_view div.rec_slide div.rec_page'
        )
        peoplemaylike = peoplealsolikelist[0].css(
            'div.rec_item::attr(data-tconst)').extract() if len(
                peoplealsolikelist) else ''
        peoplemaylike = ','.join(peoplemaylike)

        castslist = titlecast.css(
            'table.cast_list tr td.itemprop[itemprop="actor"] a span.itemprop::text'
        ).extract()
        actors = ','.join(castslist)

        keywordslist = storyline.css(
            'div.see-more.inline.canwrap[itemprop="keywords"] a span[itemprop="keywords"]::text'
        ).extract()
        keywords = ','.join(keywordslist)

        genreslist = storyline.css(
            'div.see-more.inline.canwrap[itemprop="genre"] a::text').extract()
        genreslist = ','.join(genreslist)

        # Details box: walk each txt-block and pick fields by the label
        # in its <h4> heading.
        itemslist = detail.css('div.txt-block')
        countrieslist = []
        countries = ''
        releasedate2 = ''
        runtime = ''
        for item in itemslist:
            itemname = item.css('h4.inline::text').extract()
            itemname = itemname[0].strip() if len(itemname) else ''
            if itemname == 'Country:':
                countrieslist = item.css('a[itemprop="url"]::text').extract()
                countries = ','.join(countrieslist)
            if itemname == 'Release Date:':
                texts = item.css('::text').extract()
                for text in texts:
                    if len(re.findall(r'\d+', text)) > 0:
                        releasedate2 = text.split('(')[0].strip()
                        break
            if itemname == 'Runtime:':
                runtime = item.css('time[itemprop="duration"]::text').extract()
                runtime = runtime[0].strip() if len(runtime) else ''
        # Fallbacks: runtime from the title bar's subtext, year from the
        # last whitespace-separated token of the details release date.
        if runtime == '':
            runtime = titlebarwrapper.css(
                'div.titleBar div.title_wrapper div.subtext time[itemprop="duration"]::text'
            ).extract()
            runtime = runtime[0].strip() if len(runtime) else ''
        if year == '':
            texts = releasedate2.split(' ')
            year = texts[len(texts) - 1]

        # Assemble the item; image/file URLs feed the media pipelines
        # (poster download is gated on the module-level crawl_image flag).
        imdbItem = ImdbItem()
        imdbItem['Idx'] = response.meta['idx']
        imdbItem['Id'] = response.meta['Id']
        imdbItem['Title'] = title
        imdbItem['Year'] = year
        imdbItem['Genres'] = genreslist
        imdbItem['Directors'] = directors
        imdbItem['Writers'] = writers
        imdbItem['Actors'] = actors
        imdbItem['Countries'] = countries
        imdbItem['ReleaseDate'] = releasedate
        imdbItem['ReleaseDate1'] = releasedate1
        imdbItem['ReleaseDate2'] = releasedate2
        imdbItem['Runtime'] = runtime
        imdbItem['Rating'] = rating
        imdbItem['RatingCount'] = ratingcount
        imdbItem['Popularity'] = popularity
        imdbItem['MetaScore'] = metascore
        imdbItem['PeopleMayLike'] = peoplemaylike
        imdbItem['Keywords'] = keywords
        imdbItem['Link'] = response.meta['Link']
        imdbItem['Description'] = description.replace('\"', '')
        imdbItem['image_urls'] = [poster] if (len(poster.strip()) > 0
                                              and crawl_image) else []
        imdbItem['file_urls'] = [slate] if len(slate.strip()) > 0 else []

        # Side effect: append the same record to the CSV at ``fileout``.
        with open(fileout, 'a', newline='') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL)
            spamwriter.writerow([
                response.meta['idx'], response.meta['Id'], title, year,
                genreslist, directors, writers, actors, countries, releasedate,
                releasedate1, releasedate2, runtime, rating, ratingcount,
                popularity, metascore, peoplemaylike, keywords,
                imdbItem['Link'], imdbItem['Description'], poster, slate
            ])
        yield imdbItem
コード例 #18
0
    def parse(self, response):
        """Parse a title's main page and hand off to ``tech_specification``.

        Builds the set of sub-page URLs (full cast, company, location,
        technical, parental guide, keywords) via module-level helpers,
        fills the main fields of an ImdbItem, and forwards both in
        ``meta`` to the technical-specification callback.
        """
        # Derive sibling sub-page URLs from the title page URL.
        fullcast_url = get_fullcast_url(response.url)
        company_url = get_company_url(response.url)
        location_url = get_location_url(response.url)
        technical_url = get_technical_url(response.url)
        parentguide_url = get_parent_guide_url(response.url)
        keyword_url = get_keyword_url(response.url)

        urls = {
            "fullcast": fullcast_url,
            "company": company_url,
            "location": location_url,
            "technical": technical_url,
            "parentguide": parentguide_url,
            "keyword": keyword_url
        }

        film = ImdbItem()
        # main
        # "tt1234567" segment of e.g. https://www.imdb.com/title/tt1234567/
        film["ttid"] = response.url.split('/')[4]
        film["name"] = response.xpath(
            "//div[@class='title_block']/div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/text()"
        ).extract()[0].strip()
        release_year = response.xpath(
            "//div[@class='title_bar_wrapper']/div[@class='titleBar']/div[@class='title_wrapper']/h1/span[@id='titleYear']/a/text()"
        )
        if release_year and len(release_year.extract()) > 0:
            film["release_year"] = release_year.extract()[0].strip()
        rating = response.xpath(
            "//div[@class='imdbRating']/div[@class='ratingValue']/strong/span/text()"
        )
        if rating and len(rating.extract()) > 0:
            film["rating"] = response.xpath(
                "//div[@class='imdbRating']/div[@class='ratingValue']/strong/span/text()"
            ).extract()[0].strip()
            film["vote"] = response.xpath(
                "//div[@class='ratings_wrapper']/div[@class='imdbRating']/a/span[@itemprop='ratingCount']/text()"
            ).extract()[0].strip()

        # Details box: the position of a label in the <h4> text list is
        # used (1-based) as the positional index of the matching
        # txt-block div.  NOTE(review): this assumes headings and blocks
        # line up one-to-one — confirm against the page markup.
        detail_items = response.xpath(
            "//div[@id='main_bottom']/div[@id='titleDetails']/div[@class='txt-block']/h4/text()"
        ).extract()
        if "Language:" in detail_items:
            lang_index = detail_items.index("Language:") + 1
            languages = response.xpath(
                "//div[@id='main_bottom']/div[@id='titleDetails']/div[@class='txt-block']["
                + str(lang_index) + "]/a/text()").extract()
            if len(languages) > 0:
                film["primary_language"] = languages[0].strip()
        if "Country:" in detail_items:
            country_index = detail_items.index("Country:") + 1
            countries = response.xpath(
                "//div[@id='main_bottom']/div[@id='titleDetails']/div[@class='txt-block']["
                + str(country_index) + "]/a/text()").extract()
            film["country"] = [country.strip() for country in countries]

        # Storyline box: same positional trick, but offset by 2 here —
        # presumably to skip an extra div; verify against the markup.
        storyline_items = response.xpath(
            "//div[@id='main_bottom']/div[@id='titleStoryLine']/div/h4/text()"
        ).extract()
        if "Genres:" in storyline_items:
            genre_index = storyline_items.index("Genres:") + 2
            genres = response.xpath(
                "//div[@id='main_bottom']/div[@id='titleStoryLine']/div[" +
                str(genre_index) + "]/a/text()").extract()
            film["genre"] = [genre.strip() for genre in genres]

        yield scrapy.Request(urls["technical"],
                             callback=self.tech_specification,
                             meta={
                                 'item': film,
                                 "urls": urls
                             })
コード例 #19
0
    def parse_imdb(self, response):
        item = ImdbItem()
        try:
            item['video_title'] = "".join(
                response.xpath('//*[@class="fk-3"]/div[@class="hdd"]/h3/text()'
                               ).extract())
            item['video_rating'] = "".join(
                response.xpath(
                    '//*[@class="fk-3"]/div[@class="hdd"]/span/i/text()').
                extract())
            content = response.xpath(
                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li').extract()
            for i in range(0, len(content)):
                if "片名" in content[i]:
                    if i == 0:
                        item['video_name'] = "".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[1]/a/text()'
                            ).extract())
                if "别名" in content[i]:
                    if i == 1:
                        item['video_alias'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[2]/a/text()'
                            ).extract())
                if "导演" in content[i]:
                    if i == 1:
                        item['video_director'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[2]/a/text()'
                            ).extract())
                    elif i == 2:
                        item['video_director'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[3]/a/text()'
                            ).extract())
                if "主演" in content[i]:
                    if i == 2:
                        item['video_actor'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[3]/a/text()'
                            ).extract())
                    if i == 3:
                        item['video_actor'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[4]/a/text()'
                            ).extract())
                if "上映时间" in content[i]:
                    if i == 4:
                        item['video_year'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a[1]/text()'
                            ).extract())
                        a = response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a'
                        ).extract()
                        length = len(a) - 1
                        try:
                            item['video_color'] = "".join(
                                response.xpath(
                                    '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a/text()'
                                ).extract()[length])
                        except Exception as e:
                            item['video_color'] = ""
                        try:
                            type = "|".join(
                                response.xpath(
                                    '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a/text()'
                                ).extract()[1:length])
                            maohao = type.split(":")
                            if len(maohao) > 0:
                                item['video_type'] = maohao[0]
                            else:
                                item['video_type'] = ""
                        except Exception as e:
                            item['video_type'] = ""
                    if i == 5:
                        item['video_year'] = "".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[1]/text()'
                            ).extract())
                        a = response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a'
                        ).extract()
                        length = len(a) - 1
                        try:
                            item['video_color'] = "".join(
                                response.xpath(
                                    '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a/text()'
                                ).extract()[length])
                        except Exception as e:
                            item['video_color'] = ""
                        try:
                            type = "|".join(
                                response.xpath(
                                    '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a/text()'
                                ).extract()[1:length])
                            maohao = type.split(":")
                            if len(maohao) > 0:
                                item['video_type'] = maohao[0]
                            else:
                                item['video_type'] = ""
                        except Exception as e:
                            item['video_type'] = ""

                if "国家" in content[i]:
                    if i == 5:
                        item['video_area'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[1]/text()'
                            ).extract())
                        item['video_voice'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[2]/text()'
                            ).extract())
                    if i == 6:
                        item['video_area'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[7]/a[1]/text()'
                            ).extract())
                        item['video_voice'] = "|".join(
                            response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[7]/a[2]/text()'
                            ).extract())
            item['video_length'] = "".join(
                response.xpath(
                    '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[@class="nolink"]/text()'
                ).extract()).replace("&nbsp", "")
            item['video_language'] = "".join(
                response.xpath(
                    '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[@class="nolink"]/a/text()'
                ).extract())
            item['video_summary'] = "".join(
                response.xpath(
                    '//*[@class="fk-4 clear"]/div[@class="bdd clear"]/i/text()'
                ).extract()).lstrip().rstrip().replace("<br>", "")
            item['video_url'] = response.url
            yield item
        except Exception as error:
            log(error)
コード例 #20
0
    def parse_review_page(self, response):
        """Parse an IMDb demographic-ratings page and yield one ImdbItem.

        Movie-level fields (title, MPAA rating, release date, director,
        actors) are carried over from the listing-page request through
        ``response.meta``; the demographic rating breakdown is scraped
        from this page's ratings table.

        NOTE(review): the cell indices below (6-9, 11-14, 17-18 for
        ratings; 5 and 10 for counts) depend on the exact layout of the
        IMDb ratings table -- confirm against a live page if the site
        markup changes.
        """
        # Run each XPath query once and index into the result lists,
        # instead of re-querying the DOM for every single cell as the
        # original copy-paste blocks did.
        big_cells = response.xpath('//div[@class="bigcell"]/text()').extract()
        small_cells = response.xpath(
            '//div[@class="smallcell"]/a/text()').extract()

        def rating_at(index):
            """Float rating at *index* in big_cells, or "" when absent.

            The original caught only ValueError here, so a page with a
            short/missing table raised an uncaught IndexError; catch both.
            """
            try:
                return float(big_cells[index])
            except (IndexError, ValueError):
                return ""

        def count_at(index):
            """Int vote count at *index* (commas stripped), or "" when absent.

            The original caught only IndexError here, so non-numeric cell
            text raised an uncaught ValueError; catch both.
            """
            try:
                return int(small_cells[index].strip().replace(',', ""))
            except (IndexError, ValueError):
                return ""

        item = ImdbItem()
        # Movie metadata forwarded from the previous callback via meta.
        item['title'] = response.meta['title']
        item['MPAA_rating'] = response.meta['MPAA_rating']
        item['release_date'] = response.meta['release_date']
        item['director'] = response.meta['director']
        item['actors'] = response.meta['actors']
        # Male demographic ratings (cells 6-9) and total male vote count.
        item['male_teen_rating'] = rating_at(6)
        item['male_youngAdult_rating'] = rating_at(7)
        item['male_adult_rating'] = rating_at(8)
        item['male_elder_rating'] = rating_at(9)
        item['male_ratingCount'] = count_at(5)
        # Female demographic ratings (cells 11-14) and total female count.
        item['female_teen_rating'] = rating_at(11)
        item['female_youngAdult_rating'] = rating_at(12)
        item['female_adult_rating'] = rating_at(13)
        item['female_elder_rating'] = rating_at(14)
        item['female_ratingCount'] = count_at(10)
        # US vs non-US user average ratings (cells 17-18).
        item['non_USusers'] = rating_at(18)
        item['us_users'] = rating_at(17)

        yield item