Esempio n. 1
0
    def parse_item(self, response):
        """Scrape the title, director, actor and star fields of a movie page.

        Each field is stored as the list of strings returned by ``extract()``
        for its XPath, and a single ``DoubanMovieItem`` is yielded.
        """
        selector = Selector(response)
        movie = DoubanMovieItem()

        # (item field, XPath of the text nodes to collect), in assignment
        # order. Extraction of release_time (span[11]) and time (span[13])
        # is currently disabled, so those fields are not populated.
        field_queries = (
            ('title', '//*[@id="content"]/h1/span[1]/text()'),
            ('director', '//*[@id="info"]/span[1]/span[2]/a/text()'),
            ('actor', '//*[@id="info"]/span[3]/span[2]/a/text()'),
            ('star', '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'),
        )

        # Run every query first, then fill the item.
        extracted = [(field, selector.xpath(query).extract())
                     for field, query in field_queries]
        for field, values in extracted:
            movie[field] = values

        yield movie
Esempio n. 2
0
    def parse(self, response):
        """Parse one page of the movie list and follow the next-page link.

        Yields one ``DoubanMovieItem`` (rank, movie_name, score, quote) per
        ``<li>`` under ``ol.grid_view``, followed by a ``Request`` for the
        next page when a ``span.next`` link is present.
        """
        for movie in response.xpath('//ol[@class="grid_view"]/li'):
            # Build a fresh item for every movie. Reusing a single instance
            # makes all yielded items alias the same object, so later
            # iterations overwrite data that consumers may still be holding.
            item = DoubanMovieItem()
            item['rank'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract_first()
            item['movie_name'] = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract_first()
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract_first()
            item['quote'] = movie.xpath(
                './/span[@class="inq"]/text()').extract_first()
            yield item

        next_url = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if next_url:
            # The extracted href is appended to the list's base URL.
            next_url = 'https://movie.douban.com/top250' + next_url
            yield Request(next_url, headers=self.headers)
Esempio n. 3
0
 def parse(self, response):
     """Parse one JSON page of search results and request the next page.

     For every entry under the ``data`` key a ``DoubanMovieItem`` is filled
     (name, directors, rate, actors, cover URL), the cover image is
     downloaded into ``self.file_path`` and the item is yielded. Afterwards
     ``self.offset`` is advanced by 20 and the next page is requested.
     Crawling stops when a page has no entries or the offset exceeds 1000.
     """
     film_list = json.loads(response.body.decode())
     # The payload is indexed by 'data' below, so an "empty" page is an
     # empty 'data' list; comparing the whole payload to [] never matched.
     films = film_list.get('data') if isinstance(film_list, dict) else None
     if not films or self.offset > 1000:
         return
     for film in films:
         # One item per film so yielded items do not alias each other.
         item = DoubanMovieItem()
         item['film_name'] = film['title']
         item['film_directors'] = film['directors']
         item['film_rate'] = film['rate']
         item['film_actors'] = film['casts']
         item['film_image_url'] = film['cover']
         # Save the cover as "<file_path>/<film name>.<url extension>".
         # NOTE(review): this is a blocking download inside the callback,
         # and a title containing "/" would change the target path.
         extension = item['film_image_url'].split(".")[-1]
         urllib.request.urlretrieve(
             item['film_image_url'],
             self.file_path + "/" + item['film_name'] + "." + extension)
         yield item
     self.offset = self.offset + 20
     new_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=1&start=' + str(
         self.offset)
     yield scrapy.Request(url=new_url, callback=self.parse)
Esempio n. 4
0
 def parse(self, response):
     """Parse one listing page and follow the pagination link.

     Yields a ``DoubanMovieItem`` (name, star, evaluation, introduction) for
     each ``<li>`` under ``ol.grid_view``, then a request for the next page
     when ``span.next > a`` carries an href.
     """
     for block in response.xpath('//ol[@class="grid_view"]/li'):
         rating_text = block.xpath(
             ".//div[@class='star']/span[4]/text()").extract_first()
         # extract_first() returns None when the node is missing, and the
         # regex may not match. Either case used to raise and abort the
         # whole page (pagination included); fall back to None instead.
         match = None
         if rating_text is not None:
             match = self.eval_re.search(rating_text)

         item = DoubanMovieItem()
         item['name'] = block.css('span.title::text').extract_first()
         item['star'] = block.xpath(
             ".//span[@class='rating_num']/text()").extract_first()
         # group() returns the matched string; groups() would give a tuple.
         item['evaluation'] = match.group() if match else None
         item['introduction'] = block.css('span.inq::text').extract_first()
         yield item  # the next iteration resumes right after this yield

     # Unlike the text() queries above, this reads an attribute of the tag:
     # "::attr(href)" names the attribute to return, e.g. for
     # <a href="?start=25&amp;filter=">...</a>
     next_url = response.css('span.next > a::attr(href)').extract_first()
     if next_url:
         # urljoin builds the complete URL from the extracted href.
         next_url = response.urljoin(next_url)
         yield scrapy.Request(next_url, callback=self.parse)
Esempio n. 5
0
    def parse_item(self, response):
        """Parse a movie detail page into a ``DoubanMovieItem``.

        Fills the id, name, year, rating, vote count, hot short comments,
        seen/wish counters, the ``#info`` fields (director, writer, actors,
        runtime, genres, language, region, dialect), description, tags and
        poster URL, then yields the item.

        Any exception raised while parsing is logged, counted in the
        module-level ``failed_count`` and swallowed, so nothing is yielded
        for that page. ``real_parse_count`` is incremented on every call.
        """
        global failed_count
        global real_parse_count
        item = DoubanMovieItem()
        try:
            real_parse_count += 1
            print("real parse count = %d" % (real_parse_count))

            # The movie id is the second-to-last "/"-separated URL segment.
            movie_id = response.url.split('/')[-2].strip()
            item["movie_id"] = movie_id

            # Movie name.
            name = response.xpath(
                '//div[@id="content"]/h1/span[1]/text()').extract_first()
            item["movie_name"] = name.strip() if name else ""

            # Movie year, with the surrounding brackets stripped.
            year = response.xpath(
                '//div[@id="content"]/h1/span[2]/text()').extract_first()
            item["movie_year"] = year.strip("()() ") if year else ""

            # Movie rating; -1 marks a missing value.
            rate = response.xpath(
                "//div[@class='rating_self clearfix']/strong/text()"
            ).extract_first()
            item["movie_rate"] = float(rate.strip() if rate else "-1")

            # Number of people who rated; -1 marks a missing value.
            rate_num = response.xpath(
                "//span[@property='v:votes']/text()").extract_first()
            item["movie_rate_people"] = int(
                rate_num.strip() if rate_num else "-1")

            # Hot short comments: text, vote count and rating title.
            comments = response.xpath(
                "//div[@id='hot-comments']//div[@class='comment-item']//div[@class='comment']/p/text()"
            ).extract()
            votes = response.xpath(
                "//div[@id='hot-comments']//div[@class='comment-item']//div[@class='comment']//span[@class='votes pr5']/text()"
            ).extract()
            rates = response.xpath(
                "//div[@id='hot-comments']//div[@class='comment-item']//span[@class='comment-info']/span[1]/@title"
            ).extract()
            # Only combine the three lists when they line up one-to-one.
            if len(comments) == len(votes) == len(rates):
                item["movie_hot_short_comments"] = [
                    {'comment': comment, 'votes': int(vote), 'rates': rating}
                    for comment, vote, rating in zip(comments, votes, rates)
                ]

            # Seen / wish counters; the last three characters of each link
            # text are dropped before converting to int.
            seenwish = response.xpath(
                "//div[@class='subject-others-interests-ft']//a//text()"
            ).extract()
            if len(seenwish) == 2:
                item['movie_seen'] = int(seenwish[0][:-3])
                item['movie_wishes'] = int(seenwish[1][:-3])

            # Movie info block; its raw HTML is searched with regexes below.
            info = response.xpath("//div[@id='info']")
            infostr = ''.join(info.extract()).strip()

            director = info.xpath("span[1]/span[2]/a/text()").extract()
            self.add_array("movie_director", director, item)

            writor = info.xpath("span[2]/span[2]/a/text()").extract()
            self.add_array("movie_writor", writor, item)

            actors = info.xpath("span[3]/span[2]/a/text()").extract()
            self.add_array("movie_actors", actors, item)

            runtime = info.xpath(
                "span[@property='v:runtime']/@content").extract_first()
            item["movie_time"] = float(runtime.strip() if runtime else "-1")

            types = info.xpath("span[@property='v:genre']/text()").extract()
            self.add_array("movie_type", types, item)

            # Language, region and dialect are best-effort: a failure in one
            # of them must not discard the whole item.
            try:
                lang = re.search(language_pattern, infostr)
                if lang:
                    item["movie_language"] = lang.group(1).strip()
            except Exception:
                pass

            try:
                regionmatch = re.search(region_pattern, infostr)
                if regionmatch:
                    item["movie_region"] = regionmatch.group(1).strip()
            except Exception:
                pass

            try:
                dialectmatch = re.search(dialect_pattern, infostr)
                if dialectmatch:
                    item["movie_dialect"] = dialectmatch.group(1).strip()
            except Exception:
                pass

            # extract_first() returns None when there is no summary, so
            # strip only after the None check; stripping first would raise
            # and drop the entire item.
            desc = response.xpath(
                "//span[@property='v:summary']/node()").extract_first()
            item["movie_desc"] = desc.strip() if desc else ""

            tags = response.xpath(
                "//div[@class='tags-body']/a/text()").extract()
            self.add_array("movie_tags", tags, item)

            pic = response.xpath(
                "//div[@id='mainpic']/a/img/@src").extract_first()
            item["movie_pic_url"] = pic

            yield item

        except Exception as e:
            # Log and count the failure; the page simply produces no item.
            logging.info("Parse error:%s" % (str(e)))
            failed_count += 1
            print("failed_count = %d" % (failed_count))
Esempio n. 6
0
    def parse_movie(self, response):
        """Parse a movie detail page and yield one ``DoubanMovieItem``.

        Most fields hold the raw list returned by ``extract()``. Country,
        language, the five rating percentages and the comment/question
        counters are single stripped strings taken by position, so an
        ``IndexError`` is raised when the page has fewer entries than
        expected.
        """
        print(response.status)
        _setDNSCache()
        movie_item = DoubanMovieItem()
        # Movie id
        movie_item['movie_id'] = response.xpath(
            './/li/span[@class="rec"]/@id').extract()
        # Movie title
        movie_item['movie_title'] = response.xpath(
            './/h1/span[@property="v:itemreviewed"]/text()').extract()
        # Release date
        movie_item['release_date'] = response.xpath(
            './/h1/span[@class="year"]/text()').extract()
        # Director
        movie_item['directedBy'] = response.xpath(
            './/a[@rel="v:directedBy"]/text()').extract()
        # Starring actors
        movie_item['starring'] = response.xpath(
            './/a[@rel="v:starring"]/text()').extract()
        # Genre
        movie_item['genre'] = response.xpath(
            './/span[@property="v:genre"]/text()').extract()
        # Runtime
        movie_item['runtime'] = response.xpath(
            './/span[@property="v:runtime"]/text()').extract()
        # Country and language: the first two bare text nodes of #info that
        # are neither blank nor a lone "/" separator. Filter once and reuse.
        info_texts = response.xpath('.//div[@id="info"]/text()').extract()
        info_values = [
            text.strip() for text in info_texts
            if text.strip() not in ('', '/')
        ]
        movie_item['country'] = info_values[0]
        movie_item['language'] = info_values[1]
        # Rating
        movie_item['rating_num'] = response.xpath(
            './/strong[@class="ll rating_num"]/text()').extract()
        # Number of people who rated
        movie_item['vote_num'] = response.xpath(
            './/span[@class="v:votes"]/text()'.replace(
                '@class="v:votes"', '@property="v:votes"')).extract()
        # Percentage of 5..1 star ratings: query once, then index in page
        # order (the first entry is stored as 5 stars, the last as 1 star).
        rating_percentages = response.xpath(
            './/span[@class="rating_per"]/text()').extract()
        for position, stars in enumerate((5, 4, 3, 2, 1)):
            movie_item['rating_per_stars%d' % stars] = \
                rating_percentages[position].strip()
        # Plot summary: prefer the "all hidden" span, otherwise fall back to
        # the v:summary span.
        intro = response.xpath('.//span[@class="all hidden"]/text()').extract()
        if intro:
            movie_item['intro'] = intro
        else:
            movie_item['intro'] = response.xpath(
                './/span[@property="v:summary"]/text()').extract()
        # Short-comment count and question count come from the first and
        # second matching header link respectively; query once.
        header_links = response.xpath(
            './/div[@class="mod-hd"]/h2/span/a/text()').extract()
        # Number of short comments
        movie_item['comment_num'] = header_links[0].strip()
        # Number of questions
        movie_item['question_num'] = header_links[1].strip()

        # Emit the finished item
        yield movie_item
Esempio n. 7
0
    def parse_content(self, response):
        """Combine ``self.movie`` with details scraped from the movie page.

        The first eight entries of ``self.movie`` supply id, tag, title,
        director, actor, rate, star and cover. Genre, region, release year
        and runtime are read from the ``#info`` block, and the vote count
        and 5..1 star percentages from ``#interest_sectl``. The assembled
        ``DoubanMovieItem`` is printed.

        Returns ``[-2]`` (after printing the response text) when the page
        has no ``#info`` element; otherwise returns ``None``.

        NOTE(review): the item is only printed, never returned or yielded --
        confirm whether that is intended.
        """
        movieid = self.movie[0]
        tag = self.movie[1]
        title = self.movie[2]
        director = self.movie[3]
        actor = self.movie[4]
        rate = self.movie[5]
        star = self.movie[6]
        cover = self.movie[7]

        html = BeautifulSoup(response.body, 'lxml')
        info = html.select('#info')
        if not info:
            print(response.text)
            return [-2]
        # Reuse the match above instead of selecting '#info' a second time.
        info_lines = info[0].get_text().split('\n')
        print(info_lines)

        category = ''
        district = ''
        showtime = ''
        length = ''
        for line in info_lines:
            parts = line.split(':')
            label = parts[0]
            value = parts[-1].strip()
            if label == '类型':  # genre
                category = value
            elif label == '制片国家/地区':  # country / region of production
                district = value
            elif label == '上映日期':  # release date: keep the part before "-"
                showtime = value.split('-')[0]
            elif label == '片长':  # runtime: keep the first run of digits
                # Raw string avoids the invalid "\d" escape; a runtime
                # without digits leaves the field empty instead of raising.
                digits = re.findall(r'\d+', value)
                length = digits[0] if digits else ''

        # Genres are "/"-separated on the page; store them comma-separated.
        # Slicing an empty string is a no-op, so no length check is needed.
        category = category.replace(r'/', ',')[:30]
        district = district[:50]

        rate_count = html.select(
            '#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > div > div.rating_sum > a > span'
        )[0].get_text()

        # The n-th row of div.ratings-on-weight holds the percentage for
        # (6 - n) stars: row 1 is 5 stars, row 5 is 1 star.
        rate_selector = (
            '#interest_sectl > div.rating_wrap.clearbox > '
            'div.ratings-on-weight > div:nth-of-type(%d) > span.rating_per')
        percentages = [
            html.select(rate_selector % row)[0].get_text().split('%')[0]
            for row in range(1, 6)
        ]
        rate5, rate4, rate3, rate2, rate1 = percentages

        item = DoubanMovieItem()
        item['movieid'] = movieid
        item['title'] = title
        item['tag'] = tag
        item['directors'] = director
        item['actors'] = actor
        item['showtime'] = showtime
        item['length'] = length
        item['district'] = district
        item['category'] = category
        item['star'] = star
        item['rate'] = rate
        item['rate_count'] = rate_count
        item['rate5'] = rate5
        item['rate4'] = rate4
        item['rate3'] = rate3
        item['rate2'] = rate2
        item['rate1'] = rate1
        item['cover'] = cover
        print('###### ')
        print(item)
        print('######')
    def parse(self, response):
        """Parse a movie page, yield its item and follow recommendations.

        Stops immediately once the module-level ``count`` reaches
        10000000; otherwise increments it. Yields a ``DoubanMovieItem``
        with id, name, year, rating, the ``#info`` fields, description,
        tags and poster URL, then a ``Request`` for every recommended movie
        whose id is not yet flagged in the module-level ``parsedids``.

        Any exception raised while parsing is logged and swallowed.
        """
        global count
        global parsedids
        if count == 10000000:
            return
        count += 1

        item = DoubanMovieItem()
        try:
            # The movie id is the second-to-last "/"-separated URL segment.
            movie_id = response.url.split('/')[-2].strip()
            item["movie_id"] = movie_id

            # Movie name.
            name = response.xpath(
                '//div[@id="content"]/h1/span[1]/text()').extract_first()
            item["movie_name"] = name.strip() if name else ""

            # Movie year, with the surrounding brackets stripped.
            year = response.xpath(
                '//div[@id="content"]/h1/span[2]/text()').extract_first()
            item["movie_year"] = year.strip("()() ") if year else ""

            # Movie rating; defaults to 2.5 when the page has none.
            rate = response.xpath(
                "//div[@class='rating_self clearfix']/strong/text()"
            ).extract_first()
            item["movie_rate"] = float(rate.strip() if rate else "2.5")

            # Movie info block; its raw HTML is searched with regexes below.
            info = response.xpath("//div[@id='info']")
            infostr = ''.join(info.extract()).strip()

            director = info.xpath("span[1]/span[2]/a/text()").extract()
            self.add_array("movie_director", director, item)

            writor = info.xpath("span[2]/span[2]/a/text()").extract()
            self.add_array("movie_writor", writor, item)

            actors = info.xpath("span[3]/span[2]/a/text()").extract()
            self.add_array("movie_actors", actors, item)

            runtime = info.xpath(
                "span[@property='v:runtime']/@content").extract_first()
            item["movie_time"] = float(runtime.strip() if runtime else "0")

            types = info.xpath("span[@property='v:genre']/text()").extract()
            self.add_array("movie_type", types, item)

            # Language, region and dialect are best-effort: a failure in one
            # of them must not discard the whole item.
            try:
                lang = re.search(language_pattern, infostr)
                if lang:
                    item["movie_language"] = lang.group(1).strip()
            except Exception:
                pass

            try:
                regionmatch = re.search(region_pattern, infostr)
                if regionmatch:
                    item["movie_region"] = regionmatch.group(1).strip()
            except Exception:
                pass

            try:
                dialectmatch = re.search(dialect_pattern, infostr)
                if dialectmatch:
                    item["movie_dialect"] = dialectmatch.group(1).strip()
            except Exception:
                pass

            # extract_first() returns None when there is no summary, so
            # strip only after the None check; stripping first would raise
            # and drop both the item and the follow-up requests.
            desc = response.xpath(
                "//span[@property='v:summary']/node()").extract_first()
            item["movie_desc"] = desc.strip() if desc else ""

            tags = response.xpath(
                "//div[@class='tags-body']/a/text()").extract()
            self.add_array("movie_tags", tags, item)

            pic = response.xpath(
                "//div[@id='mainpic']/a/img/@src").extract_first()
            item["movie_pic_url"] = pic

            yield item

            # Follow each recommended movie once, using parsedids as the
            # record of ids that have already been scheduled.
            next_pages = response.xpath(
                "//div[@class='recommendations-bd']/dl/dd/a/@href").extract()
            for page in next_pages:
                page_id = int(page.split('/')[-2])
                if parsedids[page_id]:
                    continue
                parsedids[page_id] = True
                yield Request(page, callback=self.parse)
        except Exception as e:
            # Log the failure; the page produces nothing further.
            logging.info("Parse error:%s" % (str(e)))