Ejemplo n.º 1
0
    def parse_comment_url(self, response):
        """Yield one ``DoubanMovieCommentItem`` per short comment on the page.

        Args:
            response: Scrapy response for a short-comment listing page.

        Yields:
            A separate item for every ``div.comment-item`` node. Each item
            carries the page-level ``movie_id`` and ``URL`` plus the
            per-comment fields.

        Raises:
            IndexError: If a comment node lacks one of the fields that are
                read with ``extract()[0]``.
            ValueError: If the extracted comment id is not an integer.
        """
        print(response.status)
        print('comment_url!')
        _setDNSCache()
        print()

        # Page-level values are the same for every comment; query them once.
        movie_id = response.xpath('//div[@class="fright"]/a/@name').extract()
        page_url = response.url

        for node in response.xpath('//div[@class="comment-item"]'):
            # Build a fresh item per comment. Re-yielding one shared instance
            # makes every yielded item alias the same object, so a later
            # iteration overwrites the fields of items still in flight.
            comment = DoubanMovieCommentItem()
            comment['movie_id'] = list(movie_id)
            comment['URL'] = page_url
            # Unique id of the short comment.
            comment['comment_id'] = int(node.xpath('div[@class="comment"]/h3/span[@class="comment-vote"]/input/@value').extract()[0].strip())
            # How many people marked the comment as useful.
            comment['useful_num'] = node.xpath('div[@class="comment"]/h3/span[@class="comment-vote"]/span/text()').extract()[0].strip()
            # Rating: the class attribute of the second span in comment-info.
            comment['star'] = node.xpath('div[@class="comment"]/h3/span[@class="comment-info"]/span[2]/@class').extract()[0].strip()
            # Comment time (kept as the raw list returned by extract()).
            comment['time'] = node.xpath('div[@class="comment"]/h3/span[@class="comment-info"]/span[@class="comment-time "]/@title').extract()
            # Comment text (kept as the raw list returned by extract()).
            comment['content'] = node.xpath('div[@class="comment"]/p/text()').extract()
            # Commenter name.
            comment['people'] = node.xpath('div[@class="avatar"]/a/@title').extract()[0]
            # Commenter profile page.
            comment['people_url'] = node.xpath('div[@class="avatar"]/a/@href').extract()[0]

            yield comment
Ejemplo n.º 2
0
    def parse_movie(self, response):
        """Build a single ``DoubanMovieItem`` from a movie page response.

        Every field is stored as the raw list returned by ``extract()``.
        The country/language, per-star rating percentages, short-comment
        count and question count are not collected.

        Args:
            response: Scrapy response for the movie page.

        Yields:
            The populated ``DoubanMovieItem``.
        """
        # Debug output: HTTP status and two selector lists.
        print(response.status)
        print(response.xpath('//li/span[@class="rec"]/@id'))
        print(response.xpath('//span[@class="rating_per"]/text()'))
        _setDNSCache()

        # (item field, XPath) pairs, applied in this order.
        field_queries = (
            # Movie id.
            ('movie_id', '//li/span[@class="rec"]/@id'),
            # Movie title.
            # NOTE(review): this query has no /text(), so it extracts the
            # span's markup rather than its text -- confirm that is intended.
            ('movie_title', '//*[@id="content"]/h1/span[1]'),
            # Release year.
            ('release_date', './/h1/span[@class="year"]/text()'),
            # Director(s).
            ('directedBy', './/a[@rel="v:directedBy"]/text()'),
            # Leading actors.
            ('starring', './/a[@rel="v:starring"]/text()'),
            # Genre(s).
            ('genre', './/span[@property="v:genre"]/text()'),
            # Running time.
            ('runtime', './/span[@property="v:runtime"]/text()'),
            # Rating score.
            ('rating_num', './/strong[@class="ll rating_num"]/text()'),
            # Number of votes.
            ('vote_num', './/span[@property="v:votes"]/text()'),
        )

        item = DoubanMovieItem()
        for field, query in field_queries:
            item[field] = response.xpath(query).extract()

        # Synopsis: use the "all hidden" span when it yields any text,
        # otherwise fall back to the v:summary span.
        hidden_intro = response.xpath(
            './/span[@class="all hidden"]/text()').extract()
        item['intro'] = hidden_intro or response.xpath(
            './/span[@property="v:summary"]/text()').extract()

        # Final output.
        yield item
Ejemplo n.º 3
0
 def parse_people_url(self, response):
     """Yield a ``DoubanMovieUser`` item scraped from a user profile page.

     Every field is stored as the raw list returned by ``extract()``.

     Args:
         response: Scrapy response for the profile page.

     Yields:
         The populated ``DoubanMovieUser`` item.
     """
     _setDNSCache()
     user = DoubanMovieUser()
     # A second, position-based location query used to run just before this
     # one; its result was overwritten immediately, so only the class-based
     # query is kept.
     user['location'] = response.xpath('//div[@class="user-info"]/a/text()').extract()
     user['introduction'] = response.xpath('//span[@id="intro_display"]/text()').extract()
     # Link text inside the header of the "friend" block.
     user['friend'] = response.xpath('//div[@id="friend"]/h2/span/a/text()').extract()
     # Link text inside the "rev-link" paragraph.
     user['be_attention'] = response.xpath('//p[@class="rev-link"]/a/text()').extract()
     yield user
Ejemplo n.º 4
0
 def after_login(self, response):
     """Schedule one movie-page request per id listed in ``movie_id.out``.

     Sets the ``Host`` entry of ``self.headers`` to ``movie.douban.com``
     before building the requests, and forwards the cookie jar found in
     ``response.meta`` to each request.

     Args:
         response: Scrapy response whose ``meta['cookiejar']`` is reused.

     Yields:
         ``scrapy.Request`` objects with ``self.parse_movie`` as callback.
     """
     _setDNSCache()
     self.headers['Host'] = "movie.douban.com"
     # ndmin=1 keeps the result one-dimensional even when the file holds a
     # single id; otherwise tolist() returns a bare int and the loop below
     # raises TypeError.
     movie_ids = np.loadtxt('movie_id.out', dtype='i', ndmin=1).tolist()  # Top250
     for mid in movie_ids:
         yield scrapy.Request(
             url='https://movie.douban.com/subject/%s/' % mid,
             meta={'cookiejar': response.meta['cookiejar']},
             headers=self.headers,
             callback=self.parse_movie)
Ejemplo n.º 5
0
 def after_login(self, response):
     """Schedule comment-page requests for the ids in ``movie_id.out``.

     Only the ids from index 225 onwards are used. For each id the same
     comments URL is requested twice: once for ``parse_comment_url`` and
     once, with ``dont_filter=True``, for ``parse_next_page``.

     Sets the ``Host`` entry of ``self.headers`` to ``movie.douban.com``
     before building the requests.

     Args:
         response: Scrapy response whose ``meta['cookiejar']`` is reused.

     Yields:
         ``scrapy.Request`` objects, two per movie id.
     """
     print(response.status)
     print('after_login!')
     _setDNSCache()
     print()
     self.headers['Host'] = "movie.douban.com"
     # ndmin=1 keeps the result one-dimensional even when the file holds a
     # single id; otherwise tolist() returns a bare int and the slice below
     # raises TypeError.
     all_ids = np.loadtxt('movie_id.out', dtype='i', ndmin=1).tolist()  # Top250
     # NOTE(review): the 225 offset is hard-coded -- presumably to resume an
     # interrupted crawl; confirm before reuse.
     movie_ids = all_ids[225:]
     for mid in movie_ids:
         comments_url = 'https://movie.douban.com/subject/%s/comments' % mid
         yield scrapy.Request(url=comments_url,
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_comment_url)
         # Same URL again, so the duplicate filter must be bypassed.
         yield scrapy.Request(url=comments_url,
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_next_page,
                              dont_filter=True)
Ejemplo n.º 6
0
    def parse_next_page(self, response):
        """Follow the "next" pagination link, then crawl the collected pages.

        Each call records the absolute URL of the page's "next" link in
        ``self.url_set``. While a next link exists and ``self.page`` is below
        ``self.pages``, the link is requested with this method as callback.
        Otherwise every URL in ``self.url_set`` is requested with
        ``self.parse_comment_url`` as callback.

        Args:
            response: Scrapy response whose ``meta['cookiejar']`` is reused.

        Yields:
            ``scrapy.Request`` objects for the next page or for the collected
            comment pages.
        """
        print('Next_page!')
        print()
        _setDNSCache()

        # extract() returns an empty list when the page has no "next" link.
        # Indexing it unconditionally raised IndexError on the last page, so
        # the fan-out over self.url_set below never ran in that case.
        next_hrefs = response.xpath('//a[@class="next"]/@href').extract()
        next_url = response.urljoin(next_hrefs[0]) if next_hrefs else None
        if next_url:
            self.url_set.append(next_url)

        try:
            if next_url and (self.page < self.pages):
                self.page += 1
                # Hand the "next page" link back to this method for parsing.
                yield scrapy.Request(
                    url=next_url,
                    meta={'cookiejar': response.meta['cookiejar']},
                    headers=self.headers,
                    callback=self.parse_next_page,
                    dont_filter=True)
            else:
                print('No more pages!')
                print(self.url_set)
                for url in self.url_set:
                    yield scrapy.Request(
                        url=url,
                        meta={'cookiejar': response.meta['cookiejar']},
                        headers=self.headers,
                        callback=self.parse_comment_url)
        except (KeyError, TypeError, ValueError):
            # KeyError: no 'cookiejar' in response.meta. TypeError/ValueError:
            # a URL that scrapy.Request rejects. A bare except would also
            # catch GeneratorExit when the generator is closed.
            print("Next page Error")
            print(response.status)
            # Print the URL already computed; calling urljoin() on the raw
            # extract() list here raised inside the handler.
            print(next_url)
            return