def parse_comment_url(self, response):
    """Yield one ``DoubanMovieCommentItem`` per short comment on the page.

    The page-level fields (``movie_id`` and ``URL``) are read once from
    ``response`` and copied into every item; all other fields are read
    from the individual ``div.comment-item`` nodes.

    A fresh item is created for every comment.  Re-using one item object
    and mutating it between ``yield``s would make every yielded item an
    alias of the same object, so a later comment could overwrite the
    fields of an item that a consumer still holds a reference to.

    Nodes without a vote ``input`` value (used as the unique comment id)
    are skipped instead of aborting the whole page with ``IndexError``.
    Optional single-value fields are set to ``None`` when absent.
    """

    def first(node, query, strip=True):
        # First match of *query* under *node*, or None when nothing matches.
        matches = node.xpath(query).extract()
        if not matches:
            return None
        return matches[0].strip() if strip else matches[0]

    print(response.status)
    print('comment_url!')
    _setDNSCache()
    print()

    # Values shared by every comment on this page.
    movie_id = response.xpath('//div[@class="fright"]/a/@name').extract()
    page_url = response.url

    for node in response.xpath('//div[@class="comment-item"]'):
        # Unique id of the short comment; without it the node is unusable.
        raw_id = first(
            node,
            'div[@class="comment"]/h3/span[@class="comment-vote"]/input/@value')
        if raw_id is None:
            continue

        comment = DoubanMovieCommentItem()
        comment['movie_id'] = list(movie_id)
        comment['URL'] = page_url
        comment['comment_id'] = int(raw_id)
        # Number of people who marked the comment as useful.
        comment['useful_num'] = first(
            node,
            'div[@class="comment"]/h3/span[@class="comment-vote"]/span/text()')
        # Rating, taken from the class attribute of the second info span.
        comment['star'] = first(
            node,
            'div[@class="comment"]/h3/span[@class="comment-info"]/span[2]/@class')
        # Comment time; the class value is matched exactly as written,
        # including its trailing space.
        comment['time'] = node.xpath(
            'div[@class="comment"]/h3/span[@class="comment-info"]'
            '/span[@class="comment-time "]/@title').extract()
        # Comment text.
        comment['content'] = node.xpath(
            'div[@class="comment"]/p/text()').extract()
        # Commenter name (unique).
        comment['people'] = first(
            node, 'div[@class="avatar"]/a/@title', strip=False)
        # Commenter profile page.
        comment['people_url'] = first(
            node, 'div[@class="avatar"]/a/@href', strip=False)
        yield comment
def parse_movie(self, response):
    """Scrape a movie subject page into a ``DoubanMovieItem`` and yield it.

    Every field is stored as the list returned by ``extract()``.  The
    country/language, per-star percentage, short-comment count and
    question count fields are not collected here.
    """
    # Debug output: HTTP status plus two raw selector results.
    print(response.status)
    print(response.xpath('//li/span[@class="rec"]/@id'))
    print(response.xpath('//span[@class="rating_per"]/text()'))
    _setDNSCache()

    # (item field, XPath) pairs, evaluated in this order.
    field_queries = (
        # movie id
        ('movie_id', '//li/span[@class="rec"]/@id'),
        # movie title (the whole span element, not only its text)
        ('movie_title', '//*[@id="content"]/h1/span[1]'),
        # release date
        ('release_date', './/h1/span[@class="year"]/text()'),
        # director
        ('directedBy', './/a[@rel="v:directedBy"]/text()'),
        # leading actors
        ('starring', './/a[@rel="v:starring"]/text()'),
        # genre
        ('genre', './/span[@property="v:genre"]/text()'),
        # runtime
        ('runtime', './/span[@property="v:runtime"]/text()'),
        # rating
        ('rating_num', './/strong[@class="ll rating_num"]/text()'),
        # number of votes
        ('vote_num', './/span[@property="v:votes"]/text()'),
    )

    record = DoubanMovieItem()
    for field, query in field_queries:
        record[field] = response.xpath(query).extract()

    # Plot summary: prefer the "all hidden" span and fall back to the
    # v:summary span only when the former yields nothing.
    full_intro = response.xpath(
        './/span[@class="all hidden"]/text()').extract()
    record['intro'] = full_intro or response.xpath(
        './/span[@property="v:summary"]/text()').extract()

    # Final output.
    yield record
def parse_people_url(self, response):
    """Scrape a user profile page into a ``DoubanMovieUser`` and yield it.

    Every field is stored as the list returned by ``extract()``.

    ``location`` is read only from the ``div.user-info`` link.  An earlier
    assignment from a positional ``#profile`` XPath was always overwritten
    by this one, so it has been removed as a dead store.
    """
    _setDNSCache()

    user = DoubanMovieUser()
    user['location'] = response.xpath(
        '//div[@class="user-info"]/a/text()').extract()
    user['introduction'] = response.xpath(
        '//span[@id="intro_display"]/text()').extract()
    user['friend'] = response.xpath(
        '//div[@id="friend"]/h2/span/a/text()').extract()
    user['be_attention'] = response.xpath(
        '//p[@class="rev-link"]/a/text()').extract()
    yield user
def after_login(self, response):
    """Schedule one movie-page request per id listed in ``movie_id.out``.

    Sets the ``Host`` header on ``self.headers`` and yields a
    ``scrapy.Request`` for each id, carrying over the ``cookiejar`` from
    ``response.meta`` and using ``self.parse_movie`` as the callback.
    """
    _setDNSCache()
    self.headers['Host'] = "movie.douban.com"

    # ndmin=1 keeps the result one-dimensional even when the file holds a
    # single id; otherwise ``tolist()`` returns a bare int and the loop
    # below raises TypeError.
    movie_ids = np.loadtxt('movie_id.out', dtype='i', ndmin=1).tolist()

    # Top250
    for mid in movie_ids:
        yield scrapy.Request(
            url='https://movie.douban.com/subject/%s/' % mid,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_movie)
def after_login(self, response):
    """Schedule comment-page requests for the ids in ``movie_id.out``.

    Only the ids from index 225 onwards are used.  For each of them two
    requests to the same comments URL are yielded:

    * one handled by ``self.parse_comment_url``;
    * one handled by ``self.parse_next_page``, sent with
      ``dont_filter=True`` because it repeats the URL of the first
      request.

    Both carry over the ``cookiejar`` from ``response.meta``.
    """
    print(response.status)
    print('after_login!')
    _setDNSCache()
    print()
    self.headers['Host'] = "movie.douban.com"

    # ndmin=1 keeps the result one-dimensional even when the file holds a
    # single id; otherwise ``tolist()`` returns a bare int and the slice
    # below raises TypeError.
    movie_ids = np.loadtxt('movie_id.out', dtype='i', ndmin=1).tolist()[225:]

    # Top250
    for mid in movie_ids:
        comments_url = 'https://movie.douban.com/subject/%s/comments' % mid
        yield scrapy.Request(
            url=comments_url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_comment_url)
        yield scrapy.Request(
            url=comments_url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_next_page,
            dont_filter=True)
def parse_next_page(self, response):
    """Follow the "next" link, then schedule every collected comments page.

    If the page has an ``a.next`` link, its absolute URL is appended to
    ``self.url_set``.  While such a link exists and ``self.page`` is below
    ``self.pages``, the counter is incremented and the link is requested
    with this method as the callback again.  Otherwise every URL in
    ``self.url_set`` is requested with ``self.parse_comment_url`` as the
    callback.

    A page without a "next" link is treated as the end of pagination.
    Indexing the empty ``extract()`` result used to raise ``IndexError``
    before the end-of-pagination branch could run, so the collected URLs
    were never scheduled.  The former bare ``except:`` is gone: the one
    expected failure is handled explicitly, and its handler passed a list
    to ``response.urljoin``.
    """
    print('Next_page!')
    print()
    _setDNSCache()

    hrefs = response.xpath('//a[@class="next"]/@href').extract()
    next_url = response.urljoin(hrefs[0]) if hrefs else None
    if next_url:
        self.url_set.append(next_url)

    if next_url and self.page < self.pages:
        self.page += 1
        # Pass the "next page" link back to this method to keep paginating.
        yield scrapy.Request(
            url=next_url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_next_page,
            dont_filter=True)
        return

    print('No more pages!')
    print(self.url_set)
    # dont_filter is left at its default for these requests.
    for url in self.url_set:
        yield scrapy.Request(
            url=url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_comment_url)