def parse_item(self, response):
    """Parse a Douban movie detail page into a DoubanMovieItem.

    Yields one item whose fields (title, director, actor, star) are the
    raw lists returned by ``extract()``.
    """
    # Scrapy responses expose .xpath() directly; wrapping the response
    # in Selector() was redundant. Dead commented-out fields
    # (release_time / time) removed.
    item = DoubanMovieItem()
    item['title'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
    item['director'] = response.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    item['actor'] = response.xpath('//*[@id="info"]/span[3]/span[2]/a/text()').extract()
    item['star'] = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
    yield item
def parse(self, response):
    """Parse one Top250 list page.

    Yields one item per movie on the page, then follows the pagination
    link if present.
    """
    for movie in response.xpath('//ol[@class="grid_view"]/li'):
        # BUG FIX: the original created a single item before the loop and
        # yielded it repeatedly — each iteration mutated the same object,
        # so items still queued downstream were overwritten. Create a
        # fresh item per movie.
        item = DoubanMovieItem()
        item['rank'] = movie.xpath(
            './/div[@class="pic"]/em/text()').extract_first()
        item['movie_name'] = movie.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract_first()
        item['score'] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract_first()
        item['quote'] = movie.xpath(
            './/span[@class="inq"]/text()').extract_first()
        yield item
    next_url = response.xpath(
        '//span[@class="next"]/a/@href').extract_first()
    if next_url:
        next_url = 'https://movie.douban.com/top250' + next_url
        yield Request(next_url, headers=self.headers)
def parse(self, response):
    """Parse one page of the Douban new_search_subjects JSON API.

    Yields one item per film, downloads each cover image to
    ``self.file_path``, then requests the next page until no data
    remains or the offset cap is exceeded.
    """
    film_list = json.loads(response.body.decode())
    # BUG FIX: the API responds with a dict like {"data": [...]}, so the
    # original `film_list == []` comparison was never true and the crawl
    # only stopped via the offset cap. Stop when the data list is empty.
    if not film_list or not film_list.get('data') or self.offset > 1000:
        return
    for film in film_list['data']:
        # Fresh item per film so queued items are not overwritten by
        # later iterations (the original reused one instance).
        item = DoubanMovieItem()
        item['film_name'] = film['title']
        item['film_directors'] = film['directors']
        item['film_rate'] = film['rate']
        item['film_actors'] = film['casts']
        item['film_image_url'] = film['cover']
        # Save the cover as <file_path>/<name>.<original extension>.
        # NOTE(review): urlretrieve is a blocking call inside an async
        # Scrapy callback — consider an image pipeline instead.
        urllib.request.urlretrieve(
            item['film_image_url'], self.file_path + "/" +
            item['film_name'] + "." +
            item['film_image_url'].split(".")[-1])
        yield item
    self.offset = self.offset + 20
    new_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=1&start=' + str(
        self.offset)
    yield scrapy.Request(url=new_url, callback=self.parse)
def parse(self, response):
    """Parse a Top250 list page: one item per movie block, then follow
    the "next page" link.
    """
    movie_blocks = response.xpath('//ol[@class="grid_view"]/li')
    for block in movie_blocks:
        name = block.css('span.title::text').extract_first()
        star = block.xpath(
            ".//span[@class='rating_num']/text()").extract_first()
        e = block.xpath(
            ".//div[@class='star']/span[4]/text()").extract_first()
        # BUG FIX: search() returns None when the text is missing or
        # does not match the pattern, and .group() on None raised
        # AttributeError, aborting the whole page. Guard both cases.
        # group() returns the whole match string; groups() would return
        # a tuple.
        match = self.eval_re.search(e) if e else None
        evaluation = match.group() if match else None
        introduction = block.css('span.inq::text').extract_first()
        item = DoubanMovieItem()
        item['name'] = name
        item['star'] = star
        item['evaluation'] = evaluation
        item['introduction'] = introduction
        yield item  # the next iteration resumes after this yield
    # The next-page link lives in an href attribute, e.g.
    # <a href="?start=25&filter=">后页></a>, so read ::attr(href).
    next_url = response.css('span.next > a::attr(href)').extract_first()
    if next_url:
        # urljoin resolves the relative href against the current URL.
        next_url = response.urljoin(next_url)
        yield scrapy.Request(next_url, callback=self.parse)
def parse_item(self, response):
    """Parse a Douban movie detail page into a fully populated item.

    Updates the module-level success/failure counters; any parse error
    is logged and counted rather than propagated.
    """
    global failed_count
    global real_parse_count
    item = DoubanMovieItem()
    try:
        real_parse_count += 1
        print("real parse count = %d" % (real_parse_count))
        # Movie id is the second-to-last path segment of the URL.
        url = response.url
        id = url.split('/')[-2].strip()
        item["movie_id"] = id
        name = response.xpath(
            '//div[@id="content"]/h1/span[1]/text()').extract_first()
        item["movie_name"] = name.strip() if name else ""
        year = response.xpath(
            '//div[@id="content"]/h1/span[2]/text()').extract_first()
        # Strip both ASCII and full-width parentheses around the year.
        item["movie_year"] = year.strip("()() ") if year else ""
        rate = response.xpath(
            "//div[@class='rating_self clearfix']/strong/text()"
        ).extract_first()
        # -1 marks "no rating shown".
        item["movie_rate"] = float(rate.strip() if rate else "-1")
        rate_num = response.xpath(
            "//span[@property='v:votes']/text()").extract_first()
        item["movie_rate_people"] = int(
            rate_num.strip() if rate_num else "-1")
        # Hot short comments: texts, vote counts and star titles must
        # line up one-to-one, otherwise the whole section is skipped.
        comments = response.xpath(
            "//div[@id='hot-comments']//div[@class='comment-item']//div[@class='comment']/p/text()"
        ).extract()
        votes = response.xpath(
            "//div[@id='hot-comments']//div[@class='comment-item']//div[@class='comment']//span[@class='votes pr5']/text()"
        ).extract()
        rates = response.xpath(
            "//div[@id='hot-comments']//div[@class='comment-item']//span[@class='comment-info']/span[1]/@title"
        ).extract()
        if len(comments) == len(votes) and len(votes) == len(rates):
            commentsarray = []
            for i in range(len(votes)):
                short_comments = {}
                short_comments['comment'] = comments[i]
                short_comments['votes'] = int(votes[i])
                short_comments['rates'] = rates[i]
                commentsarray.append(short_comments)
            item["movie_hot_short_comments"] = commentsarray
        seenwish = response.xpath(
            "//div[@class='subject-others-interests-ft']//a//text()"
        ).extract()
        if seenwish and len(seenwish) == 2:
            # The last three characters are the "人看过"/"人想看" suffix.
            item['movie_seen'] = int(seenwish[0][:-3])
            item['movie_wishes'] = int(seenwish[1][:-3])
        info = response.xpath("//div[@id='info']")
        infoarray = info.extract()
        infostr = ''.join(infoarray).strip()
        director = info.xpath("span[1]/span[2]/a/text()").extract()
        self.add_array("movie_director", director, item)
        writor = info.xpath("span[2]/span[2]/a/text()").extract()
        self.add_array("movie_writor", writor, item)
        actors = info.xpath("span[3]/span[2]/a/text()").extract()
        self.add_array("movie_actors", actors, item)
        time = info.xpath(
            "span[@property='v:runtime']/@content").extract_first()
        item["movie_time"] = float(time.strip() if time else "-1")
        types = info.xpath("span[@property='v:genre']/text()").extract()
        self.add_array("movie_type", types, item)
        # Language / region / dialect are scraped from the raw info HTML
        # with module-level regexes; each is best-effort.
        try:
            lang = re.search(language_pattern, infostr)
            if lang:
                item["movie_language"] = lang.group(1).strip()
        except Exception:  # narrowed from a bare except
            pass
        try:
            regionmatch = re.search(region_pattern, infostr)
            if regionmatch:
                item["movie_region"] = regionmatch.group(1).strip()
        except Exception:
            pass
        try:
            dialectmatch = re.search(dialect_pattern, infostr)
            if dialectmatch:
                item["movie_dialect"] = dialectmatch.group(1).strip()
        except Exception:
            pass
        # BUG FIX: extract_first() may return None; calling .strip() on
        # it raised AttributeError before the `if desc` guard could run.
        desc = response.xpath(
            "//span[@property='v:summary']/node()").extract_first()
        item["movie_desc"] = desc.strip() if desc else ""
        tags = response.xpath(
            "//div[@class='tags-body']/a/text()").extract()
        self.add_array("movie_tags", tags, item)
        pic = response.xpath(
            "//div[@id='mainpic']/a/img/@src").extract_first()
        item["movie_pic_url"] = pic
        yield item
    except Exception as e:
        # BUG FIX: `except Exception, e` is Python-2-only syntax and a
        # SyntaxError on Python 3. Log and count the failure.
        logging.info("Parse error:%s" % (str(e)))
        print("failed_count = %d" % (failed_count + 1))
        failed_count += 1
def parse_movie(self, response):
    """Parse a movie detail page into a DoubanMovieItem and yield it."""
    print(response.status)
    _setDNSCache()
    movie_item = DoubanMovieItem()
    # movie id
    movie_item['movie_id'] = response.xpath(
        './/li/span[@class="rec"]/@id').extract()
    # movie title
    movie_item['movie_title'] = response.xpath(
        './/h1/span[@property="v:itemreviewed"]/text()').extract()
    # release date
    movie_item['release_date'] = response.xpath(
        './/h1/span[@class="year"]/text()').extract()
    # director
    movie_item['directedBy'] = response.xpath(
        './/a[@rel="v:directedBy"]/text()').extract()
    # leading actors
    movie_item['starring'] = response.xpath(
        './/a[@rel="v:starring"]/text()').extract()
    # genres
    movie_item['genre'] = response.xpath(
        './/span[@property="v:genre"]/text()').extract()
    # runtime
    movie_item['runtime'] = response.xpath(
        './/span[@property="v:runtime"]/text()').extract()
    # Country and language are the first two non-empty, non-"/" bare
    # text nodes of the info block. Filter once instead of building the
    # same list comprehension twice.
    temp = response.xpath('.//div[@id="info"]/text()').extract()
    plain = [p.strip() for p in temp if p.strip() not in ('', '/')]
    movie_item['country'] = plain[0]
    movie_item['language'] = plain[1]
    # rating
    movie_item['rating_num'] = response.xpath(
        './/strong[@class="ll rating_num"]/text()').extract()
    # number of voters
    movie_item['vote_num'] = response.xpath(
        './/span[@property="v:votes"]/text()').extract()
    # 1-5 star percentage breakdown. Hoisted: the original executed the
    # identical xpath query five times.
    rating_per = response.xpath(
        './/span[@class="rating_per"]/text()').extract()
    movie_item['rating_per_stars5'] = rating_per[0].strip()
    movie_item['rating_per_stars4'] = rating_per[1].strip()
    movie_item['rating_per_stars3'] = rating_per[2].strip()
    movie_item['rating_per_stars2'] = rating_per[3].strip()
    movie_item['rating_per_stars1'] = rating_per[4].strip()
    # Synopsis: prefer the full ("all hidden") text, fall back to the
    # visible summary.
    intro = response.xpath('.//span[@class="all hidden"]/text()').extract()
    if len(intro):
        movie_item['intro'] = intro
    else:
        movie_item['intro'] = response.xpath(
            './/span[@property="v:summary"]/text()').extract()
    # Short-comment and question counts share one header query
    # (originally executed twice).
    counts = response.xpath(
        './/div[@class="mod-hd"]/h2/span/a/text()').extract()
    movie_item['comment_num'] = counts[0].strip()
    movie_item['question_num'] = counts[1].strip()
    yield movie_item
def parse_content(self, response):
    """Parse a detail page with BeautifulSoup and assemble an item that
    combines the queued movie row (self.movie) with fields scraped from
    the page.
    """
    # Unpack the movie record queued by the caller.
    movieid = self.movie[0]
    tag = self.movie[1]
    title = self.movie[2]
    director = self.movie[3]
    actor = self.movie[4]
    rate = self.movie[5]
    star = self.movie[6]
    cover = self.movie[7]
    html = BeautifulSoup(response.body, 'lxml')
    info = html.select('#info')
    if len(info) == 0:
        # Info block missing (page blocked or empty) — bail out.
        print(response.text)
        return [-2]
    info = info[0].get_text().split('\n')
    print(info)
    category = ''
    district = ''
    showtime = ''
    length = ''
    for line in info:
        parts = line.split(':')
        if parts[0] == '类型':
            category = parts[-1].strip()
        elif parts[0] == '制片国家/地区':
            district = parts[-1].strip()
        elif parts[0] == '上映日期':
            showtime = parts[-1].strip().split('-')[0]
        elif parts[0] == '片长':
            length = parts[-1].strip()
            # BUG FIX: raw-string regex ('\d' in a plain string is a
            # DeprecationWarning on Python 3) and a guard on the [0]
            # index, which raised IndexError when the runtime field
            # contained no digits.
            digits = re.findall(r'\d+', length)
            length = digits[0] if digits else ''
    category = category.replace(r'/', ',')
    # Truncate to the storage column widths.
    if len(district) > 0:
        district = district[:50]
    if len(category) > 0:
        category = category[:30]
    rate_count = html.select(
        '#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > div > div.rating_sum > a > span'
    )[0].get_text()
    # The five star-percentage cells differ only in their child index;
    # one loop replaces five copy-pasted selectors.
    tmpl = ('#interest_sectl > div.rating_wrap.clearbox > '
            'div.ratings-on-weight > div:nth-of-type(%d) > span.rating_per')
    rate5, rate4, rate3, rate2, rate1 = [
        html.select(tmpl % i)[0].get_text().split('%')[0]
        for i in range(1, 6)
    ]
    item = DoubanMovieItem()
    item['movieid'] = movieid
    item['title'] = title
    item['tag'] = tag
    item['directors'] = director
    item['actors'] = actor
    item['showtime'] = showtime
    item['length'] = length
    item['district'] = district
    item['category'] = category
    item['star'] = star
    item['rate'] = rate
    item['rate_count'] = rate_count
    item['rate5'] = rate5
    item['rate4'] = rate4
    item['rate3'] = rate3
    item['rate2'] = rate2
    item['rate1'] = rate1
    print('###### ')
    print(item)
    print('######')
    # NOTE(review): the item is only printed, never yielded or returned —
    # confirm whether a `yield item` was intended here.
def parse(self, response):
    """Crawl a movie detail page, yield its item, then follow the
    "recommendations" links (deduplicated by movie id) until the global
    page cap is reached.
    """
    global count
    global parsedids
    # Hard cap on the total number of pages parsed.
    if count == 10000000:
        return
    count += 1
    item = DoubanMovieItem()
    try:
        # Movie id is the second-to-last URL path segment.
        url = response.url
        id = url.split('/')[-2].strip()
        item["movie_id"] = id
        name = response.xpath(
            '//div[@id="content"]/h1/span[1]/text()').extract_first()
        item["movie_name"] = name.strip() if name else ""
        year = response.xpath(
            '//div[@id="content"]/h1/span[2]/text()').extract_first()
        # Strip both ASCII and full-width parentheses around the year.
        item["movie_year"] = year.strip("()() ") if year else ""
        rate = response.xpath(
            "//div[@class='rating_self clearfix']/strong/text()"
        ).extract_first()
        # 2.5 is the fallback used when no rating is displayed.
        item["movie_rate"] = float(rate.strip() if rate else "2.5")
        info = response.xpath("//div[@id='info']")
        infoarray = info.extract()
        infostr = ''.join(infoarray).strip()
        director = info.xpath("span[1]/span[2]/a/text()").extract()
        self.add_array("movie_director", director, item)
        writor = info.xpath("span[2]/span[2]/a/text()").extract()
        self.add_array("movie_writor", writor, item)
        actors = info.xpath("span[3]/span[2]/a/text()").extract()
        self.add_array("movie_actors", actors, item)
        time = info.xpath(
            "span[@property='v:runtime']/@content").extract_first()
        item["movie_time"] = float(time.strip() if time else "0")
        types = info.xpath("span[@property='v:genre']/text()").extract()
        self.add_array("movie_type", types, item)
        # Language / region / dialect come from module-level regexes run
        # over the raw info HTML; each lookup is best-effort.
        try:
            lang = re.search(language_pattern, infostr)
            if lang:
                item["movie_language"] = lang.group(1).strip()
        except Exception:  # narrowed from a bare except
            pass
        try:
            regionmatch = re.search(region_pattern, infostr)
            if regionmatch:
                item["movie_region"] = regionmatch.group(1).strip()
        except Exception:
            pass
        try:
            dialectmatch = re.search(dialect_pattern, infostr)
            if dialectmatch:
                item["movie_dialect"] = dialectmatch.group(1).strip()
        except Exception:
            pass
        # BUG FIX: extract_first() may return None; calling .strip() on
        # it raised AttributeError before the `if desc` guard applied.
        desc = response.xpath(
            "//span[@property='v:summary']/node()").extract_first()
        item["movie_desc"] = desc.strip() if desc else ""
        tags = response.xpath(
            "//div[@class='tags-body']/a/text()").extract()
        self.add_array("movie_tags", tags, item)
        pic = response.xpath(
            "//div[@id='mainpic']/a/img/@src").extract_first()
        item["movie_pic_url"] = pic
        yield item
        next_pages = response.xpath(
            "//div[@class='recommendations-bd']/dl/dd/a/@href").extract()
        if next_pages:
            for page in next_pages:
                id = int(page.split('/')[-2])
                # BUG FIX: `parsedids[id]` raised KeyError for every
                # first-seen id when parsedids is a plain dict; the
                # outer except swallowed it, silently aborting the
                # remaining fan-out. .get() is safe for plain dicts and
                # equivalent for defaultdicts.
                if parsedids.get(id):
                    continue
                parsedids[id] = True
                yield Request(page, callback=self.parse)
    except Exception as e:
        # BUG FIX: `except Exception, e` is Python-2-only syntax and a
        # SyntaxError under Python 3.
        logging.info("Parse error:%s" % (str(e)))
        pass