def parse(self, response):
    """Parse a Douban review-list page and yield one item per review."""
    print(f'Processing {response.request.url}')
    commentlist = Selector(
        response=response).xpath('//div[@class="review-list "]/div')
    for comm in commentlist:
        # Fresh item per review: the original reused one shared instance,
        # so later iterations mutated objects already handed to the pipeline.
        item = DoubanSpiderItem()
        cid = comm.xpath('./@data-cid').get()
        user = comm.xpath('./div/header/a[2]/text()').get()
        updatetime = comm.xpath('./div/header/span[2]/text()').get()
        star = comm.xpath('./div/header/span[1]/@class').get()
        title = comm.xpath('./div/div/h2/a/text()').get()
        shorts = comm.xpath('./div/div/div/div/text()').getall()
        # Join the stripped fragments in one pass (was a quadratic += loop),
        # then drop the trailing 7 characters exactly as before
        # (presumably an expand-link suffix — TODO confirm against the page).
        shortfull = ''.join(s.strip() for s in shorts)[:-7].strip()
        item['cid'] = int(cid)
        item['user'] = user
        item['updatetime'] = updatetime
        # Rating is encoded in a class like "allstar40"; the digit after the
        # prefix is the star count. Guard against a missing attribute
        # (slicing None crashed before).
        if star and star.startswith('allstar'):
            item['star'] = int(star[7:8])
        else:
            item['star'] = 0
        item['title'] = title
        item['short'] = shortfull
        yield item
def parse(self, response):
    """Scrape movie names, ratings, and quotes from a Top250 listing page,
    then queue every pagination link for parse_item."""
    # Fixed: the original used a Python 2 print statement, a syntax error
    # alongside this file's Python 3 (f-string) code.
    print(response)
    sel = Selector(response)
    item = DoubanSpiderItem()
    movie_name = sel.xpath('//span[@class="title"]/text()').extract()
    star = sel.xpath('//div[@class="star"]/span/em/text()').extract()
    quote = sel.xpath(
        '//p[@class="quote"]/span[@class="inq"]/text()').extract()
    # NOTE(review): .encode() yields bytes under Python 3 — kept to avoid
    # changing what the pipeline receives; confirm whether it is needed.
    item['movie_name'] = [n.encode('utf-8') for n in movie_name]
    item['star'] = [n.encode('utf-8') for n in star]
    item['quote'] = [n.encode('utf-8') for n in quote]
    yield item
    next_page = sel.xpath('//div[@class="paginator"]/a/@href').extract()
    for url in next_page:
        url = 'http://movie.douban.com/top250' + url
        yield Request(url, callback=self.parse_item)
def parse(self, response):
    """For each film entry on the listing page, capture its title and
    request the film's comments page for parse2."""
    entries = Selector(response=response).xpath('//div[@class="hd"]')
    for entry in entries:
        record = DoubanSpiderItem()
        record['title'] = entry.xpath('./a/span/text()').extract_first()
        # Build the short-comments URL from the detail-page link.
        detail_href = entry.xpath('./a/@href').extract_first()
        comments_url = detail_href + '/comments?status=P'
        yield scrapy.Request(url=comments_url,
                             meta={'item': record},
                             callback=self.parse2)
def parse_item(self, response):
    """Yield one item per movie info block on a Top250 listing page."""
    sel = Selector(response)
    sites = sel.xpath('//div[@class="info"]')
    for site in sites:
        # Create the item inside the loop: the original reused a single
        # instance, so every yielded item aliased the same mutable object
        # and later iterations overwrote earlier results in the pipeline.
        item = DoubanSpiderItem()
        item['movie_name'] = site.xpath(
            'div[@class="hd"]/a/span[1]/text()').extract()
        item['movie_star'] = site.xpath(
            'div[@class="bd"]/div/span[2]/text()').extract()
        item['movie_quote'] = site.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        yield item
def parse_item(self, response):
    """Extract first-title names, ratings, and quotes into one item."""
    print(response)  # fixed: Python 2 print statement -> function call
    sel = Selector(response)
    item = DoubanSpiderItem()
    movie_name = sel.xpath('//span[@class="title"][1]/text()').extract()
    star = sel.xpath('//div[@class="star"]/span/em/text()').extract()
    quote = sel.xpath(
        '//p[@class="quote"]/span[@class="inq"]/text()').extract()
    # NOTE(review): .encode() produces bytes on Python 3; kept as-is so the
    # pipeline sees the same values — confirm it is actually wanted.
    item['movie_name'] = [n.encode('utf-8') for n in movie_name]
    item['star'] = [n.encode('utf-8') for n in star]
    item['quote'] = [n.encode('utf-8') for n in quote]
    yield item
def parse_item(self, response):
    """Extract movie names, ratings (XPath) and quotes (BeautifulSoup)."""
    print(response)  # fixed: Python 2 print statement -> function call
    sel = Selector(response)
    item = DoubanSpiderItem()
    # Fixed: response.url is a plain string with no .text attribute;
    # the page body lives in response.text.
    soup = BeautifulSoup(response.text, 'lxml')
    # Fixed: XPath positions are 1-based, so [0] matched nothing.
    movie_name = sel.xpath('//span[@class="title"][1]/text()').extract()
    star = sel.xpath('//span[@class="rating_num"]/text()').extract()
    # Fixed: find_all() returns a ResultSet (which has no .find), and
    # Tag.text is a property, not a method. Collect every inq span text
    # under each quote paragraph instead.
    quote = [
        span.get_text()
        for p in soup.find_all('p', class_='quote')
        for span in p.find_all('span', class_='inq')
    ]
    item['movie_name'] = [n.encode('utf-8') for n in movie_name]
    item['star'] = [n for n in star]
    item['quote'] = [n.encode('utf-8') for n in quote]
    yield item
def parse_item(self, response):
    """Build one item holding the title/rating/quote lists from the page."""
    sel = Selector(response)
    item = DoubanSpiderItem()
    movie_name = sel.xpath('//span[@class="title"][1]/text()').extract()
    star = sel.xpath(
        '//div[@class="star"]/span[@class="rating_num"]/text()').extract()
    quote = sel.xpath(
        '//p[@class="quote"]/span[@class="inq"]/text()').extract()
    # The original ran .encode('utf-8').decode('utf-8') on every value —
    # a no-op round trip on str — so the lists are assigned directly.
    item['movie_name'] = movie_name
    item['star'] = star
    item['quote'] = quote
    yield item
def parse_short(self, response):
    """Fetch up to 500 short comments for a movie (20 per API page) and
    yield the fully populated item built from response.meta."""
    meta = deepcopy(response.meta)
    short_answers, recommendation_index, useful_num = [], [], []
    # Cap at 500: the comment API stops serving beyond that offset.
    num_for_short = min(meta["num_for_short"], 500)
    for count in range(0, num_for_short, 20):
        short_url = self.short_url.format(meta["id"], count)
        print("short_url", short_url)
        try:
            res = self.get_shorturl_content(short_url)
            # "code":112 in the payload signals the anti-crawl block;
            # shut the spider down instead of hammering the site.
            if re.findall(r'"code":112', str(res)):
                self.crawler.engine.close_spider(self, '程序异常,停止爬虫!')
                print('!!!111程序异常,停止爬虫!!!,状态码是:', response.status)
            else:
                comments = res["comments"]
                short_answers.extend(
                    emoji.demojize(c["content"]) for c in comments)
                recommendation_index.extend(
                    c["rating"]["value"] for c in comments)
                useful_num.extend(c["useful_count"] for c in comments)
        except Exception as e:
            # Best-effort: a bad page is logged and skipped, not fatal.
            logger.info(e)
            continue
    info = DoubanSpiderItem()
    info["movie_id"] = meta["id"]  # meta key differs from the item field
    # All remaining metadata fields copy over under the same name.
    for field in ("url", "name", "scriptwriters", "directors", "actors",
                  "type", "region", "language", "duration", "release_date",
                  "alias", "source", "description"):
        info[field] = meta[field]
    info["short_answers"] = short_answers
    info["recommendation_index"] = recommendation_index
    info["useful_num"] = useful_num
    yield info
def parse(self, response):
    """Scrape rank/name/score for each film and follow the 'next' link
    (no callback given, so Scrapy routes it back to this parse)."""
    for r in response.css('div.item'):
        # Fresh item per film: the original shared one instance across
        # every yield, so the pipeline saw aliased, mutated objects.
        item = DoubanSpiderItem()
        item['rank'] = r.css('div.pic em::text').extract()
        item['name'] = r.css(
            'div.info div.hd a span.title::text').extract_first()
        # NOTE(review): rank/score are lists while name is a scalar —
        # probably unintended, but left as-is to preserve the item shape.
        item['score'] = r.css(
            'div.info div.bd div.star span.rating_num::text').extract()
        yield item
    next_url = response.css(
        'div.paginator span.next a::attr(href)').extract()
    if next_url:
        next_url = "https://movie.douban.com/top250" + next_url[0]
        print(next_url)
        yield scrapy.Request(next_url, headers=self.headers)
def parse_detail(self, response):
    """Parse a movie detail page into a fully populated item."""
    print("detail url:" + response.url)  # fixed: Python 2 print statement
    # NOTE(review): .extract()[0] raises IndexError when a field is absent;
    # kept to preserve behavior — consider extract_first() if pages vary.
    name = response.css('div#content h1 span:first-child').xpath(
        './/text()').extract()[0]
    score = response.css(
        'div#interest_sectl div.rating_self strong.rating_num').xpath(
            './/text()').extract()[0]
    summary = response.css('div.related-info div.indent span').xpath(
        './/text()').extract()[0].strip()
    _actors = response.css('div#info span.actor span.attrs a').xpath(
        './/text()').extract()
    _types = response.css('div#info >span[property="v:genre"]').xpath(
        './/text()').extract()
    release_time = response.css(
        'div#info >span[property="v:initialReleaseDate"]').xpath(
            './/text()').extract()[0]
    # Join multi-valued fields; None when the page carries no values.
    types = ','.join(_types) if _types else None
    actors = ','.join(_actors) if _actors else None
    item = DoubanSpiderItem()
    item['name'] = name
    item['score'] = score
    item['summary'] = summary
    item['types'] = types
    item['actors'] = actors
    item['src_url'] = response.url
    item['release_time'] = release_time
    yield item
def parse_item(self, response):
    """Extract titles/ratings/quotes, then follow the 'next' page link."""
    print(response)  # fixed: Python 2 print statement -> function call
    sel = Selector(response)
    item = DoubanSpiderItem()
    item['movie_name'] = sel.xpath(
        '//span[@class="title"][1]/text()').extract()
    item['star'] = sel.xpath(
        '//div[@class="star"]/span[@class="rating_num"]/text()').extract()
    item['quote'] = sel.xpath(
        '//p[@class="quote"]/span[@class="inq"]/text()').extract()
    yield item
    # Fixed: the original indexed .extract()[0] BEFORE its truthiness
    # check, so the last page raised IndexError instead of stopping.
    nextPage = sel.xpath(
        '//div[@class="paginator"]/span[@class="next"]/a/@href'
    ).extract_first()
    print(nextPage)
    if nextPage:
        next_url = 'https://movie.douban.com/top250' + nextPage
        yield Request(next_url, callback=self.parse_item)
def parse_item(self, response):
    """Parse one movie detail page into a ranked DoubanSpiderItem."""
    rank_No = response.xpath("//span[@class='top250-no']/text()").get()
    rank = rank_No.split('.')[1]  # "No.12" -> "12"
    title = response.xpath("//h1/span/text()").get()
    year_text = response.xpath("//span[@class='year']/text()").get()
    year = re.sub(r'\(|\)', '', year_text)  # strip surrounding parentheses
    # The first three //div[@id='info']/span blocks are director,
    # screenwriters, and stars, in that order.
    infos = response.xpath("//div[@id='info']/span")
    director = infos[0].xpath(".//a/text()").get()
    screenwriter = ','.join(infos[1].xpath(".//a/text()").getall())
    stars = ','.join(infos[2].xpath(".//a/text()").getall())
    types = ','.join(response.xpath(
        "//div[@id='info']/span[@property='v:genre']/text()").getall())
    runtime = response.xpath("//span[@property='v:runtime']/text()").get()
    IMDb = response.xpath(
        "//div[@id='info']/a[@rel='nofollow']/@href").get()
    pub_time = response.xpath(
        "//span[@property='v:initialReleaseDate']/@content").get()
    # Removed: the original parsed country/language out of the bare info
    # text but never stored them on the item (dead computation). Restore
    # and wire them in if the item ever grows those fields.
    item = DoubanSpiderItem(rank=rank, title=title, year=year,
                            director=director, screenwriter=screenwriter,
                            stars=stars, types=types, runtime=runtime,
                            IMDb=IMDb, origin_url=response.url,
                            pub_time=pub_time)
    yield item
def parse_item(self, response):
    """Collect names/ratings/quotes from a Top250 page, queue every
    'next' link, then yield the page's item."""
    pre = 'https://movie.douban.com/top250'
    print(response)
    sel = Selector(response)
    item = DoubanSpiderItem()
    # Assign the extracted lists directly: the original wrapped each in a
    # pointless [n for n in xs] copy.
    item['movie_name'] = sel.xpath(
        '//span[@class="title"][1]/text()').extract()
    item['star'] = sel.xpath(
        '//div[@class="star"]/span[@class="rating_num"]/text()').extract()
    item['quote'] = sel.xpath(
        '//p[@class="quote"]/span[@class="inq"]/text()').extract()
    urls = sel.xpath('//span[@class="next"]/a/@href').extract()
    # enumerate instead of range(len(...)); requests go out before the
    # item, matching the original yield order.
    for i, url in enumerate(urls):
        print(i)
        print(pre + url)
        yield Request(pre + url, headers=self.headers,
                      callback=self.parse_item)
    yield item
def parse(self, response): item = DoubanSpiderItem() name = response.css()