Esempio n. 1
0
    def parse(self, response):
        """Parse a Douban review-list page and yield one item per review.

        Each entry under the ``review-list`` container contributes the
        review id, author, timestamp, star rating, title and short text.
        """
        print(f'Processing {response.request.url}')

        commentlist = Selector(
            response=response).xpath('//div[@class="review-list  "]/div')
        for comm in commentlist:
            # One fresh item per review: reusing a single instance would
            # make every yielded item alias the same mutable object.
            item = DoubanSpiderItem()

            cid = comm.xpath('./@data-cid').get()
            user = comm.xpath('./div/header/a[2]/text()').get()
            updatetime = comm.xpath('./div/header/span[2]/text()').get()
            star = comm.xpath('./div/header/span[1]/@class').get()
            title = comm.xpath('./div/div/h2/a/text()').get()

            shorts = comm.xpath('./div/div/div/div/text()').getall()
            # Join the stripped fragments, then drop the trailing
            # 7-character suffix (site boilerplate) — TODO confirm suffix.
            shortfull = ''.join(s.strip() for s in shorts)[:-7].strip()

            # data-cid may be absent on malformed entries; avoid int(None).
            item['cid'] = int(cid) if cid else 0
            item['user'] = user
            item['updatetime'] = updatetime
            # Rating class looks like "allstar40 ..."; the digit right
            # after the prefix is the star count. Guard a missing class.
            if star and star.startswith('allstar'):
                item['star'] = int(star[7:8])
            else:
                item['star'] = 0

            item['title'] = title
            item['short'] = shortfull

            yield item
	def parse(self, response):
		"""Parse a Douban Top 250 list page.

		Yields one item holding all the page's titles, ratings and
		quotes, then follows every paginator link.
		"""
		# Python 3 print function (the original used the Py2 statement,
		# a SyntaxError in this otherwise Python-3 file).
		print(response)

		sel = Selector(response)
		item = DoubanSpiderItem()

		movie_name = sel.xpath('//span[@class="title"]/text()').extract()
		star = sel.xpath('//div[@class="star"]/span/em/text()').extract()
		quote = sel.xpath('//p[@class="quote"]/span[@class="inq"]/text()').extract()

		# Selector text is already str in Python 3; .encode('utf-8')
		# would store lists of bytes, so keep the strings as-is.
		item['movie_name'] = movie_name
		item['star'] = star
		item['quote'] = quote

		yield item

		next_page = sel.xpath('//div[@class="paginator"]/a/@href').extract()
		for url in next_page:
			# Paginator hrefs are relative (e.g. "?start=25..."); prepend base.
			url = 'http://movie.douban.com/top250' + url
			yield Request(url, callback=self.parse_item)
Esempio n. 3
0
 def parse(self, response):
     """For every movie header block on the list page, build an item
     carrying the title and request that movie's comments page,
     forwarding the item through ``meta``."""
     selector = Selector(response=response)
     for header in selector.xpath('//div[@class="hd"]'):
         item = DoubanSpiderItem()
         item['title'] = header.xpath('./a/span/text()').extract_first()
         detail_url = header.xpath('./a/@href').extract_first()
         comments_url = detail_url + '/comments?status=P'
         yield scrapy.Request(url=comments_url,
                              meta={'item': item},
                              callback=self.parse2)
Esempio n. 4
0
 def parse_item(self, response):
     """Yield one item per movie "info" block on a Top 250 list page."""
     sel = Selector(response)
     sites = sel.xpath('//div[@class="info"]')
     for site in sites:
         # Create a fresh item each iteration; the original reused one
         # instance, so every yielded item aliased the same object and
         # async pipelines could see only the last movie's data.
         item = DoubanSpiderItem()
         item['movie_name'] = site.xpath(
             'div[@class="hd"]/a/span[1]/text()').extract()
         item['movie_star'] = site.xpath(
             'div[@class="bd"]/div/span[2]/text()').extract()
         item['movie_quote'] = site.xpath(
             'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
         yield item
Esempio n. 5
0
    def parse_item(self, response):
        """Extract all titles, ratings and quotes from one list page."""
        print(response)  # Py3 print function (was a Py2 print statement)
        sel = Selector(response)
        item = DoubanSpiderItem()
        movie_name = sel.xpath('//span[@class="title"][1]/text()').extract()
        star = sel.xpath('//div[@class="star"]/span/em/text()').extract()
        quote = sel.xpath(
            '//p[@class="quote"]/span[@class="inq"]/text()').extract()

        # Values are already str in Python 3; .encode('utf-8') would
        # turn these into lists of bytes objects.
        item['movie_name'] = movie_name
        item['star'] = star
        item['quote'] = quote

        yield item
Esempio n. 6
0
 def parse_item(self, response):
     """Extract title/rating/quote lists from one Top 250 page.

     The original crashed on several lines: a Py2 print statement,
     ``response.url.text`` (``url`` is a plain str with no ``.text``),
     a 0-based XPath position (XPath positions are 1-based) and
     ``find_all(...).find(...)`` chained on a BeautifulSoup result set.
     """
     print(response)
     sel = Selector(response)
     item = DoubanSpiderItem()
     # XPath positions start at 1, so [0] would never match anything.
     movie_name = sel.xpath('//span[@class="title"][1]/text()').extract()
     star = sel.xpath('//span[@class="rating_num"]/text()').extract()
     # Use the selector for quotes as well, like the name/star fields;
     # the broken BeautifulSoup chain is dropped entirely.
     quote = sel.xpath('//p[@class="quote"]/span[@class="inq"]/text()').extract()

     item['movie_name'] = movie_name
     item['star'] = star
     item['quote'] = quote

     yield item
    def parse_item(self, response):
        """Yield one item with the page's titles, ratings and quotes."""
        sel = Selector(response)
        item = DoubanSpiderItem()

        movie_name = sel.xpath('//span[@class="title"][1]/text()').extract()
        star = sel.xpath(
            '//div[@class="star"]/span[@class="rating_num"]/text()').extract()
        quote = sel.xpath(
            '//p[@class="quote"]/span[@class="inq"]/text()').extract()

        # .encode('utf-8').decode('utf-8') was a no-op round trip on str;
        # store the extracted strings directly.
        item['movie_name'] = movie_name
        item['star'] = star
        item['quote'] = quote

        yield item
Esempio n. 8
0
 def parse_short(self, response):
     """Collect up to 500 short comments for one movie and yield the
     item assembled from the metadata carried in ``response.meta``."""
     meta = deepcopy(response.meta)
     short_answers = []
     recommendation_index = []
     useful_num = []

     # The short-comment API pages 20 at a time and is capped at 500.
     total = meta["num_for_short"] if meta["num_for_short"] < 501 else 500
     for offset in range(0, total, 20):
         page_url = self.short_url.format(meta["id"], offset)
         print("short_url", page_url)
         try:
             payload = self.get_shorturl_content(page_url)
             # Code 112 means Douban rejected us — stop the whole spider.
             if re.findall(r'"code":112', str(payload)):
                 self.crawler.engine.close_spider(self, '程序异常,停止爬虫!')
                 print('!!!111程序异常,停止爬虫!!!,状态码是:', response.status)
             else:
                 comments = payload["comments"]
                 short_answers.extend(
                     emoji.demojize(c["content"]) for c in comments)
                 recommendation_index.extend(
                     c["rating"]["value"] for c in comments)
                 useful_num.extend(c["useful_count"] for c in comments)
         except Exception as e:
             # Best-effort: log the failed page and move on to the next.
             logger.info(e)
             continue

     info = DoubanSpiderItem()
     info["movie_id"] = meta["id"]
     # All remaining fields are carried through from meta unchanged.
     for field in ("url", "name", "scriptwriters", "directors", "actors",
                   "type", "region", "language", "duration", "release_date",
                   "alias", "source", "description"):
         info[field] = meta[field]
     info["short_answers"] = short_answers
     info["recommendation_index"] = recommendation_index
     info["useful_num"] = useful_num
     yield info
Esempio n. 9
0
    def parse(self, response):
        """Yield one item per movie on the page, then follow the
        "next" pagination link when present."""
        for r in response.css('div.item'):
            # Fresh item per movie: the original created one instance
            # before the loop and yielded it repeatedly, so every item
            # aliased the same mutable object.
            item = DoubanSpiderItem()
            item['rank'] = r.css('div.pic em::text').extract()
            item['name'] = r.css(
                'div.info div.hd a span.title::text').extract_first()
            item['score'] = r.css(
                'div.info div.bd div.star span.rating_num::text').extract()
            yield item

        next_url = response.css(
            'div.paginator span.next a::attr(href)').extract()
        if next_url:
            next_url = "https://movie.douban.com/top250" + next_url[0]
            print(next_url)
            yield scrapy.Request(next_url, headers=self.headers)
Esempio n. 10
0
 def parse_detail(self, response):
     """Parse a movie detail page into a DoubanSpiderItem.

     Extracts the title, rating, summary, actor list, genres and
     release date; multi-valued fields are comma-joined, or None when
     the page has no such field.
     """
     print("detail url:" + response.url)  # Py3 print (was Py2 statement)
     name = response.css('div#content h1 span:first-child').xpath(
         './/text()').extract()[0]
     score = response.css(
         'div#interest_sectl div.rating_self strong.rating_num').xpath(
             './/text()').extract()[0]
     summary = response.css('div.related-info div.indent span').xpath(
         './/text()').extract()[0].strip()
     _actors = response.css('div#info span.actor span.attrs a').xpath(
         './/text()').extract()
     _types = response.css('div#info >span[property="v:genre"]').xpath(
         './/text()').extract()
     release_time = response.css(
         'div#info >span[property="v:initialReleaseDate"]').xpath(
             './/text()').extract()[0]
     # Comma-join the multi-valued fields; None marks an absent field.
     types = ','.join(_types) if _types else None
     actors = ','.join(_actors) if _actors else None

     item = DoubanSpiderItem()
     item['name'] = name
     item['score'] = score
     item['summary'] = summary
     item['types'] = types
     item['actors'] = actors
     item['src_url'] = response.url
     item['release_time'] = release_time
     yield item
Esempio n. 11
0
    def parse_item(self, response):
        """Yield the page's titles/ratings/quotes, then follow the
        "next" pagination link when one exists."""
        print(response)  # Py3 print (original used Py2 print statements)

        sel = Selector(response)
        item = DoubanSpiderItem()

        item['movie_name'] = sel.xpath(
            '//span[@class="title"][1]/text()').extract()
        item['star'] = sel.xpath(
            '//div[@class="star"]/span[@class="rating_num"]/text()').extract()
        item['quote'] = sel.xpath(
            '//p[@class="quote"]/span[@class="inq"]/text()').extract()
        yield item

        # The original indexed extract()[0] unconditionally, which raised
        # IndexError on the last page (no "next" link) and made the
        # subsequent truthiness check unreachable. Check the list first.
        next_pages = sel.xpath(
            '//div[@class="paginator"]/span[@class="next"]/a/@href').extract()
        if next_pages:
            next_url = 'https://movie.douban.com/top250' + next_pages[0]
            print(next_url)
            yield Request(next_url, callback=self.parse_item)
Esempio n. 12
0
    def parse_item(self, response):
        """Build a fully-populated DoubanSpiderItem from a movie detail page."""
        # Rank is rendered as e.g. "No.5"; keep only the number part.
        rank = response.xpath(
            "//span[@class='top250-no']/text()").get().split('.')[1]
        title = response.xpath("//h1/span/text()").get()
        # Strip the surrounding parentheses from the year, e.g. "(1994)".
        year = re.sub(r'\(|\)', '',
                      response.xpath("//span[@class='year']/text()").get())

        info_spans = response.xpath("//div[@id='info']/span")
        director = info_spans[0].xpath(".//a/text()").get()
        screenwriter = ','.join(info_spans[1].xpath(".//a/text()").getall())
        stars = ','.join(info_spans[2].xpath(".//a/text()").getall())
        types = ','.join(response.xpath(
            "//div[@id='info']/span[@property='v:genre']/text()").getall())
        runtime = response.xpath("//span[@property='v:runtime']/text()").get()
        IMDb = response.xpath(
            "//div[@id='info']/a[@rel='nofollow']/@href").get()
        pub_time = response.xpath(
            "//span[@property='v:initialReleaseDate']/@content").get()

        # Country and language are loose text nodes: strip whitespace and
        # slashes, drop empties, keep the first two. (Computed but not
        # stored on the item — presumably intentional; verify upstream.)
        loose_text = response.xpath("//div[@id='info']/text()").getall()
        cleaned = [re.sub(r"\s|/", '', piece) for piece in loose_text]
        country, language, *_ = [piece for piece in cleaned if piece]

        yield DoubanSpiderItem(rank=rank,
                               title=title,
                               year=year,
                               director=director,
                               screenwriter=screenwriter,
                               stars=stars,
                               types=types,
                               runtime=runtime,
                               IMDb=IMDb,
                               origin_url=response.url,
                               pub_time=pub_time,
                               )
Esempio n. 13
0
    def parse_item(self, response):
        """Yield the page's titles/ratings/quotes and queue the next page(s)."""
        pre = 'https://movie.douban.com/top250'
        print(response)
        sel = Selector(response)
        item = DoubanSpiderItem()

        # extract() already returns a list of str; the original wrapped
        # each in a pointless [n for n in ...] identity comprehension.
        item['movie_name'] = sel.xpath(
            '//span[@class="title"][1]/text()').extract()
        item['star'] = sel.xpath(
            '//div[@class="star"]/span[@class="rating_num"]/text()').extract()
        item['quote'] = sel.xpath(
            '//p[@class="quote"]/span[@class="inq"]/text()').extract()

        # Follow the "next" link(s) directly (no index loop); the list is
        # empty on the last page, so the loop simply does nothing there.
        for url in sel.xpath('//span[@class="next"]/a/@href').extract():
            next_url = pre + url
            print(next_url)
            yield Request(next_url,
                          headers=self.headers,
                          callback=self.parse_item)
        yield item
Esempio n. 14
0
 def parse(self, response):
     item = DoubanSpiderItem()
     name = response.css()