Example #1
0
 def parse(self, response):
     """Parse one JSON page of movie subjects and paginate onward.

     Yields one DoubanmovieItem per subject; when the subject list is
     empty, yields one empty item so the pipelines know the crawl is
     finished and can flush to json/csv/db.
     """
     payload = json.loads(response.body_as_unicode())
     subjects = payload["subjects"]
     if not subjects:
         # No data left: an empty item signals the pipelines to save,
         # then report completion on stdout.
         yield DoubanmovieItem()
         print("爬蟲結束.........,pg=%d" % (int(self.pg) * 19))
         return
     for subject in subjects:
         item = DoubanmovieItem()
         for field in ("cover", "id", "is_new", "playable", "rate", "title", "url"):
             item[field] = subject[field]
         yield item
     # Next page: the API offset grows by 20 entries per page.
     self.pg += 1
     yield scrapy.Request(self.next_url.format(self.pg * 20), callback=self.parse)
Example #2
0
    def parse(self, response):
        """Scrape a Top250 list page and follow the "next" link.

        Bug fix: the original created one DoubanmovieItem before the
        loop and yielded that same mutated instance for every movie, so
        any pipeline that buffers items would see the last movie's
        values repeated.  A fresh item is now created per movie.
        """
        selector = Selector(response)
        movies = selector.xpath('//div[@class="info"]')
        for eachMovie in movies:
            item = DoubanmovieItem()  # one fresh item per movie
            # the title is split across several <span> tags
            title = eachMovie.xpath('div[@class="hd"]/a/span/text()').extract()
            fulltitle = ''.join(title)
            movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
            star = eachMovie.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()
            quote = eachMovie.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            # quote is missing for some movies
            item['quote'] = quote[0] if quote else ''
            item['title'] = fulltitle
            item['movieInfo'] = ';'.join(movieInfo)
            item['star'] = star
            yield item

        # the last page has no "next" link
        nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextLink:
            print(nextLink[0])
            yield Request(self.url + nextLink[0], callback=self.parse)
    def parse_a_movie(self, response):
        """Scrape a single movie's detail page into an item.

        Fixes: guard against an empty intro node list (the original
        indexed ``jieshao[0]`` unconditionally and could raise
        IndexError), and drop the dead else-branch — it re-wrapped
        ``item['intro']`` while that field was still '', so its while
        loop could never execute.
        """
        sel = Selector(response)
        item = DoubanmovieItem()
        item['movie'] = sel.xpath('//h1/span[1]/text()').extract()[0]
        item['score'] = sel.xpath('//div[@class="rating_self clearfix"]/strong/text()').extract()[0]
        item['url'] = response.url
        item['intro'] = ''
        jieshao = sel.xpath('//div[@id="link-report"]/span[1]/text()').extract()
        if jieshao and jieshao[0].strip() != '':
            for yige in jieshao:
                item['intro'] = item['intro'] + yige.strip() + '\n'
            # hard-wrap the intro by inserting a newline every 75 chars
            i = 0
            length = len(item['intro'])
            while i < length:
                item['intro'] = item['intro'][:i] + '\n' + item['intro'][i:]
                i += 75
        return item
Example #4
0
    def parse(self, response):
        """Extract rank/title/poster/quote for each movie on the page,
        then follow the "next" link so every page is crawled."""
        for entry in response.xpath('//div[@class="item"]'):
            movie = DoubanmovieItem()
            # ranking number
            movie['rank'] = entry.xpath('div[@class="pic"]/em/text()').extract()
            # movie title
            movie['title'] = entry.xpath(
                'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
            ).extract()
            # poster image url
            movie['pic_url'] = entry.xpath(
                'div[@class="pic"]/a/img/@src').extract()
            # one-line review quote
            movie['inq'] = entry.xpath(
                'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
            ).extract()
            yield movie

        # An absent "next" span means the current page is the last one.
        nextPage = response.xpath('//span[@class="next"]/a/@href')
        if nextPage:
            yield scrapy.Request(response.urljoin(nextPage[0].extract()), self.parse)
Example #5
0
 def parse(self, response):
     """Parse a Top250 list page and follow the "next" link.

     Fixes: (1) a fresh DoubanmovieItem per movie — the original reused
     and mutated one instance for the whole page; (2) ``print`` used as
     a function so the code also runs under Python 3.
     """
     selector = Selector(response)
     Movies = selector.xpath('//div[@class="info"]')
     for eachMoive in Movies:
         item = DoubanmovieItem()
         title = eachMoive.xpath('div[@class="hd"]/a/span/text()').extract()
         fullTitle = ''.join(title)  # title is split across spans
         movieInfo = eachMoive.xpath('div[@class="bd"]/p/text()').extract()
         star = eachMoive.xpath(
             'div[@class="bd"]/div[@class="star"]/span/em/text()').extract()[0]
         quote = eachMoive.xpath(
             'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
         # quote may be absent, so check before indexing
         item['quote'] = quote[0] if quote else ''
         item['title'] = fullTitle
         item['movieInfo'] = ';'.join(movieInfo)
         item['star'] = star
         yield item
     nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
     # page 10 is the last page and has no "next" link
     if nextLink:
         print(nextLink[0])
         yield Request(self.url + nextLink[0], callback=self.parse)
Example #6
0
    def parse_page_item(self, response):
        """Scrape name, score, actors, directors, genres and rater count
        from a movie detail page and hand the item to the pipelines."""
        item = DoubanmovieItem()
        # movie name: first whitespace-separated word of the <h1> heading
        item['name'] = response.xpath("//div[@id='content']/h1/span[1]/text()").extract()[0].strip().split()[0]
        # rating value
        item['score'] = response.xpath("//strong[@class='ll rating_num']/text()").extract()[0].strip()

        # subtree holding the director / actor / genre metadata
        info_tree = response.xpath("//div[@id='info']")

        # cast list, truncated by the shared helper to RATAIN_ACTORS entries
        item['actors'] = self.retain_element_by_num(
            info_tree.xpath("./span[3]/span[2]/a/text()").extract(), RATAIN_ACTORS)

        # directors, truncated to RATAIN_DIRECTORS entries
        item['directors'] = self.retain_element_by_num(
            info_tree.xpath("./span[3]//a/text()").extract(), RATAIN_DIRECTORS)

        # genres, truncated to RATAIN_TYPE entries
        item['types'] = self.retain_element_by_num(
            info_tree.xpath(".//span[@property='v:genre']/text()").extract(), RATAIN_TYPE)

        # number of people who rated the movie
        item['people_number'] = response.xpath("//a[@class='rating_people']/span/text()").extract()[0]
        yield item
Example #7
0
 def parse(self, response):
     """Scrape one list page, then interactively ask whether to crawl
     the next page.

     Fix: the "n" branch ended with ``os.path.exists(0)`` — a no-op
     whose boolean result was discarded (``sys.exit(0)`` was probably
     intended).  In a Scrapy callback simply returning stops the crawl,
     so the bogus call is removed.
     """
     movie_item = response.xpath('//div[@class = "item"]')
     for item in movie_item:
         movie = DoubanmovieItem()
         # ranking
         movie['rank'] = item.xpath('div[@class="pic"]/em/text()').extract()
         # movie title
         movie['title'] = item.xpath(
             'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
         ).extract()
         # one-line review quote
         movie["quote"] = item.xpath(
             'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
         ).extract()
         # poster image url
         movie['pic'] = item.xpath('div[@class="pic"]/a/img/@src').extract()
         yield movie
     # depth crawl: next-page link from the current page
     next_page = response.xpath('//span[@class="next"]/a/@href').extract()
     if next_page:
         # NOTE(review): input() blocks Scrapy's event loop; kept as-is
         # for behaviour parity with the original design.
         choice = input("是否需要抓取下一页:(y/n)?n\n")
         if choice.lower() == 'y':
             next_url = "https://movie.douban.com/top250" + next_page[0]
             yield scrapy.Request(next_url, self.parse)
Example #8
0
 def parse(self, response):
     """Parse a Top250 list page and follow the "next" link.

     Fix: the original built a single DoubanmovieItem before the loop
     and re-yielded the same mutated instance for every movie; each
     movie now gets its own item.
     """
     selector = Selector(response)
     Movies = selector.xpath('//div[@class="info"]')
     for eachMovie in Movies:
         item = DoubanmovieItem()
         title = eachMovie.xpath(
             'div[@class="hd"]/a/span/text()').extract()  # several <span> tags
         fullTitle = "".join(title)  # join the fragments seamlessly
         movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
         star = eachMovie.xpath(
             'div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
         quote = eachMovie.xpath(
             'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
         # quote may be absent, so check before indexing
         item['quote'] = quote[0] if quote else ''
         item['title'] = fullTitle
         item['movieInfo'] = ';'.join(movieInfo)
         item['star'] = star
         yield item
     nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
     # page 10 is the last page and has no "next" link
     if nextLink:
         yield Request(urljoin(response.url, nextLink[0]), callback=self.parse)
Example #9
0
 def parse(self, response):
     """Scrape short comments: author, text, and a 1-5 star value mapped
     from the rating span's Chinese ``title`` attribute.

     Improvements: iterate the result set directly instead of
     ``range(len(...))``, and hoist the loop-invariant rating table out
     of the loop.
     """
     # Chinese rating label -> numeric star value (loop invariant)
     comments_dict = dict(很差=1, 较差=2, 还行=3, 推荐=4, 力荐=5)
     items = []
     soup = BeautifulSoup(response.text, 'html.parser')
     for block in soup.find_all('div', attrs={'class': 'comment'}):
         # DoubanmovieItem is declared in items.py
         item = DoubanmovieItem()
         comments = block.find('span', attrs={'class': 'short'}).text

         info = block.find('span', attrs={'class': 'comment-info'})
         star = info.find_all('span')[1]['title']
         author = info.find('a').text

         item['author'] = author
         item['comments'] = comments
         item['star'] = comments_dict[star]
         items.append(item)
     return items
Example #10
0
 def parse(self, response):
     """Parse a Top250 list page.

     Fixes: (1) a fresh DoubanmovieItem per movie instead of one shared
     mutated instance; (2) the next-page request is issued once, after
     the loop, instead of once per movie; (3) the bare ``except:`` is
     narrowed to IndexError, the only error the list indexing raises.
     """
     all_movie_list = response.xpath('//div[@class="item"]')
     for evevry_moive_info in all_movie_list:
         item = DoubanmovieItem()
         item["count"] = evevry_moive_info.xpath(
             'div[1]/em/text()').extract()
         item["name"] = evevry_moive_info.xpath(
             'div[2]/div[1]/a[1]/span[1]/text()').extract()
         # "director ... / cast ..." line, split on the \xa0 padding
         info_line = evevry_moive_info.xpath(
             'div[2]//div[2]/p/text()').extract()[0].split('\xa0\xa0\xa0')
         item["director"] = info_line[0].strip().replace('导演: ', '')
         try:
             item["stra"] = info_line[1].strip().replace('主演: ', '').replace(
                 '...', '')
         except IndexError:
             # some entries have no cast segment
             item["stra"] = '空数据'
         item["quote"] = evevry_moive_info.xpath(
             'div[2]//div[2]/p[2]/span/text()').extract()
         yield item

     nextPage = response.xpath('//span[@class="next"]/a/@href')
     # an absent "next" link means this is the last page
     if nextPage:
         url = response.urljoin(nextPage[0].extract())
         yield scrapy.Request(url, self.parse)
Example #11
0
    def parse(self, response):
        """Collect rank/title/poster for every movie, then request the
        next page.

        Fix: the original created a single DoubanmovieItem outside the
        loop and yielded that same mutated object for every movie; a
        fresh item is now created per iteration.
        """
        itemList = response.xpath('//div[@class = "item"]')
        for item in itemList:
            movie = DoubanmovieItem()
            # Top250 ranking
            movie['rank'] = item.xpath(
                'div[@class = "pic"]/em/text()').extract()

            # movie title
            movie['title'] = item.xpath(
                'div[@class = "info"]/div[@class = "hd"]/a/span[@class = "title"][1]/text()'
            ).extract()

            # poster url
            movie['poster'] = item.xpath(
                'div[@class = "pic"]/a/img/@src').extract()

            yield movie

        # next-page request
        next_page = response.xpath('//span[@class = "next"]/a/@href')
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, self.parse)
Example #12
0
    def parse(self, response):
        """Scrape the first list entry only — every XPath is anchored on
        ``li[1]`` — storing each field as the raw ``extract()`` list.

        NOTE(review): 'rate' uses a relative XPath against the response,
        which matches nothing from the document root — presumably a
        leftover from a per-item loop; kept as-is for behaviour parity.
        """
        item = DoubanmovieItem()
        field_xpaths = [
            ('rank', '//*[@id="content"]/div/div[1]/ol/li[1]/div/div[1]/em'),
            ('title', '//*[@id="content"]/div/div[1]/ol/li[1]/div/div[1]/a/img'),
            ('link', '//*[@id="content"]/div/div[1]/ol/li[1]/div/div[1]/a'),
            ('star',
             '//*[@id="content"]/div/div[1]/ol/li[1]/div/div[2]/div[2]/div/span[2]'),
            ('rate',
             'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'),
            ('quote',
             '//*[@id="content"]/div/div[1]/ol/li[1]/div/div[2]/div[2]/p[2]/span'),
        ]
        for field, xp in field_xpaths:
            item[field] = response.xpath(xp).extract()
        yield item
Example #13
0
 def parse(self, response):
     """Parse a Top250 page: rank, title, rating and quote per movie.

     Fixes: (1) a fresh DoubanmovieItem per movie instead of one shared
     mutated instance; (2) the next-page lookup/request used to run
     inside the movie loop, re-yielding the same pagination request once
     per movie (only Scrapy's dedup filter prevented re-crawls) — it now
     runs once, after the loop.
     """
     selector = scrapy.Selector(response)
     movies = selector.xpath('//div[@class="item"]')
     for each in movies:
         item = DoubanmovieItem()
         num = each.xpath('div[@class="pic"]/em/text()').extract()[0]
         title = each.xpath(
             'div[@class="info"]/div[@class="hd"]/a/span[@class="title"]/text()'
         ).extract()[0]
         star = each.xpath(
             'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract()[0]
         quote = each.xpath(
             'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
         ).extract_first()
         if quote is None:
             quote = ' '
         item['quote'] = quote
         item['star'] = star
         item['title'] = title
         item['num'] = num
         yield item

     nextPage = selector.xpath(
         '//span[@class="next"]/link/@href').extract_first()
     if nextPage:
         next = response.urljoin(nextPage)
         print(next)
         yield scrapy.http.Request(next, callback=self.parse)
Example #14
0
 def parse(self, response):
     """Yield name, detail-page url and rating for every row of the
     chart table."""
     for row in response.xpath('//tr[@class="item"]'):
         record = DoubanmovieItem()
         record['name'] = row.xpath('.//a[@class="nbg"]/img/@alt').extract_first()
         record['url'] = row.xpath('.//a/@href').extract_first()
         record['rating'] = row.xpath('.//span[@class="rating_nums"]/text()').extract_first()
         yield record
Example #15
0
    def parse_item(self,response):
        """Scrape one movie detail page (Python 2 code: print statements
        and str.decode on literals).

        Structured fields (name, director, writer, summary, cast, genre)
        come straight from XPath selects; language, release date,
        runtime and country are regex-matched out of the raw #info block
        HTML.  Every value is cleaned via self._string_deal before being
        stored on the item.
        """
        print "=============START"
        print response.body
        print "=============END"
        hxs = HtmlXPathSelector(response)
        movie_name = hxs.select('//*[@id="content"]/h1/span[1]/text()').extract()
        movie_director = hxs.select('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
        movie_writer = hxs.select('//*[@id="info"]/span[2]/span[2]/a/text()').extract()
        movie_description = hxs.select('//*[@id="link-report"]//*[@property="v:summary"]/text()').extract()
        movie_roles = hxs.select('//*[@id="info"]/span[3]/span[2]//*[@rel="v:starring"]/text()').extract()
        movie_type = hxs.select('//*[@id="info"]//*[@property="v:genre"]/text()').extract()

        # Raw #info block flattened to one string for the regexes below.
        movie_detail = hxs.select('//*[@id="info"]').extract()
        movie_detail_str = ''.join(movie_detail).strip()

        # Regex patterns keyed on the Chinese field labels in the HTML
        # (decoded to unicode so they match the unicode detail string).
        movie_language_str = '.*语言:</span> (.+?)<br>'.decode("utf8")
        movie_date_str = '.*上映日期:</span> <span property="v:initialReleaseDate" content="(\S+?)">(\S+?)</span>.*'.decode("utf8")
        movie_long_str = '.*片长:</span> <span property="v:runtime" content="(\d+).*'.decode("utf8")
        movie_country_str = '.*制片国家/地区:</span> (.+?)<br>'.decode("utf8")
        
        pattern_language =re.compile(movie_language_str,re.S)
        pattern_date = re.compile(movie_date_str,re.S)
        pattern_long = re.compile(movie_long_str,re.S)
        pattern_country = re.compile(movie_country_str,re.S)
        
        movie_language = re.search(pattern_language,movie_detail_str)
        movie_date = re.search(pattern_date,movie_detail_str)
        movie_long = re.search(pattern_long,movie_detail_str)
        movie_country = re.search(pattern_country,movie_detail_str)


        # Copy the cleaned values onto the item.
        item = DoubanmovieItem()
        item['movie_name'] = self._string_deal(''.join(movie_name))
        item['movie_director'] = self._string_deal(' '.join(movie_director))
        item['movie_description'] = self._string_deal(''.join(movie_description[0])) if len(movie_description) else ''
        item['movie_writer'] = self._string_deal(' '.join(movie_writer))
        item['movie_roles'] = self._string_deal(' '.join(movie_roles))
        item['movie_type'] = self._string_deal(' '.join(movie_type))

        # Regex-derived fields default to "" when the pattern found nothing.
        item['movie_language'] = ""
        if movie_language:
            item['movie_language'] = self._string_deal(movie_language.group(1))

        item['movie_date'] = ""
        if movie_date:
            item['movie_date'] = self._string_deal(movie_date.group(1))

        item['movie_long'] = ""
        if movie_long:
            item['movie_long'] = self._string_deal(movie_long.group(1))

        item['movie_country'] = ""
        if movie_country:
            item['movie_country'] = self._string_deal(movie_country.group(1))
        yield item
Example #16
0
 def parse(self, response):
     """Pull title + detail link from each .hd block and chain a request
     to parse2 with the partially-filled item attached via meta."""
     page = bs(response.text, 'html.parser')  # parse with the html.parser backend
     for entry in page.find_all('div', attrs={'class': 'hd'}):
         item = DoubanmovieItem()
         anchor = entry.find('a')
         item["title"] = anchor.find('span').text
         item["link"] = anchor.get('href')
         yield scrapy.Request(url=item["link"], meta={'item': item}, callback=self.parse2)
Example #17
0
    def parse(self, response):
        """Build one item per call from pre-collected lists, using
        ``self.i`` as a cursor, then chase the next detail url.

        NOTE(review): rank/title/star/rate/quote/url are read as globals
        here — presumably module-level lists filled from the list page;
        confirm they exist and hold at least 250 entries before this
        callback runs.
        """

        item = DoubanmovieItem()
        item['rank'] = rank[self.i]
        item['title'] = title[self.i]
        item['star'] = star[self.i]
        item['rate'] = rate[self.i]
        item['quote'] = quote[self.i]
        # full detail block of the current response page
        item['detail'] = response.xpath('//*[@id="link-report"]').extract()[0]
        self.i = self.i + 1
        yield item
        # request the next movie's detail page until all 250 are done
        if self.i < 250:
            yield scrapy.Request(url[self.i], self.parse)
Example #18
0
 def parse(self, response):
     """Extract each movie's title and detail link, then defer the
     remaining fields to parse2 via the request's meta dict."""
     doc = BeautifulSoup(response.text, 'html.parser')
     headers = doc.find_all('div', attrs={'class': 'hd'})
     for header in headers:
         # DoubanmovieItem is declared in items.py
         item = DoubanmovieItem()
         anchor = header.find('a')
         item['title'] = anchor.find('span').text
         item['link'] = anchor.get('href')
         yield scrapy.Request(url=item['link'], meta={'item': item}, callback=self.parse2)
Example #19
0
    def parse(self, response):  # list-page callback
        """For every .hd block, record title/link on a new item and hand
        it off to parse2 through the request meta."""
        soup = BeautifulSoup(response.text, 'html.parser')
        for block in soup.find_all('div', attrs={'class': 'hd'}):
            item = DoubanmovieItem()
            item['title'] = block.find('a').find('span').text
            item['link'] = block.find('a').get('href')
            # the detail-page callback fills in the remaining fields
            yield scrapy.Request(url=item['link'],
                                 meta={'item': item},
                                 callback=self.parse2)
Example #20
0
    def parse(self, response):
        """Yield title + rating for each list entry, then follow the
        "next" link until the last page."""
        for info in response.xpath('//div[@class="item"]'):
            record = DoubanmovieItem()
            record['title'] = info.xpath(
                'div[@class="pic"]/a/img/@alt').extract()
            record['star'] = info.xpath(
                'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()
            yield record

        # an absent "next" link means this page is the last one
        next_page = response.xpath('//span[@class="next"]/a/@href')
        if next_page:
            yield scrapy.Request(response.urljoin(next_page[0].extract()), self.parse)
Example #21
0
 def parse(self, response):
     """Yield title/rating/link for each movie info block.

     Fix: the original also appended every item to a local ``items``
     list that was never read afterwards — dead state, removed.
     """
     sel = Selector(response)
     sites = sel.xpath('//div[@class="item"]/div[@class="info"]')
     for site in sites:
         item = DoubanmovieItem()
         item['title'] = site.xpath(
             'div[@class="hd"]/a/span[@class="title"]/text()')[0].extract()
         item['rating_num'] = site.xpath(
             'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract()
         item['link'] = site.xpath('div[@class="hd"]/a/@href').extract()
         yield item
Example #22
0
 def parse(self, response):
     """Yield rank, name and poster url for every movie block on the
     page."""
     for block in response.xpath('//div[@class="item"]'):
         movie = DoubanmovieItem()
         # ranking text
         movie['rank'] = block.xpath(
             'div[@class="pic"]/em/text()').extract()
         # movie name
         movie['name'] = block.xpath(
             'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
         ).extract()
         # poster image url
         movie['pic'] = block.xpath('div[@class="pic"]/a/img/@src').extract()
         yield movie
Example #23
0
    def parse(self, response):
        """Collect title/link items from all .hd blocks and return them
        as a list.

        Improvement: iterate the result set directly instead of indexing
        through ``range(len(...))``.
        """
        items = []
        soup = BeautifulSoup(response.text, 'html.parser')
        for entry in soup.find_all('div', attrs={'class': 'hd'}):
            # DoubanmovieItem is declared in items.py
            item = DoubanmovieItem()
            anchor = entry.find('a')
            item['title'] = anchor.find('span').text
            item['link'] = anchor.get('href')
            items.append(item)
        return items
Example #24
0
 def parse_subject(self, response):
     """Scrape a movie subject page: name, intro, actors, date and
     director, each run through XPath normalize-space to collapse
     whitespace."""
     item = DoubanmovieItem()

     def normalized(xp):
         # select the node, then take its whitespace-collapsed string value
         return response.xpath(xp).xpath('normalize-space(string(.))')

     item['movie_name'] = normalized('//*[@id="content"]/h1/span[1]').extract()[0]
     item['intro'] = normalized('//*[@id="link-report"]/span').extract()[0]
     # actors kept as the raw extract() list, matching the item schema
     item['actors'] = normalized('//*[@id="info"]/span[3]/span[2]').extract()
     item['date'] = normalized('//*[@id="info"]/span[11]').extract()[0]
     item['director'] = normalized('//*[@id="info"]/span[1]/span[2]/a').extract()[0]
     return item
Example #25
0
    def deail_parse(self, response):
        """Detail-page callback: collect image url, title, year, rating
        and one hot comment into a DoubanmovieItem."""
        self.logger.info('Parse function called on %s', response.url)
        result = DoubanmovieItem()
        result['image_url'] = response.xpath('//*[@id="mainpic"]/a/img/@src').get()
        result['title'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').get()
        result['year'] = response.xpath('//*[@id="content"]/h1/span[2]/text()').get()
        result['star'] = response.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').get()
        # first comment from the hot-comments section
        result['comment'] = response.xpath(
            '//*[@id="hot-comments"]//div/div/p/span/text()').get()

        yield result
Example #26
0
 def parse(self, response):
     """Yield rank and title for every movie block on the page."""
     for block in response.xpath('//div[@class = "item"]'):
         movie = DoubanmovieItem()
         # ranking
         movie['rank'] = block.xpath('div[@class="pic"]/em/text()').extract()
         # movie title
         movie['title'] = block.xpath('div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()').extract()
         yield movie
Example #27
0
    def parse_item(self, response):
        """Scrape name/year/score/director/genres/actors from a movie
        detail page.

        Fix: the actor XPath began with ``////*`` — a quadruple slash,
        which is not valid XPath 1.0 syntax and raises an evaluation
        error at runtime; corrected to ``//*``.
        """
        sel = response
        item = DoubanmovieItem()
        item['name'] = sel.xpath(
            '//*[@id="content"]/h1/span[1]/text()').extract()
        # the year sits in parentheses in the second heading span
        item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(
            r'\((\d+)\)')
        item['score'] = sel.xpath(
            '//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director'] = sel.xpath(
            '//*[@id="info"]/span[1]/span[2]/a/text()').extract()
        item['classification'] = sel.xpath(
            '//span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath(
            '//*[@id="info"]/span[3]/span[2]/a/text()').extract()

        return item
Example #28
0
    def parse(self, response):
        """Parse a chart page (div.pl2 blocks): title, info line, rating
        and detail url per movie, then follow the "next" link.

        Fixes: (1) a fresh DoubanmovieItem per movie — the original
        mutated and re-yielded one shared instance; (2) ``print`` used
        as a function so the code also runs under Python 3; dead
        commented-out code removed.
        """
        selector = Selector(response)
        movies = selector.xpath('//div[@class="pl2"]')

        for each in movies:
            item = DoubanmovieItem()
            URL = each.xpath('a/@href').extract()

            # the title is split across the <a> text and a nested <span>
            title = each.xpath('a/text()').extract()
            title2 = each.xpath('a/span/text()').extract()
            fullTitle = ''.join(title) + ''.join(title2)

            movieInfo = each.xpath('p[@class="pl"]/text()').extract()

            star = each.xpath(
                'div[@class="star clearfix"]/span[@class="rating_nums"]/text()'
            ).extract()

            item['title'] = fullTitle
            item['movieInfo'] = ';'.join(movieInfo)
            item['star'] = star
            item['full_URL'] = URL
            yield item

        nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextLink:
            print(nextLink[0])
            yield Request(nextLink[0], callback=self.parse)
Example #29
0
    def parse(self, response):
        """Yield name/actor/rating/info per movie, then request the next
        offset page until offset 225 (ten pages in total).

        Values are utf-8 encoded byte strings, exactly as in the
        original implementation.
        """
        for block in response.xpath('//div[@class="info"]'):
            item = DoubanmovieItem()
            item['name'] = block.xpath('./div[@class="hd"]/a/span[1]/text()').extract()[0].encode('utf-8')
            item['actor'] = block.xpath('./div[@class="bd"]/p[1]/text()').extract()[0].strip().encode('utf-8')
            item['rating'] = block.xpath('./div//div/span[@class="rating_num"]/text()').extract()[0].encode('utf-8')
            quotes = block.xpath('./div[@class="bd"]/p[@class="quote"]/span/text()').extract()
            # the one-line quote is missing for some movies
            item['info'] = quotes[0].strip().encode('utf-8') if quotes else ' '
            yield item

        if self.page < 225:
            self.page += 25
            yield scrapy.Request(self.url + str(self.page), callback=self.parse)
Example #30
0
 def parse(self, response):
     """Yield moviename/info/star (and quote, when one exists) for each
     movie, then request the next offset page up to 225."""
     for block in response.xpath('//div[@class="info"]'):
         item = DoubanmovieItem()
         item['moviename'] = block.xpath(
             './div[@class="hd"]/a/span[1]/text()').extract()[0]
         item['info'] = block.xpath(
             './div[@class="bd"]/p/text()').extract()[0]
         item['star'] = block.xpath(
             './/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract()[0]
         quote = block.xpath('.//p[@class="quote"]/span/text()').extract()
         # note: the 'quote' field is deliberately left unset when absent
         if quote:
             item["quote"] = quote[0]
         yield item
     if self.offset < 225:
         self.offset += 25
         yield scrapy.Request(self.url + str(self.offset),
                              callback=self.parse)