Ejemplo n.º 1
0
 def parse_book(self, response):
     """Parse a Douban book detail page into a DoubanItem.

     Reads publisher, publish date, binding, ISBN, price, page count and
     the writer/translator credits from the page's #info block.

     Fix: the translator fallback used a bare ``except:`` which would also
     swallow KeyboardInterrupt/SystemExit; it now catches only the
     IndexError raised when no second credit link exists.
     """
     item = DoubanItem()
     item['publish'] = response.xpath(
         u'//span[contains(./text(), "出版社:")]/following::text()[1]'
     ).extract()[0]
     item['publish_time'] = response.xpath(
         u'//span[contains(./text(), "出版年:")]/following::text()[1]'
     ).extract()[0]
     item['decorate'] = response.xpath(
         u'//span[contains(./text(), "装帧:")]/following::text()[1]'
     ).extract()[0]
     item['ISBN'] = response.xpath(
         u'//span[contains(./text(), "ISBN:")]/following::text()[1]'
     ).extract()[0]
     item['price'] = response.xpath(
         u'//span[contains(./text(), "定价:")]/following::text()[1]'
     ).extract()[0]
     item['page_num'] = response.xpath(
         u'//span[contains(./text(), "页数:")]/following::text()[1]'
     ).extract()[0]
     # First link inside #info is the author; a second link, when
     # present, names the translator.
     writerList = response.xpath(
         '//div[@id="info"]/span/a/text()').extract()
     item['writer'] = writerList[0]
     try:
         item['translator'] = writerList[1]
     except IndexError:  # no translator credited
         item['translator'] = '无'
     yield item
Ejemplo n.º 2
0
    def parse(self, response):
        """Extract name/column from the Douban homepage "time" block."""
        # item behaves like a dict; field names are validated against the
        # Item definition.

        # Dump the response body for debugging:
        # with open('douban.html','w') as f:
        #     f.write(response.body)

        # Parse the data.
        item = DoubanItem()
        # xpath parsing: extract() returns a list; use [0] for the first
        # element.
        # Available extraction styles: xpath, css and regex.

        # xpath: response.xpath(...) + extract() gives a list; [0] is the
        # first match.
        # Or extract_first('no data'): returns the first match, or the
        # given default instead of raising IndexError when nothing matched.
        # css: response.css('.text a::text') — '.' selects a class, '#' an
        # id, a bare name selects that tag; ::text yields the text node and
        # ::attr(href) an attribute value. Standard CSS selectors apply.
        # xpath and css calls can be chained into each other.
        # Regex must be chained after an xpath/css call or it errors:
        # response.xpath('.').re(''); re_first returns the first match.

        item['name'] = response.xpath(
            '//*[@id="anony-time"]/div/div[3]/ul/li[1]/a[2]/text()').extract(
            )[0]
        item['column'] = response.xpath(
            '//*[@id="anony-time"]/div/div[3]/ul/li[1]/span/text()').extract(
            )[0]
        # Detail-page URL — NOTE(review): computed but never used in this
        # variant (a sibling version of this method follows it via
        # scrapy.Request with the item in meta).
        detail_url = response.xpath(
            '//*[@id="statuses"]/div[2]/div[1]/div/div/div[2]/div[1]/div[2]/div/a/@href'
        )
        print(item)
        # yield hands the item to the engine, which forwards it to the
        # pipelines.
        yield item
Ejemplo n.º 3
0
    def parse(self, response):
        """Yield title/score/info per movie card, then queue the next page.

        Fix: the pagination block was indented inside the per-node loop,
        so every one of the ~25 movie cards yielded another page request
        and bumped ``self.offset`` once per card; it now runs once per
        response, after the loop.
        """
        print("resonse.url===", response.url)

        node_list = response.xpath('//div[@class="item"]')

        for node in node_list:
            title = node.xpath('.//span[@class="title"][1]/text()').extract()
            score = node.xpath(
                './/div[@class="star"]/span[2]/text()').extract()
            info = node.xpath('.//div[@class="info"]//p/span/text()').extract()
            # Collapse each fragment list into a single space-joined
            # string; empty lists are left as-is.
            if title:
                title = " ".join(title)
            if score:
                score = " ".join(score)
            if info:
                info = " ".join(info)

            item = DoubanItem()
            item["title"] = title
            item["score"] = score
            item["info"] = info
            yield item

        # Next page — exactly one follow-up request per response.
        if self.offset < 225:
            self.offset += 25
            new_next_url = self.url + str(self.offset) + "&filter="
            yield scrapy.Request(new_next_url, callback=self.parse)
Ejemplo n.º 4
0
    def parse_item(self, response):
        """Yield one DoubanItem per movie card found on this page."""
        print("response.url================================", response.url)

        for node in response.xpath('//div[@class="info"]'):
            print("--" * 100)
            # First title span of the card.
            title_text = node.xpath(
                './/span[@class="title"][1]/text()').extract()[0]
            # Credits / metadata paragraph.
            content_text = node.xpath(
                './/div[@class="bd"]/p/text()').extract()[0]
            # Numeric rating.
            score_text = node.xpath(
                './/div[@class="star"]/span[2]/text()').extract()[0]
            # One-line quote; left as an empty list when absent.
            quote_texts = node.xpath(
                './/p[@class="quote"]/span/text()').extract()
            if quote_texts:
                quote_texts = quote_texts[0]

            item = DoubanItem()
            # Field spelled "tilte" (sic) to match the DoubanItem definition.
            item["tilte"] = title_text
            item["content"] = content_text
            item["score"] = score_text
            item["info"] = quote_texts

            yield item
Ejemplo n.º 5
0
    def parse_item(self, response):
        """Extract name/score/info/desc for each top-250 movie entry."""
        movie_nodes = response.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div[2]')

        for movie in movie_nodes:
            item = DoubanItem()
            # Movie title: first <span> of the header link.
            item['name'] = movie.xpath(
                './div[1]/a/span[1]/text()').extract_first()
            # Rating number.
            item['score'] = movie.xpath(
                './div[2]/div/span[2]/text()').extract_first()
            # Credits line: strip each fragment, concatenate, and drop
            # non-breaking spaces.
            fragments = movie.xpath('./div[2]/p[1]/text()').extract()
            item['info'] = ''.join(
                part.strip() for part in fragments).replace('\xa0', '')
            # One-line quote.
            item['desc'] = movie.xpath(
                './div[2]/p[2]/span/text()').extract_first()

            yield item
Ejemplo n.º 6
0
    def parse(self, response):
        """Yield one DoubanItem per movie card, then follow the next page.

        NOTE(review): this is Python 2 code — str.encode("utf8") returns a
        str there, so the chained .replace works; under Python 3 encode()
        yields bytes and the replace call would raise TypeError.
        """
        for info in response.xpath('//div[@class="item"]'):
            item = DoubanItem()
            # Each scalar field: first text match, stripped, UTF-8 encoded,
            # with single quotes removed.
            item['rank'] = info.xpath('div[@class="pic"]/em/text()').extract(
            )[0].strip().encode("utf8").replace("\'", "")
            item['title'] = info.xpath('div[@class="pic"]/a/img/@alt').extract(
            )[0].strip().encode("utf8").replace("\'", "")
            item['link'] = info.xpath('div[@class="pic"]/a/@href').extract(
            )[0].strip().encode("utf8").replace("\'", "")
            item['star'] = info.xpath(
                'div[@class="info"]/div[@class="bd"]/p[1]/text()').extract(
                )[0].strip().encode("utf8").replace("\'", "")
            # Multi-fragment fields are folded by the spider's sumChildStr
            # helper (defined elsewhere in the spider class).
            list_quote = info.xpath(
                'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
            ).extract()
            item['quote'] = self.sumChildStr(list_quote).replace("\'", "")
            list_rate = info.xpath(
                'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'
            ).extract()
            item['rate'] = self.sumChildStr(list_rate).replace("\'", "")

            yield item

        # Follow pagination until the configured MAX_PAGE is reached.
        next_page = response.xpath('//span[@class="next"]/a/@href')
        if next_page and self.page < self.MAX_PAGE:
            url = response.urljoin(next_page[0].extract())
            self.page += 1
            yield scrapy.Request(url, self.parse)
Ejemplo n.º 7
0
    def parse_item(self, response):
        """Yield title/score/info for every movie card on the page."""
        print("resonse.url===", response.url)

        for card in response.xpath('//div[@class="item"]'):
            raw_title = card.xpath(
                './/span[@class="title"][1]/text()').extract()
            raw_score = card.xpath(
                './/div[@class="star"]/span[2]/text()').extract()
            raw_info = card.xpath(
                './/div[@class="info"]//p/span/text()').extract()

            item = DoubanItem()
            # Join non-empty fragment lists with spaces; empty lists are
            # stored unchanged, matching the original behavior.
            item["title"] = " ".join(raw_title) if raw_title else raw_title
            item["score"] = " ".join(raw_score) if raw_score else raw_score
            item["info"] = " ".join(raw_info) if raw_info else raw_info

            yield item
Ejemplo n.º 8
0
 def parse(self, response):
     """Parse a Douban book page into a DoubanItem, then crawl onward.

     Reads author, publisher, dates, page count, binding, ISBN, price,
     rating and tags from the #info block, then follows every link in the
     "content clearfix" listing back into this same callback.

     Fix: the whitespace-stripping regex is now a raw string (r"\\s") —
     the original "\\s" relied on Python keeping an invalid string escape,
     which emits a DeprecationWarning and will break in future versions.
     """
     item = DoubanItem()
     author = response.xpath(
         "//div[@id='info']/span[contains(./text(), '作者:')]/following-sibling::a[1]/text()"
     ).extract()[0]
     # Remove all whitespace from the author credit.
     item["author"] = re.sub(r"\s", "", string=author)
     item["publish_house"] = response.xpath(
         "//div[@id='info']/span[contains(./text(), '出版社:')]/following::text()[1]"
     ).extract()[0]
     item["publish_date"] = response.xpath(
         "//div[@id='info']/span[contains(./text(), '出版年:')]/following::text()[1]"
     ).extract()[0]
     item["page_num"] = response.xpath(
         "//div[@id='info']/span[contains(./text(), '页数:')]/following::text()[1]"
     ).extract()[0]
     item["package"] = response.xpath(
         "//div[@id='info']/span[contains(./text(), '装帧:')]/following::text()[1]"
     ).extract()[0]
     item["ISBN"] = response.xpath(
         "//div[@id='info']/span[contains(./text(), 'ISBN:')]/following::text()[1]"
     ).extract()[0]
     item["price"] = response.xpath(
         "//div[@id='info']/span[contains(./text(), '定价:')]/following::text()[1]"
     ).extract()[0]
     item["remark"] = response.xpath(
         "//strong[@class='ll rating_num ']/text()").extract()[0]
     item["tags"] = response.xpath("//a[@class='  tag']/text()").extract()
     yield item
     # Follow the next batch of book links with the same callback.
     url_list = response.xpath(
         "//div[@class='content clearfix']/dl/dd/a/@href").extract()
     for url in url_list:
         yield scrapy.Request(url=url, callback=self.parse)
Ejemplo n.º 9
0
    def parse_movie(self, response):
        """Populate a DoubanItem through an ItemLoader from a movie page.

        Fields are driven by settings: INFO_XPATH maps field -> xpath, RE
        maps field -> regex run over the raw #info HTML.

        Fix: the ALLOW_COVER flag is read with ``settings.getbool()``
        instead of ``settings.get(...) == True`` — settings overridden on
        the command line arrive as strings, for which the equality test
        was always False.
        """
        loader = ItemLoader(item=DoubanItem(), response=response)

        # Declarative xpath-based fields.
        for attr, xpath in self.settings.getdict('INFO_XPATH').items():
            loader.add_xpath(attr, xpath)

        # Regex-based fields over the raw #info markup.
        s = response.xpath('//div[@id="info"]').extract_first()
        for attr, regex in self.settings.getdict('RE').items():
            loader.add_value(attr, re.findall(regex, s))

        loader.add_value('rate', self.parse_rate(response))
        loader.add_value('url', response.url)

        if self.settings.getbool('ALLOW_COVER'):
            # Build absolute cover-image URLs from the poster <img> src.
            image_urls = self._get_urls(
                self.image_base_url,
                urljoin,
                response.xpath('//div[@id="mainpic"]/a/img/@src').extract(),
                lambda s: s.split('/')[-1],
            )

            loader.add_value('image_urls', image_urls)

        return loader.load_item()
Ejemplo n.º 10
0
    def parse(self, response):
        """Yield title/director/introduce/link per movie, then paginate.

        Fix: ``response.xpath()`` returns a SelectorList, never None, so
        the original ``if span is None`` branch was dead code and movies
        without a quote crashed with IndexError; emptiness is now tested
        with ``if not span``.
        """
        node_list = response.xpath("//ol/li")
        #node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            # Item object that carries the scraped fields.
            item = DoubanItem()
            item['title'] = node.xpath(".//span[1]/text()").extract()[0]

            # Second whitespace-separated token of the credits line.
            item['director'] = node.xpath(
                ".//p/text()").extract()[0].split()[1]
            span = node.xpath(".//p[@class='quote']/span")
            if not span:  # empty SelectorList — no quote on this movie
                item['introduce'] = '########'
            else:
                item['introduce'] = span.xpath(".//text()").extract()[0]

            item['link'] = node.xpath(".//a[1]/@href").extract()[0]

            # yield returns the item to the engine and resumes here.
            yield item

        # URL-splicing pagination: used when the page offers no clickable
        # next link and the offset must be appended by hand.
        if self.off_set < 250:
            self.off_set += 25
            url = self.base_url + str(self.off_set)
            yield scrapy.Request(url, callback=self.parse)
Ejemplo n.º 11
0
 def parse(self, response):
     """Yield one item per movie on a top-250 page, then follow "next".

     Fix: the DoubanItem is now created inside the loop — the original
     built a single item before the loop and mutated/yielded that same
     object for every movie, so any downstream consumer holding a
     reference saw only the last movie's values.
     """
     movies = response.xpath('//ol[@class="grid_view"]/li')
     for movie in movies:
         item = DoubanItem()
         item["排名"] = movie.xpath(
             './/div[@class="pic"]/em/text()').extract()[0]
         item["名字"] = movie.xpath(
             './/div[@class="hd"]/a/span[1]/text()').extract()[0]
         # Second whitespace-separated token of the first credits line.
         item["导演"] = movie.xpath('.//p[@class=""]/text()[1]'
                                  ).extract_first().strip().split(' ')[1]
         # Second text line is "year / country / genres".
         item["年份"] = movie.xpath('.//p[@class=""]/text()[2]'
                                  ).extract_first().split('/')[0].strip()
         item["国家"] = movie.xpath('.//p[@class=""]/text()[2]'
                                  ).extract_first().split('/')[1].strip()
         item["类型"] = movie.xpath('.//p[@class=""]/text()[2]'
                                  ).extract_first().split('/')[2].strip()
         item["评分"] = movie.xpath(
             './/div[@class="star"]/span[@class="rating_num"]/text()'
         ).extract()[0]
         item["评价"] = movie.xpath(
             './/p[@class="quote"]/span/text()').extract()[0]
         yield item
     # Relative next-page href appended to the list URL.
     next_url = response.xpath('//span[@class="next"]/a/@href').extract()
     if next_url:
         next_url = 'https://movie.douban.com/top250' + next_url[0]
         yield Request(next_url, headers=self.headers)
Ejemplo n.º 12
0
    def parse(self, response):
        """Scrape name/column, then follow the detail URL carrying the item.

        Fix: the detail URL is now reduced to a string with
        ``extract_first()`` — the original passed a SelectorList straight
        to ``scrapy.Request``, which requires a str url and raises
        TypeError at runtime.
        """
        item = DoubanItem()
        # xpath + extract() gives a list; [0] takes the first match.
        item['name'] = response.xpath(
            '//*[@id="anony-time"]/div/div[3]/ul/li[1]/a[2]/text()').extract(
            )[0]
        item['column'] = response.xpath(
            '//*[@id="anony-time"]/div/div[3]/ul/li[1]/span/text()').extract(
            )[0]
        # Detail-page URL as text (first match).
        detail_url = response.xpath(
            '//*[@id="statuses"]/div[2]/div[1]/div/div/div[2]/div[1]/div[2]/div/a/@href'
        ).extract_first()
        print(item)
        # Don't yield the item yet — pass it through meta so the detail
        # callback can finish filling it and store one combined record.
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
Ejemplo n.º 13
0
class Douban_can_video(scrapy.Spider):
    """Spider for Douban's "playable" movie AJAX listing.

    NOTE(review): Python 2 code (print statements, urllib.urlencode,
    xrange). A single item instance is shared at class level and mutated
    across requests — concurrent responses may interleave its fields.
    """
    name = 'Douban_can_video'
    # Shared, spider-level item — mutated by every callback below.
    item = DoubanItem()
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/j/search_subjects?']
    headers = {
        'Referer':'https://movie.douban.com/',
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                       'Chrome/55.0.2883.87 Safari/537.36'
    }

    # Query-string parameters for the search_subjects AJAX endpoint.
    values = {
        'type': 'movie',
        'tag': '可播放',
        'sort': 'recommend',
        'page_limit': '20',
        'page_start': '0',
    }

    # 'https://movie.douban.com/j/search_subjects?'

    def start_requests(self):
        """Build the AJAX URL from `values` and issue the first request."""
        # yield scrapy.Request(self.start_urls[0],headers=self.headers,callback=self.parse)
        # hot: 309  latest: 500
        data = urllib.urlencode(self.values)
        url = self.start_urls[0] + data
        self.item['path'] = 'Image/可播放'
        yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        """Decode the JSON subject list and request each movie page."""
        s = json.loads(response.body)
        a = s['subjects']
        for i in xrange(len(a)):
            self.url = a[i]['url']
            self.item['url'] = self.url
            yield scrapy.Request(self.url,
                                 self.parse_info,
                                 headers=self.headers)

    def parse_info(self, response):
        """Fill title/year/poster/info/synopsis from a movie detail page."""
        print '*****************8'
        print response
        self.item['title'] = response.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()[0]
        self.item['year'] = response.xpath(
            '//h1/span[@class="year"]/text()').extract()[0]
        self.item['image_urls'] = response.xpath(
            '//div[@id="mainpic"][@class=""]/a/img/@src').extract()
        # string(...) flattens a node's multi-line text into one string.
        self.item['info'] = response.xpath(
            'string(//div[@id="info"])').extract()[0]
        # content = response.xpath('string(//div[@class="related-info"])').extract()[0]
        # normalize-space collapses internal whitespace runs to one space.
        self.item['content'] = response.xpath(
            'normalize-space(string(//div[@class="related-info"]))').extract(
            )[0]
        yield self.item
Ejemplo n.º 14
0
 def parse_item(self, response):
     """Yield name/score/info/desc for each ranked movie entry."""
     for entry in response.xpath('//*[@id="content"]/div/div[1]/ol/li'):
         item = DoubanItem()
         # Title span of the header link.
         item['name'] = entry.xpath(
             './div/div[2]/div[1]/a/span[1]/text()').extract_first()
         # Rating number.
         item['score'] = entry.xpath(
             './div/div[2]/div[2]/div/span[2]/text()').extract_first()
         # Credits line, trimmed.
         item['info'] = entry.xpath(
             './div/div[2]/div[2]/p[1]/text()').extract_first().strip()
         # One-line quote.
         item['desc'] = entry.xpath(
             './div/div[2]/div[2]/p[2]/span/text()').extract_first()
         yield item
Ejemplo n.º 15
0
 def parse_item(self, response):
     """Parse a movie detail page; every field is an extracted text list.

     Fix: score/director/classification/actor previously stored raw
     Selector objects (the original in-source comments flagged the output
     as broken); ``.extract()`` now converts them to plain text lists,
     consistent with the name/year fields.
     """
     sel = Selector(response)
     item = DoubanItem()
     item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
     # Year digits pulled out of the "(YYYY)" suffix.
     item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
     item['score'] = sel.xpath(
         '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
     item['director'] = sel.xpath(
         '//*[@id="info"]/span[1]/span[2]/a/text()').extract()
     item['classification'] = sel.xpath(
         '//span[@property="v:genre"]/text()').extract()
     item['actor'] = sel.xpath(
         '//*[@id="info"]/span[3]/span[2]/text()').extract()
     return item
Ejemplo n.º 16
0
    def parse_next(self, response):
        """Print title/info/rank/intro for each book row (Python 2).

        NOTE(review): items are printed rather than yielded, so nothing
        from this callback reaches the item pipelines.
        """
        for item in response.xpath('//tr[@class="item"]'):
            book = DoubanItem()
            # Second table cell holds the title link, publish line,
            # rating and intro.
            book['title'] = item.xpath('td[2]/div[1]/a/@title').extract_first()
            book['info'] = item.xpath('td[2]/p/text()').extract_first()
            book['rank'] = item.xpath(
                'td[2]/div[2]/span[2]/text()').extract_first()
            book['intro'] = item.xpath(
                'td[2]/p/span[1]/text()').extract_first()
            print book
Ejemplo n.º 17
0
    def parse_item(self, response):
        """Yield name/score/info/desc for each div.info movie node."""
        for info_node in response.xpath('//div[@class="info"]'):
            item = DoubanItem()
            item['name'] = info_node.xpath(
                './div[1]/a/span[1]/text()').extract_first()
            item['score'] = info_node.xpath(
                './div[2]/div/span[2]/text()').extract_first()
            # Swap non-breaking spaces for plain ones, then trim the ends.
            raw_info = info_node.xpath('./div[2]/p[1]/text()').extract_first()
            item['info'] = raw_info.replace('\xa0', ' ').strip()
            item['desc'] = info_node.xpath(
                './div[2]/p[2]/span/text()').extract_first()
            yield item
Ejemplo n.º 18
0
 def parse(self, response):
     """Parse a Douban book-tag listing page into DoubanItems."""
     for entry in response.css("#subject_list ul li"):
         item = DoubanItem()
         item['title'] = entry.css(".info h2 a::text").get().strip()
         # The .pub line reads "author / publisher / date / price"; author
         # names may themselves contain slashes, so everything before the
         # last three fields is treated as the author.
         fields = entry.css(".info .pub::text").get().strip().split(' / ')
         item['author'] = '/'.join(fields[:-3])
         item['pub'] = ''.join(fields[-3:-2])
         item['date'] = ''.join(fields[-2:-1])
         item['price'] = ''.join(fields[-1:])
         item['rating_nums'] = entry.css(
             ".info .star span.rating_nums::text").get()
         # Comment count like "(1234人评价)" with the parentheses removed.
         raw_comments = entry.css(".info .star span.pl::text").get()
         item['comment_nums'] = raw_comments.strip().strip('()')
         item['introduction'] = entry.css(".info p::text").get()
         yield item
Ejemplo n.º 19
0
    def parse(self, response):
        """Parse one AJAX page of subjects and schedule the next page."""
        subjects = json.loads(response.body.decode())["subjects"]
        for subject in subjects:
            item = DoubanItem()
            item["name"] = subject["title"]
            item["img_link"] = subject["cover"]
            item["score"] = subject["rate"]

            # Hand the partially-filled item to the detail parser via meta.
            yield scrapy.Request(url=subject["url"],
                                 callback=self.parse_details,
                                 meta={"item": item})

        # Bump the trailing "...start=N" query value by 20 for the next
        # AJAX page. NOTE(review): there is no stop condition here.
        prefix, _, start = response.request.url.rpartition("=")
        next_url = "=".join([prefix, str(int(start) + 20)])
        time.sleep(random.random() * 2)
        yield scrapy.Request(url=next_url, callback=self.parse)
Ejemplo n.º 20
0
    def parse(self, response):
        """Collect image name/link pairs, then follow the next-page link."""
        for img in response.xpath("//div[@class='pic']//img"):
            item = DoubanItem()
            item['imagename'] = img.xpath("./@alt")[0].extract()
            item['imagelink'] = img.xpath("./@src")[0].extract()

            yield item

        # if self.offset < 225:
        #     self.offset += 25
        #     newurl = self.baseurl + str(self.offset)
        #     yield scrapy.Request(newurl, callback = self.parse)

        # Pagination via the <link> element inside the "next" span.
        next_href = response.xpath("//span[@class = 'next']/link/@href")
        if next_href:
            newurl = self.baseurl + next_href.extract()[0]
            yield scrapy.Request(newurl, callback=self.parse)
Ejemplo n.º 21
0
    def parse_item(self, response):
        """Yield name/score/info/desc for each movie on a top-250 page."""
        for info_div in response.xpath('//div[@class="info"]'):
            item = DoubanItem()

            item['name'] = info_div.xpath(
                './div[1]/a/span[1]/text()').extract_first()
            item['score'] = info_div.xpath(
                './div[2]/div/span[2]/text()').extract_first()
            # Concatenate the stripped fragments of the credits line.
            item['info'] = ''.join(
                part.strip()
                for part in info_div.xpath('./div[2]/p[1]/text()').extract())
            item['desc'] = info_div.xpath(
                './div[2]/p[2]/span/text()').extract_first()

            yield item
Ejemplo n.º 22
0
    def parse_item(self, response):
        """Extract per-movie fields from every div.info node on the page."""
        for node in response.xpath('//div[@class="info"]'):
            item = DoubanItem()

            item['name'] = node.xpath(
                './div[1]/a/span[1]/text()').extract_first()
            item['score'] = node.xpath(
                './div[2]/div/span[2]/text()').extract_first()
            # Join the stripped credit-line fragments and drop NBSPs.
            credit_parts = node.xpath('./div[2]/p[1]/text()').extract()
            joined = ''.join(part.strip() for part in credit_parts)
            item['info'] = joined.replace('\xa0', '')
            item['desc'] = node.xpath(
                './div[2]/p[2]/span/text()').extract_first()

            # Hand the finished item back to the engine.
            yield item
Ejemplo n.º 23
0
 def parse(self, response):
     """Yield (title, href) per listing row, then follow pagination."""
     sel = scrapy.Selector(response)
     for row in sel.xpath('//tr[@class="item"]'):
         item = DoubanItem()
         item['title'] = row.xpath('./td[1]/a/@title').extract_first()
         item['href'] = row.xpath('./td[1]/a/@href').extract_first()
         yield item

     # Relative "next" link; urljoin makes it absolute. No callback given,
     # so Scrapy re-enters this parse method by default.
     next_href = response.xpath(
         '//span[@class="next"]/a/@href').extract_first()
     if next_href:
         yield scrapy.Request(response.urljoin(next_href))
Ejemplo n.º 24
0
 def parse_article(self, response):
     """Grab the movie name and comments URL, then fetch the comments page."""
     sel = Selector(response)
     name = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
     comments_href = sel.xpath(
         '//div[@id="comments-section"]/div/h2/span/a/@href').extract()[0]

     item = DoubanItem()
     item['movie_name'] = name
     item['comment_link'] = comments_href

     # Pass the partial item through meta; the request carries a
     # placeholder session cookie for the comments page.
     session_cookie = {
         'name': 'COOKIE_NAME',
         'value': 'VALUE',
         'domain': '.douban.com',
         'path': '/'
     }
     yield Request(comments_href,
                   meta={'item': item},
                   callback=self.parse_item,
                   cookies=[session_cookie])
Ejemplo n.º 25
0
    def parsr_detarl(self, response):
        """Parse a movie detail page into a fully populated DoubanItem.

        (Method name kept as-is — callers reference this spelling.)
        """
        # print(2222222222222222222222,response.request.headers['User-Agent'])

        # Tiny local helpers: first text match / joined text matches.
        first = lambda q: response.xpath(q).extract_first()
        joined = lambda q, sep: sep.join(response.xpath(q).extract())

        item = DoubanItem()
        print('+++++++++++++++', response.url)
        item['movie_name'] = first('//*[@id="content"]/h1/span[1]/text()')
        item['movie_url'] = response.url
        item['director'] = first('//*[@id="info"]/span[1]/span[2]/a/text()')
        item['scripter'] = joined(
            '//*[@id="info"]/span[2]/span[2]/a/text()', ",")
        item['octor'] = joined(
            '//*[@id="info"]/span[3]/span[2]/span/a/text()', ",")
        item['style'] = joined(
            '//*[@id="info"]/span[@property="v:genre"]/text()', ',')
        # Positional text nodes of #info: country, language, alt title.
        item['create_country'] = first('//*[@id="info"]/text()[8]')
        item['language'] = first('//*[@id="info"]/text()[10]')
        item['show_date'] = joined(
            '//*[@id="info"]/span[@property="v:initialReleaseDate"]/text()',
            ',')
        item['longer'] = first(
            '//*[@id="info"]/span[@property="v:runtime"]/text()')
        item['other_name'] = first('//*[@id="info"]/text()[17]')
        item['desc'] = joined(
            '//*[@id="link-report"]/span[@class="all hidden"]/text()', '')

        yield item
Ejemplo n.º 26
0
    def parse(self, response):
        """Yield movie items from this page, then request the next offset.

        NOTE(review): the offset wraps back to 0 after 225 and the
        follow-up request uses dont_filter=True, so the spider cycles the
        pages forever — preserved as-is.
        """
        print("response.url================================",response.url)

        for node in response.xpath('//div[@class="info"]'):
            print("--"*100)
            item = DoubanItem()
            # Field spelled "tilte" (sic) to match the DoubanItem definition.
            item["tilte"] = node.xpath(
                './/span[@class="title"][1]/text()').extract()[0]
            item["content"] = node.xpath(
                './/div[@class="bd"]/p/text()').extract()[0]
            item["score"] = node.xpath(
                './/div[@class="star"]/span[2]/text()').extract()[0]
            # One-line quote; an empty list is stored unchanged.
            quote = node.xpath('.//p[@class="quote"]/span/text()').extract()
            item["info"] = quote[0] if quote else quote

            yield item

        # Advance the paging offset, wrapping around past 225.
        if self.offset < 225:
            self.offset += 25
        else:
            self.offset = 0

        next_url = self.url + str(self.offset)
        print("next_url=====",next_url)

        yield scrapy.Request(next_url,callback=self.parse,headers=self.headers,dont_filter=True)
Ejemplo n.º 27
0
    def parse(self, response):
        item = DoubanItem()
        movies = response.xpath("//div[@class='info']")
        print "-----"
        for each in movies:
            item['title'] = each.xpath(
                ".//span[@class='title'][1]/text()").extract()[0]
            bd = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
            item['star'] = each.xpath(
                ".//span[@class='rating_num']/text()").extract()[0]
            quote = each.xpath(".//p[@class='quote']/span/text()").extract()
            item['bd'] = "".join(bd).strip()
            if len('quote') != 0:
                item['quote'] = quote[0]

            yield item
            print "----"
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse)
Ejemplo n.º 28
0
 def parse_item(self, response):
     """Build an item per movie card, then follow its detail page."""
     for movie in response.xpath('//div[@class="item"]'):
         item = DoubanItem()
         item['name'] = movie.xpath(
             './div[2]/div[1]/a/span[1]/text()').extract()[0]
         item['image'] = movie.xpath('./div[1]/a/img/@src').extract()[0]
         item['score'] = movie.xpath(
             './div[2]/div[2]/div/span[2]/text()').extract()[0]
         item['info'] = movie.xpath(
             './div[2]/div[2]/p[2]/span/text()').extract()[0]
         # Credits line: strip fragments, join, drop non-breaking spaces.
         credit_bits = movie.xpath('./div[2]/div[2]/p[1]/text()').extract()
         item['ower'] = ''.join(
             bit.strip() for bit in credit_bits).replace('\xa0', '')
         detail_url = movie.xpath('./div[1]/a/@href').extract()[0]
         print(detail_url, '++++++++++++++++++++++')
         # Carry the half-built item to the detail callback via meta.
         yield scrapy.Request(detail_url,
                              callback=self.parse_detail,
                              meta={'mymeta': item})
Ejemplo n.º 29
0
    def parse(self, response):
        """Yield a name item per movie, then follow the "next" page link.

        Fix: the ``next_url != None`` comparison is replaced with the
        idiomatic ``is not None`` (PEP 8 E711); behavior is identical
        here since extract_first() returns a str or None.
        """
        #print(response.request.headers['User-Agent'])

        movie_list = response.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div[2]')

        print(len(movie_list))

        for movie in movie_list:
            item = DoubanItem()
            item['name'] = movie.xpath(
                './div[1]/a/span[1]/text()').extract_first()
            print(item)
            yield item

        next_url = response.xpath(
            '//*[@id="content"]/div/div[1]/div[2]/span[3]/a/@href'
        ).extract_first()
        if next_url is not None:
            next_url = response.urljoin(next_url)
            # No callback given: Scrapy defaults back to self.parse.
            yield scrapy.Request(url=next_url)
Ejemplo n.º 30
0
    def parse(self, response):
        """Yield rank/title/link/star/rate/quote per movie; then paginate."""
        for card in response.xpath('//div[@class="item"]'):
            item = DoubanItem()
            item['rank'] = card.xpath('div[@class="pic"]/em/text()').extract()
            item['title'] = card.xpath(
                'div[@class="pic"]/a/img/@alt').extract()
            item['link'] = card.xpath('div[@class="pic"]/a/@href').extract()
            # Credits line is reduced to one stripped string, unlike the
            # other fields which stay as extracted lists.
            item['star'] = card.xpath(
                'div[@class="info"]/div[@class="bd"]/p[1]/text()'
            ).extract()[0].strip()
            item['rate'] = card.xpath(
                'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span/text()'
            ).extract()
            item['quote'] = card.xpath(
                'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'
            ).extract()

            yield item

        # Follow the "next" link when present.
        next_sel = response.xpath('//span[@class="next"]/a/@href')
        if next_sel:
            yield scrapy.Request(response.urljoin(next_sel[0].extract()),
                                 self.parse)