Ejemplo n.º 1
0
    def parse_item(self, response):
        """Read the fields of one book detail page from ``response``.

        Computes the title, rating, number of raters, cover link, first
        author, publisher and (optional) original title. In the lines shown
        here the values are only held in locals; nothing is copied onto
        ``item`` yet.

        Raises:
            IndexError: if a mandatory node (title, rating, cover, info
                block, author link) is missing from the page.
            AttributeError: if the publisher label is not found.
        """
        item = DoubanbookItem()
        selector = Selector(response)
        name = selector.xpath(
            '//div[@id="wrapper"]/h1/span/text()').extract()[0]
        star = selector.xpath(
            '//strong[@class="ll rating_num "]/text()').extract()[0].strip()
        ratingPeople = selector.xpath(
            '//a[@class="rating_people"]/span/text()').extract()[0]
        bookImg = selector.xpath('//a[@class="nbg"]/@href').extract()[0]

        # Work on the extracted markup as text throughout instead of encoding
        # it to UTF-8 bytes and decoding every regex result back again.
        book_info = selector.xpath('//div[@id="info"]').extract()[0]
        author = re.findall(u'<a class="" href=".*?">(.*?)</a>',
                            book_info)[0]
        press = re.search(u'出版社:</span>(.*?)<br',
                          book_info).group(1).strip()
        # The original title is optional: test the match explicitly rather
        # than swallowing every exception.
        original_match = re.search(u'原作名:</span>(.*?)<br', book_info)
        original = original_match.group(1).strip() if original_match else ''
Ejemplo n.º 2
0
 def parse_item(self, response):
     """Yield one detail-page request for every linked entry of a list page.

     Each request carries a partially filled ``DoubanbookItem`` in
     ``meta["item"]`` and is routed to ``self.parse_detail``. Entries
     without a link are skipped.
     """
     # (item field, XPath relative to the entry's container div)
     text_fields = (
         ('bookName', './div[4]/a/text()'),
         ('author', './div[6]/text()[1]'),
         ('score', './div[5]/span[2]/text()'),
         ('scoreCount', './div[5]/span[3]/text()'),
         ('publishCompany', './div[6]/text()[2]'),
         ('publishTime', './div[6]/text()[3]'),
     )
     sel = Selector(response)
     divs = sel.xpath(
         '//div[@class="doulist-item"]/div[@class="mod"]/div[1]')
     for div in divs:
         # extract() converts the matched @href nodes to plain strings; a
         # Selector object is neither a valid Request URL nor a useful
         # item value.
         links = div.xpath('./div[4]/a/@href').extract()
         if not links:
             continue
         t_link = links[0]
         item = DoubanbookItem()
         for field, path in text_fields:
             item[field] = [
                 text.encode("utf-8") for text in div.xpath(path).extract()
             ]
         item['link'] = t_link
         yield scrapy.Request(t_link,
                              callback=self.parse_detail,
                              meta={"item": item})
Ejemplo n.º 3
0
    def parse(self, response):
        """Yield one item per book on a doulist page, then follow pagination.

        Author, publisher and publish date are pulled out of the entry's
        abstract text through ``self.__match``.
        """
        for book_selector in response.xpath(
                '//div[@class="bd doulist-subject"]'):
            # A fresh item per book: re-using one instance would make every
            # yielded item alias the same, repeatedly overwritten object.
            book = DoubanbookItem()
            book['book_src'] = book_selector.xpath(
                'div[@class="post"]/a/img/@src').extract_first()
            # default='' keeps one incomplete entry from aborting the page
            # with AttributeError on None.
            book['book_title'] = book_selector.xpath(
                'div[@class="title"]/a/text()').extract_first(
                    default='').strip(' \n')
            book['book_href'] = book_selector.xpath(
                'div[@class="title"]/a/@href').extract_first()
            book['book_rating_nums'] = book_selector.xpath(
                'div[@class="rating"]/span[2]/text()').extract_first()
            book['book_rating_counting'] = book_selector.xpath(
                'div[@class="rating"]/span[3]/text()').extract_first(
                    default='').strip('()')
            book_abstract = book_selector.xpath(
                'div[@class="abstract"]').xpath('string(.)').extract_first(
                    default='')
            book['book_author'] = self.__match("作者:(.*?)\n", book_abstract)
            book['book_publisher'] = self.__match("出版社:(.*?)\n", book_abstract)
            book['book_publish_date'] = self.__match("出版年:(.*?)\n",
                                                     book_abstract)
            yield book

        # Pagination is requested once per page, after all books, instead of
        # once per book inside the loop.
        next_page = response.xpath('//div[@class="paginator"]').xpath(
            'span[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse)
Ejemplo n.º 4
0
    def parse(self, response):
        """Yield title/rate/author items for a doulist page, then paginate.

        Missing title, rating or abstract text falls back to an empty string
        so that one incomplete entry does not abort the whole page.
        """
        selector = scrapy.Selector(response)
        for x in selector.xpath('//div[@class="bd doulist-subject"]'):
            titles = x.xpath('div[@class="title"]/a/text()').extract()
            rates = x.xpath(
                'div[@class="rating"]/span[@class="rating_nums"]/text()'
            ).extract()
            abstract = re.search('<div class="abstract">(.*?)<br',
                                 x.extract(), re.S)

            title = titles[0] if titles else ''
            author = abstract.group(1) if abstract else ''

            # One item object per book; a shared instance would be mutated
            # after it has already been yielded.
            item = DoubanbookItem()
            item["title"] = title.replace(' ', '').replace('\n', '')
            item["rate"] = rates[0] if rates else ''
            item["author"] = author.replace(' ', '').replace('\n', '')
            yield item

        # Request the next page once per response rather than once per book.
        nextpage = selector.xpath(
            '//span[@class="next"]/link/@href').extract()
        if nextpage:
            next_url = nextpage[0]
            print(next_url)
            yield scrapy.http.Request(next_url, callback=self.parse)
Ejemplo n.º 5
0
    def parse(self, response):
        """Yield numbered title/author/rate items, then follow the next page.

        ``self.count`` supplies the running item number and is incremented
        once per yielded item. Missing nodes fall back to the string
        ``"null"``.
        """
        books = response.xpath('//div[@class="bd doulist-subject"]')

        for each_book in books:
            title = each_book.xpath(
                'div[@class="title"]/a/text()').extract_first("null").replace(
                    "\n", "").replace(" ", "")
            author = each_book.xpath(
                'div[@class="abstract"]/text()').extract_first("null").replace(
                    "\n", "").replace(" ", "").replace('\"', "")
            rate = each_book.xpath(
                'div[@class="rating"]/span[@class="rating_nums"]/text()'
            ).extract_first("null")

            # Keep only what follows the last colon. Text without a colon
            # (including the "null" fallback) is kept as is instead of
            # raising AttributeError on a failed match.
            labelled = re.match(r'.+:(.*)', author)
            if labelled:
                author = labelled.group(1)

            # A new item per book so earlier yields are never overwritten.
            item = DoubanbookItem()
            item['num'] = self.count
            item['title'] = title
            item['author'] = author
            item['rate'] = rate
            self.count += 1

            yield item

        next_page = response.xpath(
            '//span[@class="next"]/a/@href').extract_first("")
        if next_page:
            print(next_page)
            yield scrapy.http.Request(next_page, callback=self.parse)
Ejemplo n.º 6
0
 def parse(self, response):
     """Print and yield title/rate/author for each book, then paginate.

     Missing title, rating or abstract text falls back to an empty string.
     """
     selector = scrapy.Selector(response)
     for each in selector.xpath('//div[@class="bd doulist-subject"]'):
         t = each.xpath('div[@class="title"]/a/text()').extract()
         title = t[0].replace(' ', '').replace('\n', '') if t else ''
         r = each.xpath(
             'div[@class="rating"]/span[@class="rating_nums"]/text()'
         ).extract()
         rate = r[0] if r else ""
         found = re.search('<div class="abstract">(.*?)<br',
                           each.extract(), re.S)
         author = found.group(1) if found else ''
         author = author.replace(' ', '').replace('\n', '')

         # Single-argument print(...) is valid on both Python 2 and 3.
         print('Title:' + title)
         print('Rate:' + rate)
         print(author)
         print('')

         # New item per book; a shared instance would be overwritten after
         # it has been handed to the engine.
         item = DoubanbookItem()
         item['title'] = title
         item['rate'] = rate
         item['author'] = author
         yield item

     # One pagination request per page, not one per book.
     nextPage = selector.xpath(
         '//span[@class="next"]/link/@href').extract()
     if nextPage:
         yield scrapy.http.Request(nextPage[0], callback=self.parse)
Ejemplo n.º 7
0
    def parse(self, response):
        """Yield one title/rate/author item per book and follow the next page.

        Missing title, rating or abstract text falls back to an empty string.
        """
        selector = scrapy.Selector(response)
        books = selector.xpath('//div[@class="bd doulist-subject"]')
        for each in books:
            # extract() turns the matched nodes into a list of strings.
            title_texts = each.xpath('div[@class="title"]/a/text()').extract()
            rate_texts = each.xpath(
                'div[@class="rating"]/span[@class="rating_nums"]/text()'
            ).extract()
            abstract = re.search('<div class="abstract">(.*?)<br',
                                 each.extract(), re.S)

            title = title_texts[0] if title_texts else ''
            author = abstract.group(1) if abstract else ''

            # Build a separate item for every book so that items already
            # yielded are not mutated by later iterations.
            item = DoubanbookItem()
            item['title'] = title.replace(' ', '').replace('\n', '')
            item['rate'] = rate_texts[0] if rate_texts else ''
            item['author'] = author.replace(' ', '').replace('\n', '')

            yield item

        # Follow pagination once, after all books on this page.
        nextPage = selector.xpath(
            '//span[@class="next"]/link/@href').extract()
        if nextPage:
            next_url = nextPage[0]
            print(next_url)
            yield scrapy.http.Request(next_url, callback=self.parse)
Ejemplo n.º 8
0
 def parse(self, response):
     """Yield one title/rate/artist item per list entry, then paginate.

     Missing title, rating or abstract text falls back to an empty string.
     """
     selector = scrapy.Selector(response)
     albums = selector.xpath('//div[@class="bd doulist-subject"]')
     for each in albums:
         titles = each.xpath('div[@class="title"]/a/text()').extract()
         rates = each.xpath(
             'div[@class="rating"]/span[@class="rating_nums"]/text()'
         ).extract()
         found = re.search('<div class="abstract">(.*?)<br',
                           each.extract(), re.S)

         title = titles[0] if titles else ''
         artist = found.group(1) if found else ''

         # A fresh item per entry avoids yielding one shared, repeatedly
         # overwritten object.
         item = DoubanbookItem()
         item['title'] = title.replace(' ', '').replace('\n', '')
         item['rate'] = rates[0] if rates else ''
         item['artist'] = artist.replace(' ', '').replace('\n', '')
         # Single-argument print(...) is valid on both Python 2 and 3.
         print('')
         yield item

     # Request the next page once per response instead of once per entry.
     nextPage = selector.xpath(
         '//span[@class="next"]/link/@href').extract()
     if nextPage:
         next_url = nextPage[0]
         print(next_url)
         yield scrapy.http.Request(next_url, callback=self.parse)
Ejemplo n.º 9
0
    def parse_item(self, response):
        """Build a single item (title, author, info, url) from a detail page.

        Raises:
            IndexError: if the title node is not present.
        """
        item = DoubanbookItem()

        # Title
        headings = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()').extract()
        item['title'] = headings[0]

        # Number: the text after the last colon of the title's last
        # space-separated word (stored in the 'author' field).
        last_word = item['title'].split(' ')[-1]
        item['author'] = last_word.split(":")[-1]

        # Body text: try the rule for pages with pictures first, and fall
        # back to the rule for picture-less pages when it matches nothing.
        paragraphs = response.xpath(
            '//div[@class="contentext"]/text()').extract()
        if not paragraphs:
            paragraphs = response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()
        item['info'] = "".join(paragraphs).strip()

        # Link
        item['url'] = response.url

        yield item
Ejemplo n.º 10
0
    def parse(self, response):
        """Yield one item per ``div.info`` block, then request the next page.

        Every field holds the first text match of its XPath, or ``None``
        when nothing matches.
        """
        # (item field, XPath relative to the info block), in assignment order
        field_paths = (
            ('title', './/div[@class="title"]/a/text()'),
            ('author', './/span[1]/span[2]/a/text()'),
            ('category', './/span[@itemprop="genre"]/text()'),
            ('rate', './/span[@class="rating-average"]/text()'),
            ('count', './/a[@class="ratings-link"]/span/text()'),
            ('brief', './/div[@class="article-desc-brief"]/text()'),
        )

        for info in response.xpath('//div[@class="info"]'):
            item = DoubanbookItem()
            for field, path in field_paths:
                item[field] = info.xpath(path).extract_first()
            yield item

        next_temp_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if not next_temp_url:
            return
        # The href may be relative, so resolve it against the current page.
        yield scrapy.Request(response.urljoin(next_temp_url),
                             callback=self.parse)
Ejemplo n.º 11
0
    def parse(self, response):
        """Yield a star/vote/short item for each comment on the page.

        A field whose node is missing becomes an empty string instead of
        raising ``AttributeError`` and dropping the rest of the page.
        """
        print(response.url)
        commonitems = Selector(
            response=response).xpath('//li[@class="comment-item"]')
        for ci in commonitems:
            star = ci.xpath(
                './div[@class="comment"]/h3/span[@class="comment-info"]/span[1]/@title'
            ).extract_first(default='').strip()
            vote = ci.xpath(
                './div[@class="comment"]/h3/span[@class="comment-vote"]/span[@class="vote-count"]/text()'
            ).extract_first(default='').strip()
            short = ci.xpath(
                './div[@class="comment"]/p[@class="comment-content"]/span[@class="short"]/text()'
            ).extract_first(default='').strip()

            # DoubanbookItem is defined in items.py
            item = DoubanbookItem()
            item['star'] = star
            item['vote'] = vote
            item['short'] = short
            yield item
Ejemplo n.º 12
0
    def parse(self, response):
        """Print and yield title/rate/author for each book, then paginate.

        Missing title, rating or abstract text falls back to an empty string.
        """
        selector = scrapy.Selector(response)
        books = selector.xpath('//div[@class="bd doulist-subject"]')
        for each in books:
            titles = each.xpath('div[@class="title"]/a/text()').extract()
            rates = each.xpath(
                'div[@class="rating"]/span[@class="rating_nums"]/text()'
            ).extract()
            found = re.search('<div class="abstract">(.*?)<br',
                              each.extract(), re.S)

            title = titles[0] if titles else u''
            rate = rates[0] if rates else u''
            author = found.group(1) if found else u''

            title = title.replace(' ', '').replace('\n', '')
            author = author.replace(' ', '').replace('\n', '')

            # Single-argument print(...) runs on Python 2 and 3 alike; the
            # label and value are joined so the output line is unchanged.
            print(u'标题: ' + title)
            print(u'评分: ' + rate)
            print(author)
            print('')

            # A new item per book; Scrapy then takes care of the item.
            item = DoubanbookItem()
            item['title'] = title
            item['rate'] = rate
            item['author'] = author
            yield item

        # Crawl the following pages: one request per response rather than
        # one per book.
        nextPage = selector.xpath(
            '//span[@class="next"]/link/@href').extract()
        if nextPage:
            next_url = nextPage[0]
            print(next_url)
            yield scrapy.http.Request(next_url, callback=self.parse)
 def parse_2(self, response):
     """Yield name and rating for books whose rounded rating is at least 7.

     Entries whose rating text is missing, blank or not a number are
     skipped instead of raising.
     """
     for book in response.css('#wrapper'):
         rate_text = book.xpath(
             './/*[@id="interest_sectl"]/div/div[2]/strong/text()'
         ).extract_first()
         if rate_text is None:
             continue
         try:
             rating = float(rate_text.strip())
         except ValueError:
             # Blank or non-numeric rating text.
             continue
         if round(rating) >= 7:
             item = DoubanbookItem()
             item['rate_num'] = rate_text
             item['name'] = book.xpath('.//h1/span/text()').extract_first()
             yield item
Ejemplo n.º 14
0
 def parse_item(self, response):
     """Read name, URL and author of a book detail page into an item.

     The author is taken from the first direct ``<a>`` child of ``#info``;
     when there is none, the link inside the first ``<span>`` is used and
     ``type_out`` is set to False.

     Raises:
         IndexError: if the name, or both author locations, are missing.
     """
     sel = Selector(response)
     item = DoubanbookItem()
     item['name'] = sel.xpath('//*[@id="wrapper"]/h1/span/text()').extract()[0]
     item['url'] = response.url

     # Test for the primary layout explicitly instead of catching every
     # exception with the Python-2-only "except Exception, e" form.
     authors = sel.xpath('//*[@id="info"]/a[1]/text()').extract()
     type_out = bool(authors)
     if not authors:
         authors = sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()
     item['author'] = authors[0]
Ejemplo n.º 15
0
    def parse(self, response):
        """Yield href/title items for the listed subjects, then paginate.

        The follow-up request has no explicit callback.
        """
        anchors = response.xpath('//*[@id="subject_list"]/ul/li/div[2]/h2/a')
        for anchor in anchors:
            item = DoubanbookItem()
            item['href'] = anchor.xpath('@href').extract_first()
            item['title'] = anchor.xpath('@title').extract_first()
            yield item

        next_href = response.xpath(
            '//*[@class="next"]/a/@href').extract_first()
        if next_href is None:
            return
        # Resolve the href against the current page before requesting it.
        yield scrapy.Request(response.urljoin(next_href))
Ejemplo n.º 16
0
 def parse_data(self, response):
     """Build and yield the item for one book detail page.

     The book URL is read from ``response.meta['url']``.

     Raises:
         IndexError: if any of the scraped nodes is missing.
         KeyError: if the request meta carries no ``'url'`` entry.
     """
     item = DoubanbookItem()
     item['book_name'] = response.xpath(
         '//*[@id="wrapper"]/h1/span/text()').extract()[0]
     item['auth_name'] = response.xpath(
         '//*[@id="info"]/span[1]/a/text()').extract()[0]
     item['book_url'] = response.meta['url']
     item['pic_url'] = response.xpath(
         '//*[@id="mainpic"]/a/img/@src').extract()[0]
     item['rate'] = response.xpath(
         '//*[@id="interest_sectl"]/div/div[@class="rating_self clearfix"]/strong/text()'
     ).extract()[0]
     item['rate_num'] = response.xpath(
         '//span[@property="v:votes"]/text()').extract()[0]
     # Progress line only. 'replace' stops a title containing characters
     # outside GBK from raising UnicodeEncodeError and losing the item.
     # Single-argument print(...) parses on both Python 2 and 3.
     print("----------- Current book:%s-----------" %
           item['book_name'].encode('gbk', 'replace'))
     yield item
Ejemplo n.º 17
0
    def parse(self, response):
        """Yield a follow-up request for every ``div.pl2`` entry on the page.

        Each request targets the entry's link, carries an item holding the
        title and link in ``meta['item']`` and is handled by ``self.parse2``.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        for entry in soup.find_all('div', attrs={'class': 'pl2'}):
            # DoubanbookItem is defined in items.py
            item = DoubanbookItem()
            anchor = entry.find('a')
            link = anchor.get('href')
            item['title'] = anchor.get('title')
            item['link'] = link

            yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
Ejemplo n.º 18
0
    def parse(self, response):
        """Request the full-text URL of every review id found on the page.

        Ids come from the ``data-cid`` attributes; each request carries an
        item holding the URL in ``meta['item']`` and is handled by
        ``self.parse2``.
        """
        tree = etree.HTML(response.text)
        review_ids = tree.xpath(
            '//*[@id="content"]/div/div[1]/div[1]/div/@data-cid')
        for review_id in review_ids:
            full_review_url = ('https://book.douban.com/j/review/' +
                               review_id + '/full')
            # DoubanbookItem is defined in items.py
            item = DoubanbookItem()
            item['url'] = full_review_url

            request_meta = {'item': item}
            yield scrapy.Request(url=full_review_url,
                                 meta=request_meta,
                                 callback=self.parse2)
Ejemplo n.º 19
0
 def parse(self, response):
     """Yield one item per book from the parallel name/author/picture lists.

     The lists come from ``self.get_name``, ``self.get_author`` and
     ``self.get_picurl``. They are walked in lockstep, so iteration stops
     at the shortest list.
     """
     book_name = self.get_name(response)
     author_name = self.get_author(response)
     pic_url = self.get_picurl(response)
     rows = zip(book_name, author_name, pic_url)
     for i, (name, author, picture) in enumerate(rows):
         # Single-argument print(...) is valid on both Python 2 and 3.
         print("----------- Current num:%d -----------" % i)
         # A new item per row; a shared instance would be overwritten
         # after it has already been yielded.
         item = DoubanbookItem()
         item['book_name'] = name
         item['auth_name'] = author.replace('\n', '').replace('  ', '')
         item['pic_url'] = picture
         yield item
Ejemplo n.º 20
0
 def parse(self, response):
     """Yield the book item for this page, then requests for linked pages.

     All text fields have spaces and newlines removed. The writer comes
     from the first ``<span>`` link of ``#info`` and falls back to the
     first direct ``<a>`` child when that is empty.
     """
     selector = scrapy.Selector(response)

     def joined_text(xpath, separator=""):
         # Join every match of ``xpath`` and strip spaces and newlines.
         joined = separator.join(selector.xpath(xpath).extract())
         return joined.replace(' ', '').replace('\n', '')

     item = DoubanbookItem()
     item['name'] = joined_text("//*[@id=\"wrapper\"]/h1/span/text()")
     writer_case1 = joined_text("//*[@id=\"info\"]/span[1]/a/text()", "/")
     # Both writer variants use a single "/" separator. The previous
     # `"/" "/".join(...)` was implicit literal concatenation, i.e. "//".
     writer_case2 = joined_text("//*[@id=\"info\"]/a[1]/text()", "/")
     item['date'] = joined_text(
         "//*[@id=\"info\"]/span[contains(./text(),'出版年')]/following::text()[1]"
     )
     item['pagenum'] = joined_text(
         "//*[@id=\"info\"]/span[contains(./text(),'页数')]/following::text()[1]"
     )
     item['ISBN'] = joined_text(
         "//*[@id=\"info\"]/span[contains(./text(),'ISBN')]/following::text()[1]"
     )
     item['price'] = joined_text(
         "//*[@id=\"info\"]/span[contains(./text(),'定价')]/following::text()[1]"
     )
     item['tags'] = joined_text(
         "//*[@id=\"db-tags-section\"]/div/span/a/text()", ";")
     item['writer'] = writer_case1 or writer_case2
     yield item

     url_list = response.xpath(
         "//div[@class='content clearfix']/dl/dd/a/@href").extract()
     for url in url_list:
         yield scrapy.Request(url=url, callback=self.parse)
Ejemplo n.º 21
0
    def parse2(self, response):
        """Yield new short comments from a page, then request the next page.

        For every comment the text and its timestamp are looked up in the
        ``hlmshorts_new`` table through ``db.readtable``. As soon as one
        comment is already stored the generator returns, which also skips
        the pagination request below.
        """
        print('response.url: ', response.url)
        commonitems = Selector(
            response=response).xpath('//li[@class="comment-item"]')
        for ci in commonitems:
            short = ci.xpath(
                './div[@class="comment"]/p[@class="comment-content"]/span[@class="short"]/text()'
            ).extract_first().strip()
            shorttime = ci.xpath(
                './div[@class="comment"]//span[@class="comment-info"]/span[2]/text()'
            ).extract_first().strip()
            # Check whether this comment was already read; if so, return.
            # NOTE(review): the query is built by interpolating scraped text
            # into the SQL string, so a double quote in a comment breaks (or
            # alters) the statement. Use a parameterized query if
            # db.readtable supports one; its API is not visible here.
            sql = 'select count(*) from hlmshorts_new t where t.S_SHORTSTIME = "%s" and t.S_SHORTS = "%s"' % (
                shorttime, short)
            df = db.readtable(sql)
            # First cell of the result, i.e. the count(*) value.
            cnt = df.iat[0, 0]
            if cnt > 0:
                return

            star = ci.xpath(
                './div[@class="comment"]/h3/span[@class="comment-info"]/span[1]/@title'
            ).extract_first().strip()
            vote = ci.xpath(
                './div[@class="comment"]/h3/span[@class="comment-vote"]/span[@class="vote-count"]/text()'
            ).extract_first().strip()
            # DoubanbookItem is defined in items.py
            item = DoubanbookItem()
            item['star'] = star
            item['vote'] = vote
            item['short'] = short
            item['shorttime'] = shorttime
            yield item
        # Fetch the next page: the href of the paginator's last <li> link.
        nextpage1 = Selector(response=response).xpath(
            '//div[@class="paginator-wrapper"]/ul[@class="comment-paginator"]/li[last()]/a/@href'
        )
        if nextpage1:
            nextpage = nextpage1.extract_first().strip()
            print('nextpage: ', nextpage)
            # The href is appended to the spider's first start URL.
            url = f'{HongloumengSpider.start_urls[0]}{nextpage}'
            yield scrapy.Request(url=url, callback=self.parse2)
            # NOTE(review): this sleep runs when the generator is resumed
            # after the request was yielded and blocks the calling thread.
            # Scrapy's DOWNLOAD_DELAY setting is the usual way to throttle;
            # confirm before changing.
            time.sleep(5)
Ejemplo n.º 22
0
 def parse(self, response):
     """Yield one title/rate/author item per book on a doulist page.

     Missing title, rating or abstract text falls back to an empty string.
     """
     selector = scrapy.Selector(response)
     for each in selector.xpath('//div[@class="bd doulist-subject"]'):
         titles = each.xpath('div[@class="title"]/a/text()').extract()
         rates = each.xpath(
             'div[@class="rating"]/span[@class="rating_nums"]/text()'
         ).extract()
         found = re.search('<div class="abstract">(.*?)<br',
                           each.extract(), re.S)

         title = titles[0] if titles else ''
         author = found.group(1) if found else ''

         # One item per book, so that items already yielded are not
         # overwritten by later iterations.
         item = DoubanbookItem()
         item['title'] = title.replace(' ', '').replace('\n', '')
         item['rate'] = rates[0] if rates else ''
         item['author'] = author.replace(' ', '').replace('\n', '')
         yield item
Ejemplo n.º 23
0
    def parse_item(self, response):
        """Yield one item with the page's title, author, info and URL.

        ``title``, ``author`` and ``info`` are only set when their node is
        present; ``url`` is always set.
        """
        print("-------3-------")
        # Indentation is spaces only: the previous mix of tabs and spaces
        # is a TabError on Python 3.
        item = DoubanbookItem()

        title = response.xpath('//div[@class="article-profile-bd"]/h1/text()').extract()
        if title:
            item['title'] = title[0]

        author = response.xpath(' //div[@class="article-meta"]/p[@class="author"]//a/text()').extract()
        if author:
            item['author'] = author[0]

        info = response.xpath('//div[@class]/div[@class="info"]/p/text()').extract()
        if info:
            item['info'] = info[0]

        item['url'] = response.url

        yield item

    	
Ejemplo n.º 24
0
    def parse_item(self, response):
        """
            Parse the page data.

            Yields one item per entry; the number of entries is given by
            ``self.get_item_count(response)``.
        """
        total = self.get_item_count(response)
        # (item field, getter called as getter(response, index)), in
        # assignment order.
        getters = (
            ('book_name', self.get_book_name),              # book title
            ('info', self.get_info),                        # publication info
            ('intro', self.get_intro),                      # introduction
            ('grade', self.get_grade),                      # rating
            ('evaluate_number', self.get_evaluate_number),  # number of raters
            ('source_url', self.get_source_url),            # source link
        )
        for index in range(total):
            item = DoubanbookItem()
            for field, getter in getters:
                item[field] = getter(response, index)
            # Source name
            item['source_name'] = 'douban'

            yield item