Example #1
    def parse_item(self, response):
        ratings = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
        book = response.css('.product_page .product_main')

        stock = book.css('.instock').get()
        breadcrumb = response.css('.breadcrumb li')

        name = book.css('h1::text').get()
        price = float(
            re.search(r'\d+\.\d+',
                      book.css('.price_color::text').get())[0])
        available = book.css('.instock .icon-ok').get() is not None  # css() itself never returns None; check the first match
        quantity = int(re.search(r'\d+', stock)[0])
        rating = ratings[book.css('.star-rating::attr(class)').get().replace(
            'star-rating ', '')]
        category = breadcrumb[2].css('a::text').get()
        upc = response.css('.product_page table tr')[0].css('td::text').get()

        return BooksItem(name=name,
                         price=price,
                         quantity=quantity,
                         available=available,
                         rating=rating,
                         category=category,
                         url=response.url,
                         scrape_date=datetime.today().isoformat(),
                         upc=upc)
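A standalone sketch of the same price/quantity parsing, runnable outside Scrapy with parsel (the library behind Scrapy's selectors); the HTML snippet here is made up for illustration:

    import re
    from parsel import Selector

    html = '<p class="price_color">£51.77</p><p class="instock">In stock (22 available)</p>'
    sel = Selector(text=html)
    price = float(re.search(r'\d+\.\d+', sel.css('.price_color::text').get())[0])
    quantity = int(re.search(r'\d+', sel.css('.instock::text').get())[0])
    print(price, quantity)  # 51.77 22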
Example #2
 def parse(self, response):
     book_list = response.xpath('//ul[@class="subject-list"]/li[@class="subject-item"]/div[@class="info"]')
     for book in book_list:
         item = BooksItem()  # fresh item per book, so each yielded item is independent
         try:
             title = book.xpath('h2/a/text()').extract_first()
             pub = book.xpath('div[@class="pub"]/text()').extract_first().strip().split('/')
             price = pub.pop()
             date = pub.pop()
             publish = pub.pop()
             auther = '/'.join(pub)
             grade = book.xpath('div[@class="star clearfix"]/span[@class="rating_nums"]/text()').extract_first().strip()
             count = book.xpath('div[@class="star clearfix"]/span[@class="pl"]/text()').extract_first().strip()
             desc = book.xpath('p/text()').extract_first().strip()
             link = book.xpath('div[@class="ft"]/div[@class="ebook-link"]/a/@href').extract()
             item['book_name'] = title
             item['book_auther'] = auther
             item['book_grade'] = grade
             item['book_count'] = count
             item['book_publish'] = publish
             item['book_date'] = date
             item['book_price'] = price
             item['book_desc'] = desc
             item['book_link'] = link
             yield item
         except:
             pass
     nextpage = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').extract_first()
     if nextpage is not None:
         nextpage = response.urljoin(nextpage)
         yield scrapy.Request(nextpage)
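How the repeated pop() calls above unpack the slash-separated pub line, shown on a made-up sample string:

    pub = '[美] Jane Doe / 人民邮电出版社 / 2019-5 / 59.00元'
    pub = [s.strip() for s in pub.split('/')]
    price = pub.pop()       # '59.00元'
    date = pub.pop()        # '2019-5'
    publish = pub.pop()     # '人民邮电出版社'
    author = '/'.join(pub)  # whatever remains belongs to the author field
    print(author, publish, date, price)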
Example #3
    def parse(self, response):
        sel = Selector(response)
        book_list = sel.css('#subject_list > ul > li')
        for book in book_list:
            item = BooksItem()
            try:
                item['book_name'] = book.xpath(
                    'div[@class="info"]/h2/a/text()').extract()[0].strip()
                item['book_star'] = book.xpath(
                    'div[@class="info"]/div[2]/span[@class="rating_nums"]/text()'
                ).extract()[0].strip()
                item['book_pl'] = book.xpath(
                    'div[@class="info"]/div[2]/span[@class="pl"]/text()'
                ).extract()[0].strip()
                pub = book.xpath('div[@class="info"]/div[@class="pub"]/text()'
                                 ).extract()[0].strip().split('/')
                item['book_price'] = pub.pop()
                item['book_date'] = pub.pop()
                item['book_publish'] = pub.pop()
                item['book_author'] = '/'.join(pub)
                yield item
            except Exception as e:
                print(e)

        nextPage = sel.xpath(
            '//div[@id="subject_list"]/div[@class="paginator"]/span[@class="next"]/a/@href'
        ).extract_first()

        if nextPage:
            next_url = 'https://book.douban.com' + nextPage
            yield Request(next_url, callback=self.parse)

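On Scrapy 1.4+, the pagination step could also use response.follow, which resolves relative URLs itself; a minimal sketch assuming the same markup:

    next_page = response.css('span.next a::attr(href)').get()
    if next_page:
        yield response.follow(next_page, callback=self.parse)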
Example #4
    def parse_review_page(self, response):

        Title = response.xpath(
            '//div[@id="metacol"]/h1/text()').extract_first().strip()
        Author = response.xpath(
            '//div[@class="authorName__container"]/a/span/text()'
        ).extract_first()
        Score = response.xpath(
            '//div[@id="bookMeta"]/span/text()').extract_first().strip()

        Pages = response.xpath('//span[@itemprop="numberOfPages"]/text()'
                               ).extract_first().strip()
        Pages = int(re.findall(r'\d+', Pages)[0])

        Genre = response.xpath(
            '//a[@class="actionLinkLite bookPageGenreLink"]/text()').extract()
        Year = str(response.xpath('//div[@class="row"]//text()').extract())
        Year = int(re.findall(r'((200|201)\d+)', Year)[0][0])

        item = BooksItem()
        item['Title'] = Title
        item['Author'] = Author
        item['Score'] = Score
        item['Pages'] = Pages
        item['Genre'] = Genre
        item['Year'] = Year

        yield item
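A quick standalone check of the year regex above (the sample text is invented): with two groups, re.findall returns one (group 1, group 2) tuple per match, hence the [0][0] indexing.

    import re

    text = 'Published June 1st 2013 by Penguin'
    year = int(re.findall(r'((200|201)\d+)', text)[0][0])
    print(year)  # 2013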
Example #5
    def parse(self, response):
        divs = response.xpath("//div[@father='1']")[2:12]
        for div in divs:
            item = BooksItem()
            item["l_one"] = [
                x.strip() for x in div.xpath("./dl/dt//text()").extract()
                if len(x.strip()) > 0
            ][0]
            dls = div.xpath(".//dl[@class='inner_dl']")
            for dl in dls:

                item["l_two"] = [
                    x.strip() for x in dl.xpath("./dt/a/text()").extract()
                ]
                if not item["l_two"]:
                    item["l_two"] = dl.xpath(
                        "./dt[position()=1]//text()").extract_first().strip()
                else:
                    item["l_two"] = item["l_two"][0]

                a_list = dl.xpath("./dd/a")
                for a in a_list:
                    item["l_three"] = a.xpath("./text()").extract_first()
                    item['category_url'] = a.xpath("./@href").extract_first()
                    yield scrapy.Request(item['category_url'],
                                         callback=self.parse_detail,
                                         # deepcopy (from the copy module): the same
                                         # item is mutated on every pass, so each
                                         # request gets its own snapshot
                                         meta=deepcopy(item))
Example #6
    def parse(self, response):
        sel = Selector(response)
        book_list = sel.css('#subject_list>ul>li')
        for book in book_list:
            item = BooksItem()
            try:
                # strip() removes the given characters (whitespace by default) from both ends of a string
                item['book_name'] = book.xpath(
                    'div[@class="info"]/h2/a/text()').extract()[0].strip()

                item['book_star'] = book.xpath(
                    "div[@class='info']/div[2]/span[@class='rating_nums']/text()"
                ).extract()[0].strip()
                item['book_pl'] = book.xpath(
                    "div[@class='info']/div[2]/span[@class='pl']/text()"
                ).extract()[0].strip()
                pub = book.xpath('div[@class="info"]/div[@class="pub"]/text()'
                                 ).extract()[0].strip().split('/')
                item['book_price'] = pub.pop()
                item['book_date'] = pub.pop()
                item['book_publish'] = pub.pop()
                item['book_author'] = '/'.join(pub)
                yield item
            except:
                pass
        nextPage = sel.xpath(
            '//div[@id="subject_list"]/div[@class="paginator"]/span[@class="next"]/a/@href'
        ).extract_first()
        if nextPage:
            next_url = 'https://book.douban.com' + nextPage
            yield scrapy.http.Request(next_url, callback=self.parse)
Example #7
    def detail_parse(self, response):
        item = BooksItem()
        content = []
        selector = Selector(response)
        reads = selector.xpath('//div[@id="tab1"]/div/ol/li').extract()
        for read in reads:
            read = read.replace("<li>", "").replace("</li>",
                                                    "").replace("\r", " ")
            content.append(read)

        str_convert = ''.join(content)
        price = selector.xpath(
            '//*[@id="book_cart_box"]/div[2]/div/div[1]/text()'
        ).extract()

        item['title'] = response.meta["title"]
        item['price'] = price
        item['content'] = str_convert
        item['bookurl'] = response.meta["bookurl"]
        yield item

        nextLink = selector.xpath(
            '//div[@class="plain_page"]/div/span/span/a/@href').extract()
        # page 10 is the last page; there is no next-page link
        if nextLink:
            nextLink = nextLink[0]
            print(nextLink)
            yield Request(self.url + nextLink, callback=self.parse)
Example #8
 def parse(self, response):
     for book in response.xpath('//article[@class="product_pod"]'):
         items = BooksItem()  # create a fresh item for each book
         items['title'] = book.xpath('./h3/a/@title').extract_first()  # book title
         items['price'] = book.xpath('./div/p[@class="price_color"]/text()').extract_first()  # book price
         review = book.xpath('./p[1]/@class').extract_first()  # book rating class
         items['review'] = review.split(' ')[-1]
         self.Q.put(f"{items['title']}\n{items['price']}\n{items['review']}\n")
         yield items
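Creating the item inside the loop (as above) avoids the pitfall of mutating and re-yielding one shared instance; a minimal illustration with a plain dict:

    shared = {}
    collected = []
    for title in ['A', 'B']:
        shared['title'] = title
        collected.append(shared)  # every entry references the same object
    print(collected)              # [{'title': 'B'}, {'title': 'B'}]

    fresh = [{'title': t} for t in ['A', 'B']]
    print(fresh)                  # [{'title': 'A'}, {'title': 'B'}]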
Example #9
 def parse_book_detail(self, response):
     if not response.css(".h1").xpath('./a/text()').get():
         yield scrapy.Request(url=response.url, dont_filter=True)
         return  # retry the page instead of building a half-empty item
     item = BooksItem()
     item['title'] = response.xpath('//h1/text()').get()
     item['description'] = response.css('.product_page').xpath(
         './p/text()').get()
     item['price'] = response.css('.price_color::text').get()
     item['UPC'] = response.css('tr td::text').extract_first()
     item['rating'] = response.css('p.star-rating').xpath(
         './@class').get().split(' ')[1]
     yield item
Example #10
    def parse(self, response):
        lies = response.css('ol.row > li')
        for li in lies:
            item = BooksItem()  # fresh item per row
            item['title'] = li.xpath('article/div/a/img/@alt').extract_first()
            item['price'] = li.css('div.product_price p::text').extract_first()
            item['star'] = li.xpath('article/p/@class').extract_first()
            yield item

        next_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
Example #11
    def getinfo(self,response):
        item = BooksItem()
        bookname = response.xpath('//h1/text()').extract()[0]
        bookprice = response.xpath('//div/p/text()').extract()[0]
        # strip the currency symbol, keeping digits and the decimal point
        bookprice = ''.join(re.findall(r"[\d.]", bookprice))
        bookdescription = response.xpath('//article/p/text()').extract()[0]
        bookid = response.xpath('//tr[1]/td/text()').extract()[0]
        item['name'] = bookname
        item['price'] = bookprice
        item['des'] = bookdescription
        item['id'] = bookid
        yield item
Example #12
    def get_content(self,response):
        item = BooksItem()

        name = response.xpath("//span[@id='breadnav']/a[2]/text()").get()
        item["name"] = name

        zhangjie = response.xpath("//h1[@id='title']/text()").get()
        item["zhangjie"] = zhangjie

        zhangjie_content = ""
        contents = response.xpath("//div[@class='vcon']/p/text()").extract()
        for content in contents:
            zhangjie_content = zhangjie_content + content
            item["content"] = zhangjie_content

        yield item
Example #13
 def parse_item(self, response):
     item = BooksItem()
     # print(response.status)
     if response.status == 200:
             item['url'] = response.url
             item['name'] = response.xpath("//div[@id='wrapper']/h1/span/text()").extract_first()
             _info = response.xpath("//div[@id='info']//text()").extract()
             info = [s.strip() for s in _info if s.strip() != '']
             item['author'] = ''
             item['publishing'] = ''
             item['publish_time'] = ''
             item['page'] = ''
             item['price'] = ''
             item['ISBN'] = ''
             item['score'] = 0.0
             item['evaluation_num'] = 0
             if '作者' in info:
                 item['author'] = info[info.index('作者') + 2]
             if '作者:' in info:
                 item['author'] = info[info.index('作者:') + 1]
             if '出版社:' in info:
                 item['publishing'] = info[info.index('出版社:') + 1]
             if '出版年:' in info:
                 item['publish_time'] = info[info.index('出版年:') + 1]
             if '页数:' in info:
                 item['page'] = info[info.index('页数:') + 1]
             if '定价:' in info:
                 item['price'] = info[info.index('定价:') + 1]
             if 'ISBN:' in info:
                 item['ISBN'] = info[info.index('ISBN:') + 1]
             flag = response.xpath("//div[@class='rating_sum']/span/a/text()").extract()
             if flag:
                 if flag[0] == '人评价':
                     item['score'] = response.xpath("//div[@class='rating_self clearfix']/strong/text()")[0].extract().strip()
                     item['evaluation_num'] = response.xpath("//a[@class='rating_people']/span/text()")[0].extract()
             yield item
     else:
          print('*********************** something wrong ***********************')
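The repeated `if label in info` checks could collapse into a small helper; a sketch, not part of the original spider (the bare '作者' label, whose value sits two slots later, would still need its own case):

    def info_field(info, label, default=''):
        """Return the value that follows a label in the flattened #info text list."""
        try:
            return info[info.index(label) + 1]
        except ValueError:  # label absent from this page
            return default

    # e.g. item['publishing'] = info_field(info, '出版社:')
    #      item['ISBN'] = info_field(info, 'ISBN:')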
Example #14
 def category_parse(self, response):
     
     # Extracting data of each book from the page.
     for book in response.xpath("//section/div/ol[@class='row']/li"):
         loader = ItemLoader(item=BooksItem(), selector=book)
         loader.add_xpath('book', ".//article[@class='product_pod']/h3/a/@title")
         loader.add_xpath('price', ".//article[@class='product_pod']/div[@class='product_price']/p[@class='price_color']/text()")
         loader.add_xpath('image_url', ".//div[@class='image_container']//img//@src")
         loader.add_xpath('book_url', ".//div[@class='image_container']//a//@href")
                     
         yield loader.load_item()
         
     # Navigating to next page if it exists.
     next_page = response.xpath("//section/div/div//ul[@class='pager']/li[@class='next']/a/@href").extract_first()
     if next_page is not None:
         next_page_link = response.urljoin(next_page)
         yield scrapy.Request(url=next_page_link, callback=self.category_parse)
Example #15
 def book_parse(self, response: HtmlResponse):
     name = response.xpath('//h1/text()').extract_first()
     link = response.url
     author = response.xpath(
         '//div[@class = "authors"][1]/a/text()').extract()
     old_price = response.xpath(
         '//span[@class="buying-priceold-val-number"]/text()'
     ).extract_first()
     new_price = response.xpath(
         "//span[@class='buying-pricenew-val-number']/text()"
     ).extract_first()
     rating = response.xpath("//div[@id='rate']/text()").extract_first()
     yield BooksItem(name=name,
                     link=link,
                     author=author,
                     old_price=old_price,
                     new_price=new_price,
                     rating=rating)
Example #16
    def parse_book(self, response):

        nome = response.css('.product_main').css('h1::text').get()
        preco = response.css('.product_main').css(
            '.price_color::text').get().replace('£', '')

        disponivel_selector = response.css('.product_main').css(
            '.availability.instock')

        # css() always returns a SelectorList (never None); test it for emptiness,
        # and set defaults so the yield below cannot hit unbound names
        disponivel = False
        quantidade = 0
        if disponivel_selector:
            disponivel = True

            regex = re.compile(r'\d+')
            quantidade = regex.search(
                response.css('table tr:nth-child(6) td').get()).group()

        avaliacao = response.css('.star-rating').xpath("@class").get().replace(
            'star-rating ', '')

        if avaliacao == 'One':
            avaliacao = 1
        elif avaliacao == 'Two':
            avaliacao = 2
        elif avaliacao == 'Three':
            avaliacao = 3
        elif avaliacao == 'Four':
            avaliacao = 4
        else:
            avaliacao = 5

        categoria = response.css('.breadcrumb li:nth-child(3)').css(
            'a::text').get()
        UPC = response.css('table tr:nth-child(1) td::text').get()
        url = response.request.url

        yield BooksItem(nome=nome,
                        preco=float(preco),
                        disponivel=disponivel,
                        quantidade=int(quantidade),
                        avaliacao=float(avaliacao),
                        categoria=categoria,
                        UPC=UPC,
                        url=url,
                        data=datetime.now().isoformat())
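The if/elif ladder above could be the same dict lookup Example #1 uses; a sketch:

    RATINGS = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
    avaliacao = RATINGS.get(avaliacao, 5)  # default 5 mirrors the original else branch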
Example #17
 def book_parse(self, response: HtmlResponse):
     name = response.xpath(
         '//h1[@class="item-detail__title"]/text()').extract_first()
     link = response.url
     author = response.xpath(
         "//a[@class='item-tab__chars-link js-data-link']/text()").extract(
         )
     old_price = response.xpath(
         "//div[@class='item-actions__price-old']/text()").extract_first()
     new_price = response.xpath(
         "//div[@class='item-actions__prices']//b/text()").extract_first()
     rating = response.xpath(
         "//span[@class='rating__rate-value']/text()").extract_first()
     yield BooksItem(name=name,
                     link=link,
                     author=author,
                     old_price=old_price,
                     new_price=new_price,
                     rating=rating)
Example #18
 def parse(self, response):
     for info in response.css('.product_pod'):
         item = BooksItem()
         # print(info)
         item['name'] = info.css('h3>a::attr(title)').extract_first()
         # name = info.xpath('./h3/a/@title').extract_first()
         # print(name)
         item['price'] = info.css(
             '.product_price .price_color::text').extract_first()
         # price = info.xpath('//p[@class="price_color"]/text()').extract()
         # print(price)
         yield item
         bookstr = item['name'] + '\t' + item['price'] + '\n'
         self.f.write(bookstr)  # self.f is assumed to be opened elsewhere, e.g. in open_spider()
     le = LinkExtractor(restrict_css='ul.pager li.next')
     links = le.extract_links(response)
     if links:
         next_url = links[0].url
         yield scrapy.Request(next_url, callback=self.parse)
Example #19
 def parse(self, response):
     books = Selector(response).xpath('//article[@class="product_pod"]')
     for book in books:
         item = BooksItem()
         item['title'] = book.xpath('div/a/img/@alt').extract()[0]
         item['price'] = book.xpath(
             'div/p[@class="price_color"]/text()').extract()[0]
         instock_status = "".join(
             book.xpath(
                 'div/p[@class="instock availability"]/text()').extract())
         instock_status = instock_status.strip()  # strip() already removes newlines as well as spaces
         item['in_stock'] = instock_status
         rating = book.xpath(
             'p[contains(@class, "star-rating")]/@class').extract()[0]
         rating = rating.replace("star-rating ", "")
         item['rating'] = rating
         item['url'] = book.xpath(
             'div[@class="image_container"]/a/@href').extract()[0]
         yield item
Example #20
    def parse(self, response):
        # OrderedDict needs `from collections import OrderedDict`
        books = response.css('ol.row')
        for book in books:
            for b in book.css('article.product_pod'):
                data = OrderedDict(BooksItem())  # fresh mapping per product
                data['title'] = b.css('a::attr(title)').getall()
                '''
                #without import items  
                
                data['title']=b.css('a::attr(title)').getall()
                data['price']=b.css('div.product_price p.price_color::text').getall()
                data['stock']=b.css('div.product_price p.instock.availability::text').getall()[1].strip()
                data['rating']=conv[b.css('p::attr(class)').getall()[0].split()[-1]]
                '''
                yield data

        nextPage = response.css('li.next a::attr(href)').get()
        if nextPage and self.COUNT < self.LIMIT:
            nextLink = response.urljoin(nextPage)
            self.COUNT += 1
            yield scrapy.Request(url=nextLink, callback=self.parse)
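Rather than hand-rolled COUNT/LIMIT attributes, Scrapy's CloseSpider extension can cap the crawl itself; a sketch with an assumed spider name:

    import scrapy

    class BooksSpider(scrapy.Spider):
        name = 'books'  # hypothetical
        custom_settings = {
            'CLOSESPIDER_PAGECOUNT': 5,  # stop after this many responses
        }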
Example #21
    def parse(self, response):
        getItems = response.css('.wrap .item')
        for getItem in getItems:
            item = BooksItem()
            title = getItem.css('h4 a::text').get()
            author = getItem.css('.info a::text').getall()[0]
            price = getItem.css('.price_box .set2 strong::text').getall()[1]
            item['title'] = title
            item['author'] = author
            item['price'] = price
            yield item

        next_page = response.css('.wrap .nxt::attr(href)').get()
        print(next_page)

        # The course showed `yield response.follow(next_page, self.parse)` here,
        # but it never worked for me. Could it be because this crawls a different
        # site? The course example crawls PTT.
        # A local counter would reset to 0 on every parse() call, so keep it on
        # the spider instead (assumes `page = 0` is set as a class attribute).
        if next_page is not None and self.page < 3:
            self.page += 1
            # time.sleep(3)
            yield Request(response.urljoin(next_page), callback=self.parse)
Example #22
    def parse(self,response):

        selector = Selector(response)
        books = selector.xpath('//a[@class="tip"]')

        for book in books:
            item = BooksItem()  # fresh item per book
            title = book.xpath('./text()').extract_first()
            item['title'] = title
            book_id = book.xpath('./@href').extract_first()  # renamed: `id` shadows the built-in
            whichbook = "http://www.oreilly.com.cn/{}".format(book_id)
            item['content'] = whichbook
            yield item

#             yield  scrapy.Request(url=whichbook,callback=self.detail_parse,
#                                      meta={"title":title})
        nextLink = selector.xpath('//div[@class="plain_page"]/div/span/span/a/@href').extract()
        # page 10 is the last page; there is no next-page link
        if nextLink:
            nextLink = nextLink[0]
            print(nextLink)
            yield Request(self.url + nextLink,callback=self.parse)
Example #23
    def parse(self, response):
        urls = "https://book.douban.com/tag/"
        isFirst = re.findall(r'start=(\d*)&', response.url)
        if isFirst:
            pass
        else:
            try:
                related = response.xpath(
                    '//div[@class="tags-list"]/a/@href').extract()
            except:
                related = []  # keep the loop below from iterating over None
            for rel in related:
                next_tag = parse.quote(rel)
                next_tag = 'https://book.douban.com' + next_tag
                with open("tag.txt") as f:
                    url_list = f.read()
                if next_tag not in url_list:
                    with open("tag.txt", "a") as f:
                        f.write(next_tag + '----')
                    yield Request(next_tag)

            # print(related)
        try:
            result = (response.url.split('ag')[0]) in urls  # is this a tag-listing page?
        except:
            result = False
        item = BooksItem()

        if result:

            # detail pages
            count = 0
            for href in response.xpath(
                    '//li[@class="subject-item"]/div[@class="info"]//h2//a/@href'
            ).extract():
                count += 1
                with open("url.txt") as f:
                    url_list = f.read()
                if href not in url_list:
                    with open("url.txt", "a") as f:
                        f.write(href + '----')
                    print(href)
                    yield Request(href)

            print(response.url)
            if count == 0:
                print('---------------------------------------------------------------')

            # yield the next-page link
            # extract_first() returns None when there is no match, so no try/except
            # is needed; `next_page` also avoids shadowing the built-in next()
            next_page = response.xpath(
                '//span[@class="next"]/link/@href').extract_first()
            if next_page:
                next_url = 'https://book.douban.com' + next_page
                yield Request(next_url)


        else:

            item['url'] = response.url
            try:
                item['title'] = response.xpath(
                    '//*[@id="wrapper"]/h1/span/text()').extract()[0]
            except:
                print("No find title!")
                item['title'] = 0

            try:
                item['score'] = response.xpath(
                    '//div[@id="interest_sectl"]//strong/text()').extract(
                    )[0].replace(' ', '')
            except:
                print("未找到评分")
                item['score'] = 0

            try:
                item['number'] = response.xpath(
                    '//div[@class="rating_sum"]//a//text()').extract(
                    )[0].replace('\n', '')
            except:
                print("未找到评分人数")
                item['number'] = 0

            # basic info block
            try:
                base_info = response.xpath(
                    '//div[@id="info"]').extract()[0].replace('\n',
                                                              '').replace(
                                                                  ' ', '')
                item['info'] = base_info
            except:
                item['info'] = None

            try:
                author = re.findall(r"作者(.*?)</a>", base_info, re.S)[0]
                part = re.compile(r'">(.*)')
                item['author'] = part.findall(author)[0]
            except:
                print("No find author!")
                item['author'] = 0

            try:
                part = re.findall(r"译者</span>(.*?)</a>", base_info, re.S)[0]
                ret = re.compile(r'">(.*)')
                item['trans'] = ret.findall(part)[0]
            except:
                print("trans find error!")
                item['trans'] = 0

            try:
                press = re.findall(r'出版社(.*?)<br>', base_info)[0]
                part = re.compile(r'</span>(.*)')
                press = part.findall(press)[0]
                item['press'] = press
            except:
                print("No find press!")
                item['press'] = 0
            try:
                item['price'] = re.findall(r'定价:</span>(\d*.\d*)',
                                           base_info)[0]
            except:
                item['price'] = 0

            try:
                item['pages'] = re.findall(r'页数:</span>(\d*)<br>',
                                           base_info)[0]
            except:
                item['pages'] = None

            try:
                ret = re.findall(r'出版年(.*?)<br>', base_info)[0]
                ret = re.findall(r'\d*-\d+', ret)[0]
                item['publish_date'] = ret
            except:
                print("No find date!")
                item['publish_date'] = 0
            try:
                item['ISBN'] = re.findall(r'ISBN:</span>(\d*)', base_info)[0]
            except:
                item['ISBN'] = None

            try:
                reads = response.text
            except:
                pass
            try:
                item['reading'] = re.findall(r'>(\d*)人在读', reads)[0]
            except:
                item['reading'] = 0
            try:
                item['read_want'] = re.findall(r'>(\d*)人想读', reads)[0]
            except:
                item['read_want'] = 0
            try:
                item['read'] = re.findall(r'>(\d*)人读过', reads)[0]
            except:
                item['read'] = 0
            # try:
            #     read = response.xpath('//div[@id="collector"]/p/a/text()').extract()
            #     item['reading'] = re.findall(r'(\d*)人在读', read)[0]
            # except:
            #     # print("reading find error!")
            #     item['reading'] = 0
            #
            # try:
            #     item['read'] = re.findall(r'(\d*)人读过', read)[0]
            # except:
            #     # print("read find error!")
            #     item['read'] = 0
            #
            # try:
            #     item['read_want'] = re.findall(r'(\d*)人想读', read)[0]
            # except:
            #     # print('read_want find error!')
            #     item['read_want'] = 0

            try:
                item['label'] = response.xpath(
                    '//div[@id="db-tags-section"] /div[@class="indent"]/span/a/text()'
                ).extract()
            except:
                print("No label!")
                item['label'] = 0
            try:
                item['image'] = response.xpath(
                    '//*[@id="mainpic"]/a/img/@src').extract()[0]
            except:
                item['image'] = None

            try:
                item['short'] = response.xpath(
                    '//*[@id="comments"]//p[@class="comment-content"]/span/text()'
                ).extract()
            except:
                item['short'] = 0
            try:
                # extract() returns a list; join it before running the regex
                ret = ''.join(response.xpath(
                    '//div[@id="buyinfo-ebook"]//li//text()').extract())
                item['price_d'] = re.findall(r'(\d+\.\d+)', ret)[0]
            except:
                item['price_d'] = 0

            try:
                ret = response.xpath(
                    '//div[@class="mod-hd"]/h2/span/a/text()').extract()[0]
                item['short_number'] = re.findall(r'(\d+)', ret)[0]
            except:
                item['short_number'] = 0

            try:
                ret = response.xpath(
                    '//section[@class="reviews mod book-content"]//h2/span/a/text()'
                ).extract()[0]
                item['book_number'] = re.findall(r'(\d+)', ret)[0]
            except:
                item['book_number'] = 0

            try:
                ret = response.xpath(
                    '//div[@class="ugc-mod reading-notes"]//span/a/span/text()'
                ).extract()[0]
                item['note_number'] = ret
            except:
                item['note_number'] = 0

            try:
                distr = response.xpath('//div[@class="rating_wrap clearbox"]'
                                       ).extract()[0].replace('\n',
                                                              '').replace(
                                                                  ' ', '')
                part = re.compile(r"(\d*\.\d*)%")
                item['one'] = part.findall(distr)[4]
                item['two'] = part.findall(distr)[3]
                item['three'] = part.findall(distr)[2]
                item['four'] = part.findall(distr)[1]
                item['five'] = part.findall(distr)[0]
            except:
                print('can not find distr')
                item['one'] = 0
                item['two'] = 0
                item['three'] = 0
                item['four'] = 0
                item['five'] = 0

            # "want to read" detail page
            # yield Request(response.url + 'doings')


            postitem = dict(item)
            all.insert(postitem)  # `all` is assumed to be a MongoDB collection defined at module level
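Most of the try/except blocks above only guard against empty extract() results; a small helper such as this sketch (not part of the original spider) could replace them:

    def first_or(values, default=0):
        """First element of a list-like, or `default` when it is empty."""
        try:
            return values[0]
        except (IndexError, TypeError):
            return default

    # e.g. item['title'] = first_or(response.xpath('//*[@id="wrapper"]/h1/span/text()').extract())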
Example #24
    def detail_parse(self, response):
        item = BooksItem()

        selector = Selector(response)
        #http://p.3.cn/prices/mgets?skuIds=J_11252778,J_&type=1


        bookurl = response.meta["bookurl"]
        # requires `from selenium import webdriver`; note PhantomJS support was
        # removed from recent Selenium releases (see the headless-Chrome sketch below)
        driver = webdriver.PhantomJS()
        driver.get(bookurl)
        content = driver.find_element_by_xpath(
            '//*[@id="detail-tag-id-6"]/div[2]/div').text
        driver.close()

        try:
            item['title'] = response.meta["title"]
            item['price'] = response.meta["price"]
            item['content'] = content
            item['bookurl'] = response.meta["bookurl"]
            yield item
        except Exception:
            pass

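Since PhantomJS was dropped from recent Selenium releases, the same fetch could run on headless Chrome instead; a sketch assuming Selenium 4 and a chromedriver on PATH:

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')          # no visible browser window
    driver = webdriver.Chrome(options=options)
    driver.get(bookurl)                         # `bookurl` as in detail_parse above
    content = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-6"]/div[2]/div').text
    driver.quit()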