Beispiel #1
0
 def getrank(self, response):
     """Callback for the ranking request: fill in the item's ranking.

     The partially-built item travels in ``response.meta['item']``; the
     ranking is scraped out of the callback response body with a regex.
     Returns the completed item.
     """
     # The item was handed along from the previous request; the original
     # also built a throwaway DangdangItem() here that was immediately
     # overwritten — that dead assignment is removed.
     item = response.meta['item']
     # A very short body means the product has no ranking data at all.
     if len(response.text) > 100:
         ranks = re.findall('"rank":"(.*?)"', response.text)
         # Guard against a long body with no "rank" field (the original
         # raised IndexError here).
         item['ranking'] = ranks[0] if ranks else '无排名'
     else:
         item['ranking'] = '无排名'  # "no ranking"
     return item
Beispiel #2
0
 def single_goods(self, response):
     """Collect the basic fields of one product page, then issue a
     follow-up request for its ranking (handled by ``getrank``).
     """
     item = DangdangItem()
     item['title'] = response.meta['title']  # product name
     item['url'] = response.url  # purchase link
     item['comment'] = response.xpath(
         '//a[@dd_name="评论数"]/text()').extract()  # number of comments
     item['price'] = response.meta['price']  # price
     # The product id (needed for the ranking lookup) is the last path
     # segment of the url, minus its file extension.
     path_parts = str(item['url']).split('/')
     g_id = path_parts[-1].split('.')[0]
     ranking_url = 'http://product.dangdang.com/index.php?r=callback%2Fget-bang-rank&productId=' + g_id
     # Pass the half-built item along so getrank can finish it.
     yield Request(ranking_url, self.getrank, meta={'item': item})
Beispiel #3
0
 def parse(self, response):
     """Scrape the list fields from one category page, then queue
     category pages 1-99 (the scheduler de-duplicates repeat URLs).
     """
     item = DangdangItem()
     # Each field holds the full list of values found on the page.
     item['name'] = response.xpath("//a[@dd_name='单品标题']/text()").extract()
     item['price'] = response.xpath(
         "//span[@class='price_n']/text()").extract()
     item['link'] = response.xpath("//a[@class='pic']/@href").extract()
     item['comnum'] = response.xpath(
         "//a[@dd_name='单品评论']/text()").extract()
     yield item
     # Fan out over every paginated category page.
     for page in range(1, 100):
         page_url = "http://category.dangdang.com/pg" + str(page) + "-cid4011029.html"
         yield Request(page_url, callback=self.parse)
Beispiel #4
0
 def parse(self, response):
     """Extract titles, links, comment counts and prices from one list
     page, then follow pages 2-100 of the same category.
     """
     item = DangdangItem()
     item['title'] = response.xpath('//a[@name="itemlist-picture"]/@title').extract()
     item['link'] = response.xpath('//a[@name="itemlist-picture"]/@href').extract()
     # Comment counts arrive as e.g. "123条评论": strip the suffix, keep the int.
     raw_comments = response.xpath('//a[@dd_name="单品评论"]/text()').extract()
     item['comment'] = [int(c.replace("条评论", '')) for c in raw_comments]
     # Prices arrive as e.g. "¥12.30": drop the currency sign, keep the float.
     raw_prices = response.xpath('//span[@class="search_now_price"]/text()').extract()
     item['price'] = [
         float(re.findall('[0-9.].*', p.replace('¥', ''))[0])
         for p in raw_prices
     ]
     yield item
     # Queue the remaining category pages.
     for page in range(2, 101):
         page_url = 'http://category.dangdang.com/pg' + str(page) + '-cp01.54.00.00.00.00.html'
         yield Request(page_url, callback=self.parse)
Beispiel #5
0
    def parse(self, response):
        """Parse one book-list page: yield a DangdangItem per book, then
        follow the "next page" link until it is absent on the last page.
        """
        try:
            # UnicodeDammit guesses the decoding; the site serves utf-8 or gbk.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # NOTE(review): ['@ddt-pit'] is a literal-string predicate and is
            # therefore always true — [@ddt-pit] was probably intended; confirm.
            lis = selector.xpath(
                "//li['@ddt-pit'][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath(
                    "./p[@class='price']/span[@class='search_now_price']/text()"
                ).extract_first()
                author = li.xpath(
                    "./p[@class='search_book_author']/span[position()=1]/a/@title"
                ).extract_first()
                date = li.xpath(
                    "./p[@class='search_book_author']/span[position()=last()- 1]/text()"
                ).extract_first()
                publisher = li.xpath(
                    "./p[@class='search_book_author']/span[position()=last()]/a/@title "
                ).extract_first()
                detail = li.xpath(
                    "./p[@class='detail']/text()").extract_first()
                # detail is sometimes missing, so extract_first() yields None.
                item = DangdangItem()
                # Every field falls back to "" when its node was not found.
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                # [1:] drops the first character of the date text
                # (presumably a leading separator — confirm against the page).
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
                # On the last page the "next" link is absent and link is None.
            link = selector.xpath(
                "//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href"
            ).extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)

        except Exception as err:
            # NOTE(review): broad catch-and-print silently swallows scraping
            # errors; consider logging via self.logger instead.
            print(err)
Beispiel #6
0
 def parse(self, response):
     """Parse one JSON page of products: yield one item per product,
     then request the next page until ``settings.MAX_PAGE`` is reached.
     """
     products = json.loads(response.text)
     for product in products['products']:
         # Build a fresh item per product. The original reused one mutable
         # DangdangItem across yields, so later iterations could overwrite
         # data that pipelines were still holding.
         item = DangdangItem()
         item['book_name'] = product.get('name')  # title
         item['author_name'] = product.get('authorname')  # author
         item['price'] = product.get('price')  # current price
         item['original_price'] = product.get('original_price')  # original price
         item['score'] = product.get('score')  # rating
         item['stock'] = product.get('stock')  # stock
         item['total_review_count'] = product.get(
             'total_review_count')  # number of reviews
         item['shop_id'] = product.get('shop_id')  # shop id
         item['shop_info'] = product.get('shop_info')  # shop name
         item['publisher'] = product.get('publisher')  # publisher
         item['publish_date'] = product.get('publish_date')  # publication date
         item['image_url'] = product.get('image_url')  # cover image
         item['product_url'] = product.get('product_url')  # product url
         yield item
     # Advance pagination once per page. The original incremented
     # self.offset inside the product loop, requesting one new page per
     # product on the current page.
     self.offset += 1
     if self.offset <= settings.MAX_PAGE:
         yield scrapy.Request(self.url.format(self.offset))
Beispiel #7
0
    def parse_item(self, response):
        """Parse one product detail page.

        Yields one DangdangItem with the product metadata and rating
        summary, then one CommentItem per review, paging through the
        site's JSON comment endpoint until a page has no comments.
        """
        item = DangdangItem()  # item for the product itself
        commment_item = CommentItem()  # item reused for every comment row
        # Breadcrumb joined as "top category>sub category>sub-sub category".
        item["category"] = response.xpath('//*[@id="breadcrumb"]/a[1]/b/text()').extract_first()+'>'+response.xpath('//*[@id="breadcrumb"]/a[2]/text()').extract_first()+'>'+response.xpath('//*[@id="breadcrumb"]/a[3]/text()').extract_first()
        item["title"] = response.xpath("//*[@id='product_info']/div[1]/h1/@title").extract_first()
        item["detail"] = json.dumps(response.xpath("//*[@id='detail_describe']/ul//li/text()").extract(),ensure_ascii=False)
        item["link"] = response.url
        item["img_link"] =json.dumps(response.xpath("//div[@class='img_list']/ul//li/a/@data-imghref").extract())
        # The price node sometimes has only one text chunk; fall back to
        # index 0 when index 1 does not exist.
        try:
            item["price"] = response.xpath("//*[@id='dd-price']/text()").extract()[1].strip()
        except IndexError as e:
            item["price"] = response.xpath("//*[@id='dd-price']/text()").extract()[0].strip()
        item["comment_num"] = response.xpath("//*[@id='comm_num_down']/text()").extract()[0]

        # Shipping origin; the node is absent for self-operated listings.
        try:
            item["source"] = response.xpath("//*[@id='shop-geo-name']/text()").extract()[0].replace('\xa0至','')
        except IndexError as e:
            item["source"] = '当当自营'  # "Dangdang self-operated"

        # Extract the numeric goods id from the product url with a regex.
        goodsid = re.compile('\/(\d+).html').findall(response.url)[0]
        commment_item['goods_id'] = goodsid
        item["goods_id"] = goodsid

        '''########################################################
                      通过抓包分析,提取商品的好评率             
        ########################################################'''
        # (banner above: "extract the product's positive-rating rate via an
        # endpoint discovered by sniffing the site's traffic")
        # Pull categoryPath out of the inline script on the detail page.
        script = response.xpath("/html/body/script[1]/text()").extract()[0]
        categoryPath = re.compile(r'.*categoryPath":"(.*?)","describeMap').findall(script)[0]
        # Build the comment-list url that also carries the rating summary.
        rate_url = "http://product.dangdang.com/index.php?r=comment%2Flist&productId="+str(goodsid)+"&categoryPath="+str(categoryPath)+"&mainProductId="+str(goodsid)
        # NOTE(review): blocking requests.get inside a Scrapy callback stalls
        # the reactor; consider yielding a scrapy.Request instead.
        r = requests.get(rate_url)
        data_dict = json.loads(r.text)
        item["rate"] = data_dict['data']['list']['summary']['goodRate']
        item["good_comment_num"] = data_dict['data']['list']['summary']['total_crazy_count']
        item["mid_comment_num"] = data_dict['data']['list']['summary']['total_indifferent_count']
        item["bad_comment_num"] = data_dict['data']['list']['summary']['total_detest_count']
        yield item

        '''#####################################################
                         开始对评论、评分进行清洗并爬取                 
        #####################################################'''
        # (banner above: "start cleaning and crawling comments and scores")
        # The endpoint returns the comment list as an HTML fragment.
        html_str = data_dict['data']['list']['html']
        html = etree.HTML(html_str)
        comment_items = html.xpath('//div[@class="comment_items clearfix"]')
        pageIndex = 1
        while comment_items:
            pageIndex += 1
            # NOTE(review): this loop variable shadows the DangdangItem
            # `item` above (already yielded, so only readability suffers).
            for item in comment_items:
                comment_unit = item.xpath('.//div[@class="describe_detail"][1]/span[not(@class="icon")]/text()')
                score = item.xpath('.//div[@class="pinglun"]/em/text()')[0]
                time = item.xpath('.//div[@class="items_right"]/div[@class="starline clearfix"][1]/span[1]/text()')[0]
                comment = ' '.join(comment_unit)
                commment_item["comment"] = comment 
                commment_item['score'] = score
                commment_item["time"] = time
                yield commment_item


            # Fetch the next comment page; the loop ends once a page
            # contains no comment blocks.
            rate_url = "http://product.dangdang.com/index.php?r=comment%2Flist&productId="+str(goodsid)+"&categoryPath="+str(categoryPath)+"&mainProductId="+str(goodsid) + "&pageIndex=" + str(pageIndex)
            r = requests.get(rate_url)
            data_dict = json.loads(r.text)
            html_str = data_dict['data']['list']['html']
            html = etree.HTML(html_str)
            comment_items = html.xpath('//div[@class="comment_items clearfix"]')