Example #1
    def parse(self, response, **kwargs):
        li_list = response.xpath('//ul[@class="gl-warp clearfix"]/li')
        for li in li_list:
            item = JdItem()
            phone_desc = li.xpath(
                './/div[@class="p-name p-name-type-3"]/a/em/text()'
            ).extract_first().strip()
            phone_price = li.xpath(
                './/div[@class="p-price"]//i/text()').extract_first()
            phone_link = response.urljoin(
                li.xpath('.//div[@class="p-name p-name-type-3"]/a/@href').
                extract_first())
            from_phone = li.xpath(
                './/span[@class="J_im_icon"]/a/text()').extract_first()
            item['phone_desc'] = phone_desc
            item['phone_price'] = phone_price
            item['phone_link'] = phone_link
            item['from_phone'] = from_phone

            yield item

        # Pagination: read the current page and total page count from the inline adv_param variable
        page = int(
            re.findall('adv_param={page:"(.*?)",page_count:".*?"',
                       response.body.decode())[0])
        count_page = int(
            re.findall('adv_param={page:".*?",page_count:"(.*?)"',
                       response.body.decode())[0])
        if count_page > page:
            page = page + 1
            next_url = self.start_urls[0] + f'&page={page}'
            print(next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
Example #2
 def parse(self, response):
     products = response.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]')
     # soup = BeautifulSoup(response.body, "lxml")
     for p in products:
         item = JdItem()
         # imlist = soup.find_all(name="img",attrs={"width":"220","height":"220"})
         # for im in imlist:
         # if 'src' in im.attrs:
         #     imurl = "https:" + im.attrs['src']
         # else:
         #     imurl = "https:" + im.attrs['data-lazy-img']
         # check whether the src attribute exists to decide how to read the image URL
         if p.xpath(".//div[@class='p-img']/a/img/@src").extract():
             item['image'] = ''.join(
                 ["https:"] +
                 p.xpath(".//div[@class='p-img']/a/img/@src").extract())
         else:
             item['image'] = ''.join(["https:"] + p.xpath(
                 ".//div[@class='p-img']/a/img/@data-lazy-img").extract())
         item['price'] = ''.join(
             p.xpath(
                 ".//div[@class='p-price']/strong[@class='J_price']//text()"
             ).extract()).strip()
         item['title'] = ''.join(
             p.xpath(
                 ".//div[@class='p-name']/a/em/text()").extract()).strip()
         yield item
Example #3
    def parse_book_list(self, response):
        temp = response.meta['temp']

        book_list = response.xpath('//*[@id="J_goodsList"]/ul/li/div')

        for book in book_list:
            item = JdItem()

            item['big_category'] = temp['big_category']
            item['big_category_link'] = temp['big_category_link']
            item['small_category'] = temp['small_category']
            item['small_category_link'] = temp['small_category_link']

            item['bookname'] = book.xpath(
                './div[3]/a/em/text()|./div/div[2]/div[2]/div[3]/a/em/text()'
            ).extract_first().strip()
            item['author'] = book.xpath(
                './div[4]/span[1]/span/a/text()|./div/div[2]/div[2]/div[4]/span[1]/span[1]/a/text()'
            ).extract_first().strip()
            item['link'] = book.xpath(
                './div[1]/a/@href|./div/div[2]/div[2]/div[1]/a/@href'
            ).extract_first()

            # get the book's SKU id
            skuid = book.xpath('.//@data-sku').extract_first()
            # skuid = book.xpath('./@data-sku').extract_first()
            # print("skuid:",skuid)
            # build the price request URL from the SKU
            pri_url = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuid
            yield scrapy.Request(url=pri_url,
                                 callback=self.parse_price,
                                 meta={'meta_1': item})
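The parse_price callback is not shown in this example. A minimal sketch of what it could look like, assuming `import json` at module level and that the p.3.cn mgets endpoint answers with a JSON array of price objects, as the usage in Examples #4 and #19 (`price_json[0]["p"]`) suggests:

    def parse_price(self, response):
        # hypothetical callback: mgets answers with an array like [{"p": "59.00", ...}]
        item = response.meta['meta_1']
        price_list = json.loads(response.text)
        item['price'] = price_list[0]['p']  # "p" is the current price
        yield item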
Example #4
 def next_parse(self, response):
     item = JdItem()
     try:
         item["url"] = response.url
         item["name"] = response.xpath(
             '//div[@class="p-info lh"]/div[@class="p-name"]/text()'
         ).extract()
         item["store"] = response.xpath(
             '//div[@class="name"]/a/text()').extract()
         pat = "//item.jd.com/(.*?).html"
         shop_id = re.compile(pat).findall(item["url"])[0]
         response1 = requests.get(
             "https://p.3.cn/prices/mgets?callback=jQuery7879290&type=1&area=1_72_2799_0&pdtk=&pduid=719435848&pdpin=&pin=null&pdbp=0&skuIds=J_{}%2CJ_19659646005%2CJ_42646006588%2CJ_4741808%2CJ_33239063849%2CJ_33341525798%2CJ_3494451%2CJ_797802%2CJ_37652171093&ext=11100000&source=item-pc"
             .format(shop_id))
         data1 = response1.text
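         # strip the jQuery7879290(...) JSONP wrapper, keeping only the JSON array between the brackets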
         price = data1[data1.index("["):data1.rindex("]") + 1]
         p = json.loads(price)
         item["price"] = p[0]["p"]
         response2 = requests.get(
             "https://club.jd.com/comment/productCommentSummaries.action?referenceIds={}&callback=jQuery2538049&_=1559982177443"
             .format(shop_id))
         data2 = response2.text
         comment = data2[data2.find("["):data2.rfind("]") + 1]
         c = json.loads(comment)
         item["comment"] = c[0]["CommentCountStr"]
         item["good_comment"] = c[0]["GoodRateShow"]
         yield item
     except Exception as e:
         print(e)
Example #5
    def parse_news(self, response):

        t = JdItem()
        price = response.xpath(
            './/span[@class="p-price"]/span[2]/text()').extract_first()
        t['price'] = price
        info = response.xpath(
            './/div[@class="sku-name"]/text()').extract_first()
        t['info'] = info.strip()
        time = datetime.datetime.now()
        t['time'] = time
        try:
            if round(float(price)) < round(float(self.wantprice)):
                emailSenderClient = emailSender()
                toSendEmailLst = [self.email]
                subject = "Price drop alert: " + info.strip()
                body = "Details: the price fell below the threshold you set"
                emailSenderClient.sendEmail(toSendEmailLst, subject,
                                            body)  # send the notification email
        except Exception:
            pass

        yield t
Example #6
    def parse(self, response):
        li_list = response.xpath('//div[@id="J_goodsList"]/ul/li')
        for li in li_list:
            try:
                id = li.xpath('./div/div[6]/a/@data-sku').extract_first()
                title = li.xpath('./div/div[4]/a/em/text()').extract()
                link = li.xpath('./div/div[1]/a/@href').extract_first()
                price = li.xpath(
                    './div/div[3]/strong/i/text()').extract_first()
                # comments = li.xpath('./div/div[5]/strong/a//text()').extract_first()
                shop_name = li.xpath(
                    './div/div[7]/span/a/text()').extract_first()

                item = JdItem()

                item["id"] = id
                item["title"] = title
                item["link"] = link
                item["price"] = price
                # self.item["comments"] = comments
                item["shop_name"] = shop_name

            except Exception as e:
                raise Exception("Parsing error!") from e
            yield scrapy.Request(url=self.comments_urls + str(id),
                                 callback=self.getDetailpage,
                                 meta={"item": item})
Example #7
    def parse(self, response, **kwargs):
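        # get_data() (defined elsewhere) returns the category list: dicts with
        # categoryName, categoryId, fatherCategoryId and a sonList of sub-categories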
        data = self.get_data()
        for i in data:

            big_cate = i['categoryName']
            s1 = int(i['fatherCategoryId'])
            s2 = int(i['categoryId'])
            big_cate_link = f'https://channel.jd.com/{s1}-{s2}.html'
            # print(big_cate,big_cate_link)
            small_list = i['sonList']
            for small in small_list:
                item = JdItem()
                small_cate = small['categoryName']
                s3 = int(small['categoryId'])
                s4 = int(small['fatherCategoryId'])
                small_cate_link = f'https://list.jd.com/list.html?cat={s1},{s4},{s3}'
                # print(small_cate,small_cate_link)
                item['big_cate'] = big_cate
                item['big_cate_link'] = big_cate_link
                item['small_cate'] = small_cate
                item['small_cate_link'] = small_cate_link

                yield scrapy.Request(url=item['small_cate_link'],
                                     meta={'item': item},
                                     callback=self.parse_book_list)
Example #8
    def parse_book_list(self, response):
        temp = response.meta['py21']
        book_list = response.xpath('//*[@id="J_goodsList"]/ul/li/div')
        data = ""
        for book in book_list:
            item = JdItem()
            item['big_category'] = temp["big_category"]
            item['big_category_link'] = temp["big_category_link"]
            item['small_category'] = temp["small_category"]
            item['small_category_link'] = temp["small_category_link"]

            item['bookname'] = book.xpath(
                './div[3]/a/em/text()').extract_first()
            item['author'] = book.xpath(
                './div[4]/span[1]/a/text()').extract_first()
            item['link'] = book.xpath('./div[1]/a/@href').extract_first()
            item['price'] = book.xpath(
                './div[2]/strong/i/text()').extract_first()
            yield item

            sku = book.xpath('.//i[@class="promo-words"]/@id'
                             ).extract_first().split('_')[-1]
            data += sku + ','
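        # data now holds the comma-joined SKU ids for show_items; rebuild the cat
        # parameter with URL-encoded commas for the listNew.php AJAX request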
        cat = response.url.split('=')[-1].replace(",", "%2c")

        yield scrapy.Request(
            url=
            f'https://list.jd.com/listNew.php?cat={cat}&page=2&s=27&scrolling=y&tpl=2_M&isList=1&show_items={data[:-1]}',
            callback=self.parse_book_list_one,
            meta={"py21": temp},
            headers={
                "referer": "https://list.jd.com/list.html?cat=1713,3258,3297"
            })
Example #9
 def text(self, response):
     # article-body level
     item = JdItem()
     print("yielding at article-body level")
     a = 0
     try:
         nevelone_title = response.xpath(
             '//td[@class="bav_border_top"][1]/a[3]//text()').extract_first(
             )
         text_title = response.xpath(
             '//div[@id="cps_title"]//text()').extract_first()
         text_text = response.xpath(
             '//div[@id="cp_content"]//text()').extract()
         item['title'] = nevelone_title
         item['url'] = response.meta['url']
         item['author'] = response.meta['author']
         item['time'] = response.meta['time']
         item['category'] = response.meta['category']
         item['text_title'] = text_title
         item['text_text_all'] = item['title'] + ''.join(text_text).replace(
             '\u3000\u3000', '').strip('\r\n')
         print(item['title'])
         a += 1
         print(a)
         yield item
     except Exception:
         pass
Example #10
class Jdssspider(scrapy.Spider):
    name = 'jdsn'
    allowed_domains = ['jd.com']
    page = 3
    s = 56
    key = 'swith'
    nopass = False
    number = 2
    start_urls = [f'https://search.jd.com/Search?keyword={key}&wq={key}&page={page}&s={s}']

    def parse(self, response):
        list_id = []
        pieces = 0

        print(response)
        print('——————————————————————start crawling——————————————————————')
        print('——————————————————————start output——————————————————————')

        goods_list = response.xpath('//div[@id="J_goodsList"]/ul/li')
        for i in goods_list:
            id = i.xpath('./@data-sku').extract_first()
            url = 'https:' + i.xpath('.//div[@class="p-name p-name-type-2"]/a/@href').extract_first()
            title = i.xpath('.//div[@class="p-name p-name-type-2"]/a/em/text()').extract_first()
            shop = i.xpath('.//div[@class="p-shop"]/span/a/text()').extract_first()
            price = i.xpath('.//div[@class="p-price"]/strong/i/text()').extract_first()
            print(title, price)
            list_id.append(id)
            item = JdItem()
            item['url'] = url
            item['title'] = title
            item['shop'] = shop
            item['price'] = price
            yield item
            pieces += 1
            print(f'item {pieces} on this page')

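        # JD search serves results in two halves: the normal Search page and an
        # s_new.php AJAX request whose show_items parameter lists the SKUs already seen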
        url = [f"https://search.jd.com/Search?keyword={self.key}&wq={self.key}&page={self.page}&s={self.s}",
               f'https://search.jd.com/s_new.php?keyword={self.key}&page={self.page - 1}&show_items={",".join(list_id)}']
        headers = {"Referer": url}
        if self.nopass:
            self.nopass = False
            url = url[0]
            headers = None

        else:
            self.page += 2
            self.s += 60
            self.nopass = True
            url = url[1]
        print(f'currently on page {self.number}')
        self.number += 1
        if self.number < 20 and pieces == 30:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                headers=headers,
                dont_filter=True)
        else:
            return
Example #11
    def parse(self, response):
        wines = response.xpath(
            '//ul[@class="gl-warp clearfix"]/li/div[@class="gl-i-wrap"]')
        for wine in wines:

            item = JdItem()
            item['name'] = wine.xpath(
                './div[@class="p-name p-name-type-2"]/a[@target="_blank"]//em/text()'
            ).extract_first()

            print(item['name'].strip())

            yield item
Example #12
 def parse(self, response):
     url = response.xpath("//div[@class='p-img']/a/@href").extract()
     # print(url)
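     # some hrefs are scheme-relative ("//item.jd.com/..."), so prepend "https:" when missing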
     for this_url in url:
         if this_url.startswith("https"):
             yield Request(this_url, callback=self.next_parse)
         else:
             yield Request("https:" + this_url, callback=self.next_parse)
     for i in range(3, 200, 2):
         next_url = "https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&page={}".format(
             i, i + 55)
         yield Request(next_url, callback=self.parse)
Example #13
    def parse_book_list(self, response):
        temp = response.meta['meta_1']
        # print ('----',temp)

        # get the list of book nodes
        book_list = response.xpath('//*[@id="plist"]/ul/li/div')
        # print (len(book_list))
        # iterate over the books
        for book in book_list:
            item = JdItem()

            # extract the book info
            item['big_cate'] = temp['big_cate']
            item['big_cate_link'] = temp['big_cate_link']
            item['small_cate'] = temp['small_cate']
            item['small_cate_link'] = temp['small_cate_link']

            item['book_name'] = book.xpath(
                './div[3]/a/em/text()').extract_first()
            if item['book_name'] is not None:
                item['book_name'] = item['book_name'].strip()
            item['cover_link'] = book.xpath(
                './div[1]/a/img/@src|./div[1]/a/img/@data-lazy-img'
            ).extract_first()

            item['detail_url'] = book.xpath('./div[1]/a/@href').extract_first()
            if item['detail_url'] is not None:
                item['detail_url'] = 'https:' + item['detail_url']

            item['authors'] = book.xpath(
                './div[4]/span[1]/span/a/text()').extract()
            item['publisher'] = book.xpath(
                './div[4]/span[2]/a/text()').extract_first()

            item['pub_time'] = book.xpath(
                './div[4]/span[3]/text()').extract_first()
            if item['pub_time'] is not None:
                item['pub_time'] = item['pub_time'].strip()

            item['sku'] = book.xpath('./@data-sku').extract_first()
            # print (item)

            # send the price request
            if item['sku'] is not None:
                url = 'https://p.3.cn/prices/mgets?skuIds=J_' + item['sku']
                yield scrapy.Request(url,
                                     callback=self.parse_price,
                                     meta={'meta_2': item})
Example #14
    def parse_book_list(self, response):
        temp = response.meta['meta_1']
        # print(temp['big_category'])

        # print(temp)

        # get all the book nodes
        book_list = response.xpath('//*[@id="plist"]/ul/li/div')
        # print(len(book_list))

        # iterate over all the book nodes
        for book in book_list:
            # build the item instance
            item = JdItem()

            # extract the data
            item['big_category'] = temp['big_category']
            item['big_category_link'] = temp['big_category_link']
            item['small_category'] = temp['small_category']
            item['small_category_link'] = temp['small_category_link']

            item['name'] = book.xpath('./div[3]/a/em/text()').extract_first()
            try:
                item['cover_link'] = 'https:' + book.xpath(
                    './div[1]/a/img/@src').extract_first()
            except TypeError:  # extract_first() returned None
                item['cover_link'] = None
            try:
                item['detail_url'] = 'https:' + book.xpath(
                    './div[1]/a/@href').extract_first()
            except TypeError:
                item['detail_url'] = None
            item['author'] = book.xpath(
                './div[4]/span[1]/span/a/text()').extract_first()
            item['publisher'] = book.xpath(
                './div[4]/span[2]/a/text()').extract_first()
            item['pub_date'] = book.xpath(
                './div[4]/span[3]/text()').extract_first()
            # item['price'] = book.xpath('./div[4]/span[3]/text()').extract_first()

            # build the price request
            skuid = book.xpath('./@data-sku').extract_first()
            if skuid is not None:
                url = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuid
                yield scrapy.Request(url,
                                     callback=self.parse_price,
                                     meta={'meta_2': item})
Example #15
    def parse(self, response):
        temp_list = response.xpath("//*[@id='J_goodsList']/ul/li")
        for temp in temp_list:
            item = JdItem()
            item['name'] = ''.join(
                temp.xpath('div/div[3]/a/em/text()').extract())
            item['sales'] = temp.xpath(
                'div/div[4]/strong/a/text()').extract()[0]
            item['price'] = temp.xpath(
                'div/div[2]/strong/i/text()').extract()[0]
            yield item

        if self.i < 100:
            self.i = self.i + 2
            url = self.url1 + str(self.i) + self.url2
            yield Request(url, callback=self.parse)
Example #16
 def parse(self, response):
     # top-level book categories:
     dt_list = response.xpath('//*[@id="booksort"]/div[2]/dl/dt')
     # walk from each dt to the dd that follows it using
     # following-sibling::*[1]
     for dt in dt_list:
         item = JdItem()
         item['big_name'] = dt.xpath('./a/text()').extract_first()
          # sub-categories:
         em_list = dt.xpath('./following-sibling::*[1]/em')
         for em in em_list:
             item['small_name'] = em.xpath('a/text()').extract_first()
             small_link = 'http:' + em.xpath('a/@href').extract_first()
             # descend to the second level
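             # deepcopy gives each request its own snapshot of item, since this
             # shared item keeps being mutated while requests are still pending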
             yield scrapy.Request(small_link,
                                  callback=self.parse_book,
                                  meta={"book": deepcopy(item)})
Example #17
 def parse(self, response):
     # print(response.text)
     itemList = response.xpath('//div[@class="search_prolist_item"]')
     subclass = response.xpath('//title/text()').extract()[0].split(' ')[0]
     for node in itemList[
             0:4]:  # only the info of the first 4 items can be retrieved
         item = JdItem()
         # xpath returns a list
         # .// means any child node under current node
         item['name'] = node.xpath(
             './/div[@class="search_prolist_title"]/text()').extract(
             )[0].strip()
         item['img_url'] = node.xpath(
             './/div[@class="search_prolist_cover"]/img[@class="photo"]/@src'
         ).extract()[0]
         item['subclass'] = subclass
         item['item_id'] = node.xpath('./@skuid').extract()[0]
         yield item  # certain subclass can not be retrieved, need to clean in pipeline
Example #18
 def extract_product_coupon(self, response):
     product = response.meta['product']
     response_json = json.loads(str(response.body, encoding='utf8'))
     skuCoupon = response_json['skuCoupon']
     skuPromote = response_json['prom']['pickOneTag']
     skuCoupon_list = []
     skuPromote_list = []
     print(skuCoupon)
     for i in skuCoupon:
         coupon = {}
         # quota
         coupon['quota'] = i['quota']
         # discount amount
         coupon['trueDiscount'] = i['trueDiscount']
         # restrictions
         coupon['limit'] = i['name']
         coupon['beginTime'] = i['beginTime']
         coupon['endTime'] = i['endTime']
         skuCoupon_list.append(coupon)
     for i in skuPromote:
         prom = {}
         prom['content'] = i['content']
         prom['name'] = i['name']
         try:
             prom['adurl'] = i['adurl']
         except Exception:
             pass
         skuPromote_list.append(prom)
     product['jetso'] = {'product_coupon': skuCoupon_list, 'skuPromote': skuPromote_list}
     item = JdItem()
     item['sku'] = product['sku']
     item['name'] = product['name']
     item['detail'] = product['detail']
     item['image'] = product['image']
     item['other_type'] = product['other_type']
     item['price'] = product['price']
     item['p_type'] = product['p_type']
     item['crawl_date'] = product['crawl_date']
     item['sku_slave_typeid'] = product['sku_slave_typeid']
     item['jetso'] = product['jetso']
     print(product)
     yield item
Example #19
 def extract_product_price(self, response):
     """
     獲取商品價格
     :param response:
     :return:
     """
     product = response.meta['product']
     price_json = json.loads(str(response.body, encoding='utf8'))
     ret_json = {}
     try:
         ret_json["old_price"] = price_json[0]["op"]
     except KeyError:
         # no original-price field in the response
         pass
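     # "p" is the current price, "op" the usual price (cf. the key mapping in Example #24)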
     ret_json["price"] = price_json[0]["p"]
     # check whether a JD PLUS (member) price is available
     if "tpp" in price_json[0].keys():
         ret_json["vip"] = price_json[0]["tpp"]
     product['price'] = ret_json
     coupon_url = 'https://cd.jd.com/promotion/v2?skuId=%s&area=19_1609_41655_0&cat=%s' % (
         product['sku'], product['sku_slave_typeid'])
     print(coupon_url)
     if product['sku_slave_typeid']:
         yield scrapy.Request(
             coupon_url,
             meta={'product': product},
             callback=self.extract_product_coupon,
             dont_filter=True
         )
     else:
         item = JdItem()
         item['sku'] = product['sku']
         item['name'] = product['name']
         item['detail'] = product['detail']
         item['image'] = product['image']
         item['other_type'] = product['other_type']
         item['price'] = product['price']
         item['p_type'] = product['p_type']
         item['crawl_date'] = product['crawl_date']
         item['sku_slave_typeid'] = product['sku_slave_typeid']
         item['product_coupon'] = ''
         yield item
Example #20
    def parse(self, response):
        """ 对Selenium返回的页面进行解析 """

        products = response.xpath("//*[@id='plist']/ul/li")

        for product in products:
            item = JdItem()

            item['title'] = product.xpath(
                ".//div/div[3]/a/em/text()").extract_first().strip()
            item['price'] = product.xpath(
                ".//div/div[2]/strong[1]/i/text()").extract_first()
            item['pic'] = product.xpath(
                ".//div/div[1]/a/img/@src").extract_first()
            item['comment'] = product.xpath(
                ".//div/div[4]/strong/a/text()").extract_first()
            item['store'] = product.xpath(
                ".//div/div[5]/span/a/text()").extract_first()

            yield item
Example #21
    def parse_book_list_one(self, response):
        temp = response.meta['py21']
        book_list = response.xpath('//li[@class="gl-item"]')
        for book in book_list:
            item = JdItem()
            item['big_category'] = temp["big_category"]
            item['big_category_link'] = temp["big_category_link"]
            item['small_category'] = temp["small_category"]
            item['small_category_link'] = temp["small_category_link"]

            item['bookname'] = book.xpath(
                './/div[@class="p-name"]/a/em/text()').extract_first()
            item['author'] = book.xpath(
                './/div[@class="p-bookdetails"]/span[1]/a/text()'
            ).extract_first()
            item['link'] = book.xpath(
                './/div[@class="p-name"]/a/@href').extract_first()
            item['price'] = book.xpath(
                './/div[@class="p-price"]/strong/i/text()').extract_first()

            yield item
Example #22
 def parse(self, response):
     # the response text wraps a JSON object; extract it with a regex
     comment = re.findall(r'{"productAttr":.*}', response.text)
     # parse the JSON
     comm_dict = json.loads(comment[0])
     # 'comments' holds the review entries
     comm_list = comm_dict['comments']
     for com in comm_list:
         item = JdItem()
         # user id
         item['id'] = com["id"]
         # product name
         item['name'] = com['referenceName']
         # split() drops whitespace/newlines; join() rebuilds one continuous comment
         item['comment'] = ''.join(com['content'].split())
         # user rating
         item['score'] = com['score']
         # timestamp
         item['time'] = com['creationTime']
         yield item
Example #23
    def parse_detail(self, response):
        data = response.request.meta['meta_1']
        # get the list of all book nodes
        books = response.xpath('//*[@id="plist"]/ul/li/div')
        # iterate over the nodes
        for book in books:
            # create a fresh item per book; a single shared item would be
            # overwritten before the asynchronous price callbacks run
            item = JdItem()
            item['big_category'] = data['big_category']
            item['big_category_link'] = data['big_category_link']
            item['small_category'] = data['small_category']
            item['small_category_link'] = data['small_category_link']

            item['book_name'] = book.xpath(
                './div[3]/a/em/text()').extract_first()
            item['book_cover'] = book.xpath(
                './div[1]/a/img/@src|./div[1]/a/img/@data-lazy-img'
            ).extract_first()
            try:
                item['detail_link'] = 'https:' + book.xpath(
                    './div[1]/a/@href').extract_first()
            except TypeError:  # extract_first() returned None
                item['detail_link'] = None
            item['author'] = book.xpath(
                './div[4]/span[1]/span/a/text()').extract_first()
            item['publisher'] = book.xpath(
                './div[4]/span[2]/a/text()').extract_first()
            item['pub_data'] = book.xpath(
                './div[4]/span[3]/text()').extract_first()
            # item['price'] = book.xpath('./div[2]/strong[1]/i/text()').extract_first()
            # yield item
            skuid = book.xpath('./@data-sku').extract_first()
            if skuid is not None:
                # build the price url
                price_url = 'https://p.3.cn/prices/mgets?skuIds=J_' + str(
                    skuid)
                yield scrapy.Request(url=price_url,
                                     callback=self.parse_price,
                                     meta={'meta_2': item})
Example #24
 def parse(self, response):
     name = response.xpath(
         '//div[@id="name"]/h1/text()').extract_first().replace(' ', '')
     data = response.xpath('//div[@id="p-author"]')
     author = data.xpath('string(.)').extract_first().replace('\n',
                                                              '').replace(
                                                                  ' ', '')
     data = response.xpath('//ul[@id="parameter2"]')
     detail = data.xpath('string(.)').extract_first()[1:-1].replace(' ', '')
     shop = '京东自营' if detail[:3] == '出版社' else response.xpath(
         '//ul[@id="parameter2"]/li[1]/a/text()').extract_first()
     book_id = response.xpath('//head/link/@href').extract_first().split(
         '/')[-1].split('.')[0]
     this = 'http:' + response.xpath(
         '//head/link/@href').extract_first()  # this page's url
     # Price and comment data come from JS requests whose responses carry JSON,
     # so we bypass Scrapy's built-in machinery and fetch them with requests.
     jsurl = 'http://p.3.cn/prices/get?type=1&area=1_72_2799&ext=11000000&pin=&' \
             'pdtk=FPccakpV9mj2W7jFSF%2BtATks2rbgJDLiwIUI5nkedHiAWTgr9wJVrXOToICN%2B93%2B&' \
             'pduid=1506124005948838590885&pdpin=&pdbp=0&skuid=J_{}&callback=cnp'.format(book_id)
     # pdtk is the last segment of the Cookie; ideally it should be obtained dynamically
     raw = self.getJsonFrom(jsurl).text[5:-4]  # strip the cnp(...) wrapper, leaving pure JSON
     price = json.loads(raw)  # parse the JSON
     cmturl = 'http://club.jd.com/comment/productCommentSummaries.action?referenceIds={}'.format(
         book_id)
     info = self.getJsonFrom(cmturl).json()
     Item = JdItem(
         book_dict={
             '书名': name,
             '作者': author,
             '店铺': shop,
             '现价': float(price['p']),
             '定价': float(price['m']),
             '平常价': float(price['op']),
             '评论数': int(info['CommentsCount'][0]['CommentCount']),
             '好评率': float(info['CommentsCount'][0]['GoodRate']),
             '详情': detail,
             '链接': this
         })
     yield Item
Example #25
    def parse_booklist(self, response):
        temp = response.meta
        page = int(temp.get("page"))
        total_page = int(
            response.xpath(
                "//span[@class='p-skip']/em/b/text()").get().strip())

        book_list = response.xpath("//div[@id='plist']/ul/li/div")
        for book in book_list:
            item = JdItem()
            item["big_category"] = temp["big_category"]
            item["big_category_link"] = temp["big_category_link"]
            item["small_category"] = temp["small_category"]
            item["small_category_link"] = temp["small_category_link"]
            item["bookname"] = book.xpath(
                ".//div[@class='p-name']/a/em/text()").get().strip()
            item["link"] = response.urljoin(
                book.xpath("./div[@class='p-name']/a/@href").get())
            item["author"] = book.xpath(
                "./div[@class='p-bookdetails']/span/span/a/text()").get()

            # get the book SKU and build the price request URL
            sku_id = book.xpath(".//@data-sku").get()
            price_url = "https://p.3.cn/prices/mgets?skuIds=J_" + sku_id
            yield scrapy.Request(url=price_url,
                                 callback=self.parse_price,
                                 meta={"meta_1": item})

        if page < total_page:
            page += 1
            small_category_link = temp[
                "small_category_link"] + "&page={}".format(page)
            temp["page"] = page
            yield scrapy.Request(small_category_link,
                                 callback=self.parse_booklist,
                                 meta=temp)
Example #26
    def parse_book_list(self, response):
        """解析图书小分类下面的书籍列表"""
        # 获取parse方法传递的meta数据,传递temo的作用是为了构建模型的时候传递oarse中获得的大小分类的name和url
        temp = response.meta['meta1']

        book_list = response.xpath('//*[@id="plist"]/ul/li/div')
        # iterate over the book list
        for book in book_list:
            # instantiate the item
            item = JdItem()
            # book name and category info; the publisher only shows on mouse-over
            item['name'] = book.xpath(
                './div[3]/a/em/text()').extract_first().strip()
            item['big_category'] = temp['big_category']
            item['big_category_url'] = temp['big_category_url']
            item['small_category'] = temp['small_category']
            item['small_category_url'] = temp['small_category_url']
            item['author'] = book.xpath(
                './div[@class="p-bookdetails"]/span[@class="p-bi-name"]/span[@class="author_type_1"]/a/text()'
            ).extract_first()
            item['publisher'] = book.xpath(
                './div[@class="p-bookdetails"]/span[2]/a/text()'
            ).extract_first()
            item['pub_date'] = book.xpath(
                './div[@class="p-bookdetails"]/span[3]/text()').extract_first(
                ).strip()

            try:
                item['cover_url'] = 'https:' + book.xpath(
                    './div[1]/a/img/@src').extract_first()
            except TypeError:  # extract_first() returned None
                item['cover_url'] = None
            try:
                item['detail_url'] = 'https:' + book.xpath(
                    './div[3]/a/@href').extract_first()
            except TypeError:
                item['detail_url'] = None

            # The price is not stored in the HTML; jQuery fetches it from a p.3.cn
            # endpoint whose skuid and other parameters are hidden in the page,
            # returning a JSON list of prices for the current page, e.g.:

            # https://p.3.cn/prices/mgets?skuIds=J_11757834%2CJ_10367073%2CJ_11711801%2CJ_12090377%2
            # CJ_10199768%2CJ_11711801%2CJ_12018031%2CJ_10019917%2CJ_11711801%2CJ_10162899%2CJ_110816
            # 95%2CJ_12114139%2CJ_12010088%2CJ_12161302%2CJ_11779454%2CJ_11939717%2CJ_12026957%2CJ_12
            # 184621%2CJ_12115244%2CJ_11930113%2CJ_10937943%2CJ_12192773%2CJ_12073030%2CJ_12098764%2CJ
            # _11138599%2CJ_11165561%2CJ_11920855%2CJ_11682924%2CJ_11682923%2CJ_11892139&pduid=1523432
            # 585886562677791

            # price link for skuid=11757834; pduid is fixed
            # https://p.3.cn/prices/mgets?skuIds=J_11757834&pduid=1523432585886562677791
            skuid = book.xpath('./@data-sku').extract_first()

            pduid = '&pduid=1523432585886562677791'
            # print(item)
            # send a follow-up request for the price info
            if skuid is not None:
                # if no price comes back, the domain serving these cross-origin
                # price requests may have changed
                url = 'https://p.3.cn/prices/mgets?skuIds=J_' + skuid + pduid
                yield scrapy.Request(url=url,
                                     callback=self.parse_price,
                                     meta={'meta2': item})