Exemple #1
0
    def __init__(self, category=None, *args, **kwargs):
        """Create the spider and seed its start URLs from the configuration.

        :param category: optional category argument; it is logged and then
            stored unchanged on ``self.category``.
        :param args: positional arguments forwarded to the parent constructor.
        :param kwargs: keyword arguments forwarded to the parent constructor.
        """
        super(JDSpider, self).__init__(*args, **kwargs)

        self.log(category, INFO)
        # Work on a per-instance copy: appending to a list declared on the
        # class would leak the generated URLs into every other instance.
        self.start_urls = list(self.start_urls)
        self.start_urls.extend(self.generate_root_url_by_configuration())
        self.category = category

        self.job = Job()
Exemple #2
0
    def __init__(self, category=None, *args, **kwargs):
        """Initialise the spider and collect the root listing URLs.

        :param category: optional category argument; logged at INFO level and
            kept on ``self.category``.
        :param args: extra positional arguments passed to the base class.
        :param kwargs: extra keyword arguments passed to the base class.
        """
        super(JDSpider, self).__init__(*args, **kwargs)

        self.log(category, INFO)
        # Rebind ``start_urls`` on the instance instead of appending in
        # place, so a list shared at class level is never mutated (which
        # would duplicate the URLs each time a spider is instantiated).
        generated_urls = list(self.generate_root_url_by_configuration())
        self.start_urls = list(self.start_urls) + generated_urls
        self.category = category

        self.job = Job()
Exemple #3
0
class JDSpider(scrapy.Spider):
    """Spider that walks JD category listings and collects stock data.

    Every product found on a listing page starts a chain of follow-up
    requests: price -> mobile price -> promotions, plus an image download
    when the image is not on disk yet.  The kind of response handed to
    :meth:`parse` is identified by a marker key stored in ``response.meta``
    (``stock_page``, ``stock_price``, ``stock_mobile_price``, ``stock_img``
    or ``stock_promotion``).
    """

    name = "jd_web"
    start_urls = []

    def __init__(self, category=None, *args, **kwargs):
        """Create the spider and seed its start URLs from the configuration.

        :param category: optional category argument; it is logged and then
            stored unchanged on ``self.category``.
        :param args: positional arguments forwarded to the parent constructor.
        :param kwargs: keyword arguments forwarded to the parent constructor.
        """
        super(JDSpider, self).__init__(*args, **kwargs)

        self.log(category, INFO)
        # Build a per-instance list: appending to the class-level
        # ``start_urls`` would leak URLs into every other instance.
        self.start_urls = list(self.start_urls)
        self.start_urls.extend(self.generate_root_url_by_configuration())
        self.category = category

        self.job = Job()

    def generate_root_url_by_configuration(self):
        """Yield the first listing-page URL of every category mapped to JD.

        Connects to MongoDB using the ``MONGODB_*`` settings and aggregates
        the ``category`` collection, keeping the documents that have a
        ``provider`` entry named ``"jd"``.

        Side effect: while the generator is consumed,
        ``self.category_mapping`` is filled with
        ``provider.param1 -> int(value)`` pairs.

        :return: generator of listing URLs (strings).
        """
        self.client = MongoClient(settings['MONGODB_SERVER'],
                                  settings['MONGODB_PORT'])
        self.db = self.client[settings['MONGODB_DB']]
        self.collection = self.db['category']

        pipeline = [
            {"$unwind": "$provider"},
            {"$match": {"provider.name": "jd"}},
            {"$group": {
                "_id": "$_id",
                "name": {'$first': '$name'},
                "value": {'$first': '$value'},
                "param1": {'$first': '$provider.param1'},
            }},
        ]
        items = self.collection.aggregate(pipeline)
        # PyMongo 2.x returns the raw command reply ({'result': [...]});
        # PyMongo 3+ returns a cursor that is iterated directly.
        rows = items['result'] if isinstance(items, dict) else items

        self.category_mapping = {}
        for item in rows:
            self.category_mapping[item['param1']] = int(item['value'])
            yield "http://list.jd.com/list.html?cat=%s&page=1&&delivery=1&JL=6_0_0" % (
                item["param1"])

    def get_category(self, provider_category):
        """Convert a JD category into the standard category.

        :param provider_category: JD category key (the ``cat`` URL value).
        :return: the value of the standard category.
        :raises KeyError: if the JD category is not in the mapping.
        """
        return self.category_mapping[provider_category]

    def make_requests_from_url(self, url):
        """Build the listing-page request for a start URL.

        Calls ``self.job.log_start()`` and tags the request with the
        ``stock_page`` marker and the category taken from the URL's ``cat``
        query parameter.

        :param url: listing URL containing ``cat=<category>&``.
        :return: the :class:`Request` for that URL (never de-duplicated).
        """
        self.job.log_start()
        m = re.search(r'cat=(.*?)&', url)
        return Request(url,
                       dont_filter=True,
                       meta={
                           'stock_page': 1,
                           'category': m.group(1)
                       })

    def extract_single_stock(self, node):
        """Extract the raw fields of one product from a listing node.

        :param node: selector of a single product entry.
        :return: tuple ``(id, name, url, img, comments)`` of extracted
            strings.
        :raises IndexError: if one of the expected sub-nodes is missing.
        """
        url = node.xpath('.//div[@class="p-name"]/a/@href').extract()[0]
        name = node.xpath('.//div[@class="p-name"]/a/@title').extract()[0]
        img = node.xpath(
            './/div[@class="p-img"]/a/img/@data-lazy-img').extract()[0]
        comments = node.xpath(
            './/div[@class="p-commit"]//a/text()').extract()[0]
        # The id is the file name of the product page: the text between the
        # last "/" and the last ".html" of the URL.
        stock_id = url[url.rfind("/") + 1:url.rfind(".html")]

        return (stock_id, name, url, img, comments)

    def generate_price_query_url(self, stock_id):
        """Return the price-query URL for ``stock_id`` (sent as ``J_<id>``)."""
        return 'http://p.3.cn/prices/get?skuid=J_%s' % (stock_id)

    def generate_mobile_price_query_url(self, stock_id):
        """Return the mobile product-page URL for ``stock_id``."""
        return 'http://item.m.jd.com/product/%s.html' % (stock_id)

    def generate_promotion_query_url(self, stock_id):
        """Return the promotion-query URL (JSONP) for ``stock_id``."""
        return 'http://pi.3.cn/promoinfo/get?id=%s&origin=1&callback=Promotions.set' % (
            stock_id)

    def generate_item(self, stock, category):
        """Build the stock item for one product.

        :param stock: tuple as returned by :meth:`extract_single_stock`.
        :param category: JD category key, translated via
            :meth:`get_category`.
        :return: a populated ``JDStockItem``; both prices start at 0.0.
        """
        item = JDStockItem()
        item['uid'] = int(stock[0])
        item['name'] = stock[1]
        item['url'] = stock[2]
        item['comments'] = int(stock[4])
        item['category'] = self.get_category(category)
        item['changed'] = 0
        item['last_update'] = datetime.now()
        item['last_price'] = 0.0
        item['last_mobile_price'] = 0.0
        return item

    def generate_price_item(self, price):
        """Build a price item from a ``(uid, price)`` pair.

        :return: ``JDStockPrice`` with the price rounded to two decimals.
        """
        item = JDStockPrice()
        item['uid'] = int(price[0])
        item['price'] = round(float(price[1]), 2)
        item['timestamp'] = datetime.now()
        return item

    def generate_mobile_price_item(self, uid, price):
        """Build a mobile price item.

        :param uid: stock id (anything ``int()`` accepts).
        :param price: price text; when it is not a number the item stores
            ``-1.00`` instead.
        :return: a populated ``JDStockMobilePrice``.
        """
        item = JDStockMobilePrice()
        item['uid'] = int(uid)
        try:
            item['mobile_price'] = round(float(price), 2)
        except ValueError:
            # ValueError also covers UnicodeEncodeError (its subclass), so
            # every non-numeric text is handled, not only non-ASCII text.
            # The message is kept unchanged for existing log consumers.
            self.log(
                "UnicodeEncodeError-->SKU:%s mobile price %s can't decode" %
                (uid, price))
            item['mobile_price'] = -1.00
        item['timestamp'] = datetime.now()
        return item

    def generate_img_item(self, image):
        """Build an image item from a tuple (``[0]`` uid, ``[3]`` data)."""
        item = JDStockImage()
        item['uid'] = int(image[0])
        item['data'] = image[3]
        return item

    def is_stock_img_exist(self, uid):
        """Return True if ``<JD_IMAGE_PATH>/<uid>.jpg`` already exists."""
        return os.path.exists(
            os.path.join(settings['JD_IMAGE_PATH'], '%s.jpg' % (uid)))

    def parse(self, response):
        """Dispatch a response to the handler selected by its meta marker.

        :param response: response whose ``meta`` carries one of the markers
            ``stock_price``, ``stock_mobile_price``, ``stock_img``,
            ``stock_promotion`` or ``stock_page``.
        :return: generator of items and follow-up requests; empty when no
            marker is present.
        """
        meta = response.meta
        if 'stock_price' in meta:
            handler = self._parse_price
        elif 'stock_mobile_price' in meta:
            handler = self._parse_mobile_price
        elif 'stock_img' in meta:
            handler = self._parse_image
        elif 'stock_promotion' in meta:
            handler = self._parse_promotion
        elif 'stock_page' in meta:
            handler = self._parse_stock_page
        else:
            return

        for result in handler(response):
            yield result

    def _parse_price(self, response):
        """Yield the price item, then the mobile price request."""
        price_obj = JSONDecoder().decode(response.body)
        # The reply carries the id as "J_<id>"; drop the "J_" prefix.
        stock_id_str = price_obj[0]['id'][2:]
        yield self.generate_price_item((stock_id_str, price_obj[0]['p']))
        yield Request(
            url=self.generate_mobile_price_query_url(stock_id_str),
            meta={
                'stock_mobile_price': 1,
                'stock_id': stock_id_str
            },
            priority=PRIORITY_MOBILE_PRICE)

    def _parse_mobile_price(self, response):
        """Yield the mobile price item, then the promotion request."""
        stock_id = response.meta['stock_id']
        price_texts = response.xpath(
            '//span[@class="p-price"]/text()').extract()
        # The first character of the price text is dropped.  A missing
        # price node becomes '' so the item is stored with the -1.00
        # sentinel instead of aborting the chain with an IndexError.
        m_price = price_texts[0][1:] if price_texts else ''

        yield self.generate_mobile_price_item(stock_id, m_price)
        yield Request(url=self.generate_promotion_query_url(stock_id),
                      meta={
                          'stock_promotion': 1,
                          'stock_id': stock_id
                      },
                      priority=PRIORITY_PROMOTION_PRICE)

    def _parse_image(self, response):
        """Yield the image item holding the raw response body."""
        item = JDStockImage()
        item['uid'] = int(response.meta['stock_id'])
        item['data'] = response.body
        yield item

    def _parse_promotion(self, response):
        """Yield the promotion list decoded from the JSONP reply."""
        itemList = JDStockPromotionList()
        itemList['uid'] = int(response.meta['stock_id'])
        itemList['promotionList'] = []

        m = re.match(r'Promotions\.set\((.*)\);', response.body)
        content = m.group(1) if m else ""
        if content:
            promotion_obj = JSONDecoder().decode(content)
            for promotion in promotion_obj['promotionInfoList'] or []:
                # promoType values seen so far:
                #   1   coupon back above a spending threshold
                #       (e.g. spend 10,000 yuan, get a 500 yuan JD E-card)
                #   4   comes with a free gift
                #   10  discount above a spending threshold
                # Rebates are always stored as type 1 for now.
                if promotion.get('rebate'):
                    item = JDStockPromotion()
                    item['type'] = 1
                    item['rebate'] = promotion['rebate']
                    itemList['promotionList'].append(item)
                # 'needMondey' (sic) is the key as it is read from the reply.
                if promotion.get('needMondey') and promotion.get('reward'):
                    item = JDStockPromotion()
                    item['type'] = promotion['promoType']
                    item['needMoney'] = float(promotion['needMondey'])
                    item['reward'] = float(promotion['reward'])
                    itemList['promotionList'].append(item)
        yield itemList

    def _parse_stock_page(self, response):
        """Yield items/requests for each product, then the next page."""
        category = response.meta['category']
        for stock in response.xpath('//li[@index]'):
            stock_tab_items = stock.xpath(
                './/div[contains(@class, "tab-content-item")]')
            # An entry is processed through its "tab-content-item" children
            # when it has any, otherwise the entry itself is the product.
            for node in stock_tab_items or [stock]:
                for result in self._requests_for_stock(node, category):
                    yield result

        next_page_nodes = response.xpath('//a[@class="pn-next"]')
        if next_page_nodes:
            next_page = next_page_nodes[0].xpath('@href').extract()[0]
            next_page_url = "http://list.jd.com%s" % (next_page)
            yield Request(url=next_page_url,
                          priority=PRIORITY_PAGE,
                          meta={
                              'stock_page': 1,
                              'category': category
                          })

    def _requests_for_stock(self, node, category):
        """Yield the item, optional image request and price request."""
        stock = self.extract_single_stock(node)
        yield self.generate_item(stock, category)
        if not self.is_stock_img_exist(stock[0]):
            yield Request(url=stock[3],
                          meta={
                              'stock_img': 1,
                              'stock_id': stock[0]
                          })
        yield Request(url=self.generate_price_query_url(stock[0]),
                      priority=PRIORITY_PRICE,
                      meta={'stock_price': 1})
Exemple #4
0
class JDSpider(scrapy.Spider):
    """Scrapy spider collecting stock, price and promotion data from JD.

    A listing page produces, for each product, a stock item and a price
    request; the price response schedules the mobile price request, which in
    turn schedules the promotion request.  An image request is added when
    the image file is not present locally.  :meth:`parse` tells the response
    kinds apart through a marker key in ``response.meta`` (``stock_page``,
    ``stock_price``, ``stock_mobile_price``, ``stock_img``,
    ``stock_promotion``).
    """

    name = "jd_web"
    start_urls = []

    def __init__(self, category=None, *args, **kwargs):
        """Initialise the spider and collect the root listing URLs.

        :param category: optional category argument; logged at INFO level and
            kept on ``self.category``.
        :param args: extra positional arguments passed to the base class.
        :param kwargs: extra keyword arguments passed to the base class.
        """
        super(JDSpider, self).__init__(*args, **kwargs)

        self.log(category, INFO)
        # Rebind ``start_urls`` on the instance instead of appending in
        # place: the class-level list is shared, so appending would
        # duplicate the URLs each time a spider is instantiated.
        generated_urls = list(self.generate_root_url_by_configuration())
        self.start_urls = list(self.start_urls) + generated_urls
        self.category = category

        self.job = Job()

    def generate_root_url_by_configuration(self):
        """Yield the page-1 listing URL for each category configured for JD.

        Reads the ``category`` collection of the MongoDB database named by
        the ``MONGODB_*`` settings and keeps the documents with a
        ``provider`` entry whose name is ``"jd"``.

        Side effect: ``self.category_mapping`` is reset and then filled, as
        the generator is consumed, with ``provider.param1 -> int(value)``.

        :return: generator of listing URLs (strings).
        """
        self.client = MongoClient(settings['MONGODB_SERVER'],
                                  settings['MONGODB_PORT'])
        self.db = self.client[settings['MONGODB_DB']]
        self.collection = self.db['category']

        items = self.collection.aggregate([
            {"$unwind": "$provider"},
            {"$match": {"provider.name": "jd"}},
            {"$group": {
                "_id": "$_id",
                "name": {'$first': '$name'},
                "value": {'$first': '$value'},
                "param1": {'$first': '$provider.param1'},
            }},
        ])
        # PyMongo 2.x hands back the command reply ({'result': [...]}),
        # PyMongo 3+ a cursor; support both.
        if isinstance(items, dict):
            items = items['result']

        self.category_mapping = {}
        for item in items:
            self.category_mapping[item['param1']] = int(item['value'])
            yield "http://list.jd.com/list.html?cat=%s&page=1&&delivery=1&JL=6_0_0" % (item["param1"])

    def get_category(self, provider_category):
        """Convert a JD category into the standard category.

        :param provider_category: JD category key (the ``cat`` URL value).
        :return: the value of the standard category.
        :raises KeyError: if the JD category is unknown.
        """
        return self.category_mapping[provider_category]

    def make_requests_from_url(self, url):
        """Create the request for a start URL.

        Calls ``self.job.log_start()``, then returns a non-filtered request
        tagged with ``stock_page`` and the category read from the ``cat``
        query parameter of ``url``.

        :param url: listing URL containing ``cat=<category>&``.
        :return: the :class:`Request` for ``url``.
        """
        self.job.log_start()
        m = re.search(r'cat=(.*?)&', url)
        request_meta = {'stock_page': 1, 'category': m.group(1)}
        return Request(url, dont_filter=True, meta=request_meta)

    def extract_single_stock(self, node):
        """Read the raw product fields out of a listing node.

        :param node: selector of one product entry.
        :return: tuple ``(id, name, url, img, comments)`` of extracted
            strings.
        :raises IndexError: if an expected sub-node is absent.
        """
        url = node.xpath('.//div[@class="p-name"]/a/@href').extract()[0]
        name = node.xpath('.//div[@class="p-name"]/a/@title').extract()[0]
        img = node.xpath('.//div[@class="p-img"]/a/img/@data-lazy-img').extract()[0]
        comments = node.xpath('.//div[@class="p-commit"]//a/text()').extract()[0]
        # The id is what sits between the last "/" and the last ".html".
        id_start = url.rfind("/") + 1
        id_end = url.rfind(".html")
        stock_id = url[id_start:id_end]

        return (stock_id, name, url, img, comments)

    def generate_price_query_url(self, stock_id):
        """Return the price-query URL; the id is sent as ``J_<stock_id>``."""
        return 'http://p.3.cn/prices/get?skuid=J_%s' % (stock_id)

    def generate_mobile_price_query_url(self, stock_id):
        """Return the URL of the mobile product page of ``stock_id``."""
        return 'http://item.m.jd.com/product/%s.html' % (stock_id)

    def generate_promotion_query_url(self, stock_id):
        """Return the JSONP promotion-query URL of ``stock_id``."""
        return 'http://pi.3.cn/promoinfo/get?id=%s&origin=1&callback=Promotions.set' % (stock_id)

    def generate_item(self, stock, category):
        """Create the stock item of one product.

        :param stock: tuple produced by :meth:`extract_single_stock`.
        :param category: JD category key, mapped through
            :meth:`get_category`.
        :return: ``JDStockItem`` with both last prices initialised to 0.0.
        """
        item = JDStockItem()
        item['uid'] = int(stock[0])
        item['name'] = stock[1]
        item['url'] = stock[2]
        item['comments'] = int(stock[4])
        item['category'] = self.get_category(category)
        item['changed'] = 0
        item['last_update'] = datetime.now()
        item['last_price'] = 0.0
        item['last_mobile_price'] = 0.0
        return item

    def generate_price_item(self, price):
        """Create a ``JDStockPrice`` from a ``(uid, price)`` pair.

        The price is converted to float and rounded to two decimals.
        """
        item = JDStockPrice()
        item['uid'] = int(price[0])
        item['price'] = round(float(price[1]), 2)
        item['timestamp'] = datetime.now()
        return item

    def generate_mobile_price_item(self, uid, price):
        """Create a ``JDStockMobilePrice``.

        :param uid: stock id (anything ``int()`` accepts).
        :param price: price text; ``-1.00`` is stored when it cannot be
            converted to a number.
        :return: the populated item.
        """
        item = JDStockMobilePrice()
        item['uid'] = int(uid)
        try:
            item['mobile_price'] = round(float(price), 2)
        except ValueError:
            # UnicodeEncodeError is a ValueError subclass; catching the base
            # class also handles plain non-numeric text.  The log text is
            # left as it was so existing log searches keep working.
            self.log("UnicodeEncodeError-->SKU:%s mobile price %s can't decode" % (uid, price))
            item['mobile_price'] = -1.00
        item['timestamp'] = datetime.now()
        return item

    def generate_img_item(self, image):
        """Create a ``JDStockImage`` from a tuple (``[0]`` uid, ``[3]`` data)."""
        item = JDStockImage()
        item['uid'] = int(image[0])
        item['data'] = image[3]
        return item

    def is_stock_img_exist(self, uid):
        """Tell whether ``<JD_IMAGE_PATH>/<uid>.jpg`` is already on disk."""
        image_path = os.path.join(settings['JD_IMAGE_PATH'], '%s.jpg' % (uid))
        return os.path.exists(image_path)

    def parse(self, response):
        """Route a response to the handler matching its meta marker.

        :param response: response whose ``meta`` holds one of
            ``stock_price``, ``stock_mobile_price``, ``stock_img``,
            ``stock_promotion`` or ``stock_page``.
        :return: generator of items and follow-up requests; nothing is
            produced when no marker is found.
        """
        # Checked in this order; the first marker found wins.
        handlers = (
            ('stock_price', self._parse_price),
            ('stock_mobile_price', self._parse_mobile_price),
            ('stock_img', self._parse_image),
            ('stock_promotion', self._parse_promotion),
            ('stock_page', self._parse_stock_page),
        )
        for marker, handler in handlers:
            if marker in response.meta:
                for result in handler(response):
                    yield result
                return

    def _parse_price(self, response):
        """Yield the price item followed by the mobile price request."""
        price_obj = JSONDecoder().decode(response.body)
        # The id comes back as "J_<id>"; strip the two-character prefix.
        stock_id_str = price_obj[0]['id'][2:]
        yield self.generate_price_item((stock_id_str, price_obj[0]['p']))
        yield Request(url=self.generate_mobile_price_query_url(stock_id_str),
                      meta={'stock_mobile_price': 1, 'stock_id': stock_id_str},
                      priority=PRIORITY_MOBILE_PRICE)

    def _parse_mobile_price(self, response):
        """Yield the mobile price item followed by the promotion request."""
        stock_id = response.meta['stock_id']
        price_texts = response.xpath('//span[@class="p-price"]/text()').extract()
        # The first character of the text is skipped.  With no price node
        # the text is '' and the item gets the -1.00 sentinel, rather than
        # the whole chain failing on an IndexError.
        m_price = price_texts[0][1:] if price_texts else ''

        yield self.generate_mobile_price_item(stock_id, m_price)
        yield Request(url=self.generate_promotion_query_url(stock_id),
                      meta={'stock_promotion': 1, 'stock_id': stock_id},
                      priority=PRIORITY_PROMOTION_PRICE)

    def _parse_image(self, response):
        """Yield the image item carrying the raw response body."""
        item = JDStockImage()
        item['uid'] = int(response.meta['stock_id'])
        item['data'] = response.body
        yield item

    def _parse_promotion(self, response):
        """Yield the promotion list parsed from the JSONP response."""
        itemList = JDStockPromotionList()
        itemList['uid'] = int(response.meta['stock_id'])
        itemList['promotionList'] = []

        m = re.match(r'Promotions\.set\((.*)\);', response.body)
        if m and m.group(1) != "":
            promotion_obj = JSONDecoder().decode(m.group(1))
            for promotion in promotion_obj['promotionInfoList'] or []:
                # Known promoType values:
                #   1   coupon back above a spending threshold
                #       (e.g. spend 10,000 yuan, get a 500 yuan JD E-card)
                #   4   comes with a free gift
                #   10  discount above a spending threshold
                # A rebate is recorded as type 1 for now.
                if promotion.get('rebate'):
                    item = JDStockPromotion()
                    item['type'] = 1
                    item['rebate'] = promotion['rebate']
                    itemList['promotionList'].append(item)
                # 'needMondey' (sic) is the key as read from the response.
                if promotion.get('needMondey') and promotion.get('reward'):
                    item = JDStockPromotion()
                    item['type'] = promotion['promoType']
                    item['needMoney'] = float(promotion['needMondey'])
                    item['reward'] = float(promotion['reward'])
                    itemList['promotionList'].append(item)
        yield itemList

    def _parse_stock_page(self, response):
        """Yield everything for the listed products, then the next page."""
        category = response.meta['category']
        for stock in response.xpath('//li[@index]'):
            stock_tab_items = stock.xpath('.//div[contains(@class, "tab-content-item")]')
            # Use the "tab-content-item" children when the entry has some;
            # otherwise the entry itself describes the product.
            for node in stock_tab_items or [stock]:
                for result in self._requests_for_stock(node, category):
                    yield result

        next_page_nodes = response.xpath('//a[@class="pn-next"]')
        if next_page_nodes:
            next_page = next_page_nodes[0].xpath('@href').extract()[0]
            next_page_url = "http://list.jd.com%s" % (next_page)
            yield Request(url=next_page_url, priority=PRIORITY_PAGE,
                          meta={'stock_page': 1, 'category': category})

    def _requests_for_stock(self, node, category):
        """Yield the stock item, the image request if needed, the price request."""
        stock = self.extract_single_stock(node)
        yield self.generate_item(stock, category)
        if not self.is_stock_img_exist(stock[0]):
            yield Request(url=stock[3], meta={'stock_img': 1, 'stock_id': stock[0]})
        yield Request(url=self.generate_price_query_url(stock[0]), priority=PRIORITY_PRICE,
                      meta={'stock_price': 1})