Ejemplo n.º 1
0
    def crawl_listing(self, url, ctx='', **kwargs):
        """Crawl one listing page, upsert its discounted products, then follow pagination.

        :param url: listing page URL to fetch
        :param ctx: sender handed to the ``common_failed`` / ``common_saved`` signals
        :param kwargs: ``key`` is read as the key of the ``Category`` being crawled;
            all kwargs are forwarded unchanged when recursing into the next page

        Errors raised by ``requests.get`` / ``raise_for_status`` are not caught here.
        """
        res = requests.get(url)
        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)

        category = Category.objects(key=kwargs.get('key')).first()
        if not category:
            common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
            return

        for product_node in tree.cssselect('div#searchResults a'):
            # Look the sale price up defensively: indexing [0] on an empty
            # result would raise IndexError and abort the whole page, while
            # the check below is meant to skip such nodes quietly.
            price_nodes = product_node.cssselect('.price-6pm')
            price = price_nodes[0].text if price_nodes else None
            listprice_nodes = product_node.cssselect('.discount')
            listprice = ''.join(listprice_nodes[0].xpath('text()')) if listprice_nodes else None

            # Eliminate products with no discount.
            if price is None or listprice is None:
                continue

            key = product_node.get('data-product-id')
            if not key:
                common_failed.send(sender=ctx, url=url, reason='listing product has no key')
                continue

            combine_url = product_node.get('href')
            if not combine_url:
                # Without a link neither the composite key nor the product URL can be built.
                common_failed.send(sender=ctx, url=url, reason='listing product %s has no href' % key)
                continue

            # The stored key is the product id plus the last path segment of the link.
            key = '%s_%s' % (key, combine_url.split('/')[-1])
            if not re.search(r'https?://.+', combine_url):
                combine_url = '%s%s' % (HOST, combine_url)

            brand = product_node.cssselect('.brandName')[0].text.strip()
            title = product_node.cssselect('.productName')[0].text.strip()

            is_new = False
            is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if title and title != product.title:
                product.title = title
                is_updated = True

            if brand and brand != product.brand:
                product.brand = brand
                is_updated = True

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            # Merge the category's departments into the product's.
            if category.cats and set(category.cats).difference(product.dept):
                product.dept = list(set(category.cats) | set(product.dept or []))
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            if is_updated:
                product.list_update_time = datetime.utcnow()

            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            # Products the picker rejects are not saved.
            selected = Picker(site='6pm').pick(product)
            if not selected:
                continue

            product.hit_time = datetime.utcnow()
            product.save()

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
                              is_new=is_new, is_updated=((not is_new) and is_updated))

            # Debug output, one value per line, followed by a blank line.
            print(product.key)
            print(product.brand)
            print(product.title)
            print('%s  /  %s' % (product.price, product.listprice))
            print(product.combine_url)
            print(product.dept)
            print('')

        # Go to the next page to keep on crawling.
        next_page = None
        page_node = tree.cssselect('div.pagination')
        if not page_node:
            return

        # A '.last' element inside the pagination block is taken as the sign
        # that more pages exist; the final anchor's href is then followed.
        last_node = page_node[0].cssselect('.last')
        if last_node:
            next_page = page_node[0].cssselect('a')[-1].get('href')

        if next_page:
            if not re.search(r'https?://.+', next_page):
                next_page = '%s%s' % (HOST, next_page)
            print(next_page)
            self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
Ejemplo n.º 2
0
    def _parse_product(self, event_id, asins, cAsins, prefix_url, product_data, ctx):
        """Create or refresh the ``Product`` keyed by this item's cAsin and return that cAsin.

        No video info, list_info or summary is collected here.

        :param event_id: id of the event this product belongs to
        :param asins: all asins info in this event
        :param cAsins: all casins info in this event (not read by this method)
        :param prefix_url: image and js prefix_url, probably 'http://z-ecx.images-amazon.com/images/I/'
        :param product_data: product data in this product
        :param ctx: sender handed to the ``common_saved`` signal
        :returns: the product's cAsin
        """
        asin = product_data['asin']
        casin = product_data['cAsin']
        title = product_data['title'].encode('utf-8')  # color is in title
        # 'display' is used; with isRange: True it is unclear what 'amount' would hold.
        listprice = product_data['listPrice']['display'] if 'listPrice' in product_data else ''
        price = product_data['ourPrice']['display']

        # Collect the distinct sizes in first-seen order; the mapping is {} when there is no size.
        sizes = []
        size_entries = product_data['teenagers']
        if size_entries:
            for _unused_key, entry in size_entries.iteritems():
                size = entry['size']
                if size not in sizes:
                    sizes.append(size)
        # tag (product_data['productGL']) is not precise, e.g. a bag is in shoes, so it is not read.

        # One soldout link contains all colors of this asin.
        soldout_link = 'http://www.myhabit.com/request/getBuyableAsinInfo?asin={0}&saleId={1}&flavor=parent&sid=177-4704555-7345351'.format(asin, event_id)
        ret = req.get(soldout_link)
        jsdata = json.loads(ret.content)
        buyable = jsdata['buyableAsin']
        key_list = sorted(buyable.keys())
        len_sizes = len(sizes)

        soldout = False
        if len_sizes == 0:
            # No sizes: only this cAsin's own entry decides.
            soldout = buyable[casin]['stats']['remaining']['claimed'] == 0
        else:
            # More than one size: drop the two bookkeeping keys, then starting
            # at this cAsin (in sorted key order) inspect up to len_sizes
            # consecutive entries. Sold out only if every inspected entry
            # reports claimed == 0; stop at the first one that does not.
            key_list = [k for k in key_list if k not in ('asin', 'privateSaleID')]
            count = 0
            for candidate in key_list:
                inside_window = 0 < count < len_sizes
                if candidate != casin and not inside_window:
                    continue
                count += 1
                soldout = buyable[candidate]['stats']['remaining']['claimed'] == 0
                if not soldout:
                    break

        if asin in asins:
            jslink = prefix_url + asins[asin]['url']
        else:
            jslink = ''
        combine_url = 'http://www.myhabit.com/homepage#page=d&sale={0}&asin={1}&cAsin={2}'.format(event_id, asin, casin)

        is_new = False
        is_updated = False
        product = Product.objects(key=casin).first()
        if product:
            # Only a transition to sold-out flips is_updated; the other field
            # changes are stored and timestamped in update_history.
            if soldout and product.soldout != soldout:
                product.soldout = True
                is_updated = True
                product.update_history.update({ 'soldout': datetime.utcnow() })
            tracked_fields = (
                ('title', title),
                ('combine_url', combine_url),
                ('listprice', listprice),
                ('price', price),
            )
            for field, fresh_value in tracked_fields:
                if getattr(product, field) != fresh_value:
                    setattr(product, field, fresh_value)
                    product.update_history.update({ field: datetime.utcnow() })
        else:
            is_new = True
            product = Product(key=casin)
            product.combine_url = combine_url
            product.asin = asin
            product.title = title
            product.listprice = listprice
            product.price = price
            product.sizes = sizes
            product.soldout = soldout
            product.updated = False

        if event_id not in product.event_id:
            product.event_id.append(event_id)
        product.jslink = jslink
        product.list_update_time = datetime.utcnow()
        product.save()
        common_saved.send(sender=ctx, obj_type='Product', key=casin, url=product.combine_url,
                          is_new=is_new, is_updated=is_updated)
        return casin
Ejemplo n.º 3
0
    def crawl_listing(self, url, ctx='', **kwargs):
        """Crawl one listing page sorted by sale, upsert discounted products, then follow pagination.

        Pages without a ``div.fashion-results`` block, or without product items
        inside it, are handed to ``crawl_listing_of_no_leaf`` (brand index pages
        and blog URLs are skipped outright).

        :param url: listing page URL to fetch; ``sort=sale`` is added as a query parameter
        :param ctx: sender handed to the ``common_failed`` / ``common_saved`` signals
        :param kwargs: ``key`` is read as the key of the ``Category`` being crawled;
            all kwargs are forwarded to ``crawl_listing_of_no_leaf`` and to the
            recursive call for the next page

        A ``ConnectionError`` from the request is swallowed (the page is skipped);
        errors from ``raise_for_status`` are not caught here.
        """
        if url.startswith('http://blogs.nordstrom.com'):
            return
        try:
            res = requests.get(url, params={'sort': 'sale'})
        except requests.exceptions.ConnectionError:
            return

        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)
        listing_node = tree.cssselect('div.fashion-results')

        if listing_node:
            listing_node = listing_node[0]
        else:
            if tree.cssselect('div#brandsIndex'):
                return

            self.crawl_listing_of_no_leaf(tree, ctx=ctx, **kwargs)
            return

        product_nodes = listing_node.cssselect('div.row div.fashion-item')
        if not product_nodes:
            self.crawl_listing_of_no_leaf(tree, ctx=ctx, **kwargs)
            return

        category = Category.objects(key=kwargs.get('key')).first()
        if not category:
            # Without this guard the loop below fails with AttributeError on
            # category.cats / category.key for the first discounted product.
            common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
            return

        # Sometimes no-discount products occur between the discounted ones ordered by sale.
        # NOTE(review): this counter is never reset, so it counts no-discount
        # products over the whole page rather than consecutive ones - confirm intent.
        no_discount_num = 0
        for product_node in product_nodes:
            key = product_node.get('id')
            if not key:
                common_failed.send(sender=ctx, url=url, reason='listing product has no id')
                continue

            try:
                info_node = product_node.cssselect('div.info')[0]
                a_node = info_node.cssselect('a')[0]
                title = a_node.text.strip()

                price = None
                listprice = None
                for price_node in info_node.cssselect(".price"):
                    if 'regular' in price_node.get('class'):
                        listprice = price_node.text
                    elif 'sale' in price_node.get('class'):
                        price = price_node.text

                if price is None or listprice is None:
                    no_discount_num += 1
                    if no_discount_num < 3:
                        continue
                    # Third product without a discount: stop crawling this
                    # listing entirely (pagination is not followed either).
                    return

                combine_url = a_node.get('href')
                if not combine_url:
                    common_failed.send(sender=ctx, url=url, reason='listing product %s.%s cannot crawl combine_url' % (key, title))
                    continue

                if not re.search(r'https?://.+', combine_url):
                    combine_url = 'http://shop.nordstrom.com%s' % (combine_url)

            except IndexError:
                # A missing div.info or anchor: report this product and move on.
                trace = traceback.format_exc()
                print(trace)
                common_failed.send(sender=ctx, url=url, reason='listing product %s -> %s' % (key, trace))
                continue

            is_new = False
            is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True

            if title and title != product.title:
                product.title = title
                is_updated = True

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            # Merge the category's departments into the product's.
            if category.cats and set(category.cats).difference(product.dept):
                product.dept = list(set(category.cats) | set(product.dept or []))
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            if is_updated:
                product.list_update_time = datetime.utcnow()

            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            # Products the picker rejects are not saved.
            selected = Picker(site='nordstrom').pick(product)
            if not selected:
                continue

            product.hit_time = datetime.utcnow()
            product.save()

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
                              is_new=is_new, is_updated=((not is_new) and is_updated))

        # Go to the next page to keep on crawling.
        try:
            arrow_node = tree.cssselect('div.fashion-results-header div.fashion-results-pager ul.arrows li.next')[0]
        except IndexError:
            common_failed.send(sender=ctx, url=url, reason=traceback.format_exc())
            return

        # A 'disabled' class on the next arrow means this was the last page.
        if 'disabled' not in arrow_node.get('class'):
            next_page = arrow_node.cssselect('a')[0].get('href')
        else:
            next_page = None

        if next_page:
            print(next_page)
            self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
Ejemplo n.º 4
0
    def crawl_listing(self, url, ctx='', **kwargs):
        """Crawl one listing page and upsert the products the picker / detail crawl accepts.

        :param url: listing page URL to fetch
        :param ctx: sender handed to the ``common_failed`` / ``common_saved`` signals
        :param kwargs: ``category`` is used directly when given and truthy;
            otherwise ``key`` is looked up as a ``Category`` key

        Errors raised by ``requests.get`` / ``raise_for_status`` are not caught here.
        """
        res = requests.get(url)
        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)

        category = kwargs.get('category') or Category.objects(key=kwargs.get('key')).first()
        if not category:
            common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
            return

        for product_node in tree.cssselect('div#atg_store_prodList ul li'):
            info_node = product_node.cssselect('div.thumbnailInfo')[0]

            # Price preference: weekly price, then sale price, then whatever
            # text the price container holds.
            price_node = info_node.cssselect('div.our_price')[0]
            weekly_price_node = price_node.cssselect('.newPrice_value')
            sale_price_node = price_node.cssselect('#salePrice')
            if weekly_price_node:
                price = weekly_price_node[0].text.strip()
            elif sale_price_node:
                price = sale_price_node[0].text.strip()
            else:
                price = ''.join(price_node.xpath('.//text()')).strip()
            listprice = info_node.cssselect('div.retail_price')[0].text.strip()
            listprice = re.sub('\n', '', listprice)

            # Eliminate products with no discount.
            # NOTE(review): both values are always strings at this point, so
            # this check cannot fire as written; kept to preserve behavior.
            if price is None or listprice is None:
                continue

            key = info_node.cssselect('div.product_id')[0].text.strip()
            brand = info_node.cssselect('a.sameBrandProduct')[0].text.strip()
            title_node = info_node.cssselect('a.product_gender_name')[0]
            combine_url = title_node.get('href')
            if not re.search(r'https?://.+', combine_url):
                combine_url = '%s%s' % (HOST, combine_url)

            is_new = False
            is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            if brand and brand != product.brand:
                product.brand = brand
                is_updated = True

            if combine_url and combine_url != product.combine_url:
                product.combine_url = combine_url
                is_updated = True

            if price and price != product.price:
                product.price = price
                is_updated = True

            if listprice and listprice != product.listprice:
                product.listprice = listprice
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            # Products whose `updated` flag is set go through the picker; the
            # others go through crawl_detail, whose return value is used as
            # the selection result.
            try:
                if product.updated:
                    selected = Picker(site='ashford').pick(product)
                else:
                    selected = self.crawl_detail(ctx, is_new, is_updated, product)
            except Exception:
                # Report and skip this product, but let KeyboardInterrupt /
                # SystemExit propagate instead of being swallowed.
                common_failed.send(sender=ctx, url=product.combine_url, reason=traceback.format_exc())
                continue
            if not selected:
                continue

            if is_updated:
                product.list_update_time = datetime.utcnow()

            product.hit_time = datetime.utcnow()
            product.save()

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
                              is_new=is_new, is_updated=((not is_new) and is_updated),
                              ready=getattr(product, 'ready', False))
Ejemplo n.º 5
0
    def crawl_listing(self, url, ctx='', **kwargs):
        """Crawl one sale-flagged listing page, upsert discounted products, then follow the 'next' link.

        :param url: listing page URL to fetch; ``Ns=P_sale_flag|1`` is added as a query parameter
        :param ctx: sender handed to the ``common_failed`` / ``common_saved`` signals
        :param kwargs: ``key`` is read as the key of the ``Category`` being crawled;
            all kwargs are forwarded unchanged when recursing into the next page

        Errors raised by ``requests.get`` / ``raise_for_status`` are not caught here.
        """
        res = requests.get(url, params={'Ns': 'P_sale_flag|1'})
        res.raise_for_status()
        tree = lxml.html.fromstring(res.content)

        category = Category.objects(key=kwargs.get('key')).first()
        if not category:
            print('Category does not exist')
            common_failed.send(sender=ctx, url=url, reason='Category does not exist -> {0} .'.format(kwargs))
            return

        # Sometimes no-discount products occur between the discounted ones
        # ordered by sale, so up to two in a row are tolerated.
        no_discount_num = 0

        for product_node in tree.cssselect('div#product-container div'):
            # Only divs whose id mentions "product" are product tiles.
            key = product_node.get('id')
            if not key or 'product' not in key.lower():
                continue

            info_node = product_node.cssselect('div.product-text a')[0]
            listprice_node = info_node.cssselect('span.product-price')
            price_node = info_node.cssselect('span.product-sale-price')
            listprice = ''.join(listprice_node[0].xpath('.//text()')).strip() if listprice_node else None
            price = ''.join(price_node[0].xpath('.//text()')).strip() if price_node else None

            if price is None or listprice is None:
                no_discount_num += 1
                if no_discount_num >= 3:
                    # Third consecutive product without a discount: stop
                    # crawling this listing (pagination is not followed).
                    return
                continue
            no_discount_num = 0

            brand = info_node.cssselect('p span.product-designer-name')[0].text
            if brand:
                brand = brand.strip()
            title = info_node.cssselect('p.product-description')[0].text.strip()
            combine_url = info_node.get('href')

            is_new = False
            is_updated = False
            product = Product.objects(key=key).first()
            if not product:
                is_new = True
                product = Product(key=key)
                product.updated = False
                product.event_type = False

            # (field, scraped value, whether a change is timestamped in update_history)
            scraped_fields = (
                ('title', title, True),
                ('brand', brand, False),
                ('combine_url', combine_url, True),
                ('price', price, False),
                ('listprice', listprice, False),
            )
            for field, scraped, timestamped in scraped_fields:
                if scraped and scraped != getattr(product, field):
                    setattr(product, field, scraped)
                    is_updated = True
                    if timestamped:
                        product.update_history[field] = datetime.utcnow()

            # Merge the category's departments into the product's.
            if category.cats and set(category.cats).difference(product.dept):
                product.dept = list(set(category.cats) | set(product.dept or []))
                is_updated = True

            if category.key not in product.category_key:
                product.category_key.append(category.key)
                is_updated = True

            if is_updated:
                product.list_update_time = datetime.utcnow()

            # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
            # Products the picker rejects are not saved.
            if not Picker(site='saksfifthavenue').pick(product):
                continue

            product.hit_time = datetime.utcnow()
            product.save()

            common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
                              is_new=is_new, is_updated=((not is_new) and is_updated))

        # Go to the next page to keep on crawling.
        next_page = None
        for page_node in tree.cssselect('div.pagination-container ol.pa-page-number li a'):
            if page_node.get('class') != 'next':
                continue
            href = page_node.get('href')
            if re.search(r'https?://.+', href):
                next_page = href
            else:
                next_page = '{0}/{1}'.format(HOST, href)
            break

        if next_page:
            print(next_page)
            self.crawl_listing(url=next_page, ctx=ctx, **kwargs)