def crawl_listing(self, url, ctx='', **kwargs): res = requests.get(url) res.raise_for_status() tree = lxml.html.fromstring(res.content) category = Category.objects(key=kwargs.get('key')).first() if not category: common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key')) return product_nodes = tree.cssselect('div#searchResults a') for product_node in product_nodes: price = None; listprice = None price = product_node.cssselect('.price-6pm')[0].text listprice_node = product_node.cssselect('.discount') listprice = ''.join(listprice_node[0].xpath('text()')) if listprice_node else None # eliminate products of no discountIndexError: if price is None or listprice is None: # common_failed.send(sender=ctx, url=url, \ # reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice)) continue key = product_node.get('data-product-id') if not key: common_failed.send(sender=ctx, url=url, reason='listing product has no key') continue combine_url = product_node.get('href') key = '%s_%s' % (key, combine_url.split('/')[-1]) match = re.search(r'https?://.+', combine_url) if not match: combine_url = '%s%s' % (HOST, combine_url) brand = product_node.cssselect('.brandName')[0].text.strip() title = product_node.cssselect('.productName')[0].text.strip() is_new = False; is_updated = False product = Product.objects(key=key).first() if not product: is_new = True product = Product(key=key) product.updated = False product.event_type = False if title and title != product.title: product.title = title is_updated = True if brand and brand != product.brand: product.brand = brand is_updated = True if combine_url and combine_url != product.combine_url: product.combine_url = combine_url is_updated = True if price and price != product.price: product.price = price is_updated = True if listprice and listprice != product.listprice: product.listprice = listprice is_updated = True if category.cats and 
set(category.cats).difference(product.dept): product.dept = list(set(category.cats) | set(product.dept or [])) is_updated = True if category.key not in product.category_key: product.category_key.append(category.key) is_updated = True if is_updated: product.list_update_time = datetime.utcnow() # To pick the product which fit our needs, such as a certain discount, brand, dept etc. selected = Picker(site='6pm').pick(product) if not selected: continue product.hit_time = datetime.utcnow() product.save() common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \ is_new=is_new, is_updated=((not is_new) and is_updated) ) print product.key; print product.brand; print product.title; \ print product.price, ' / ', product.listprice; print product.combine_url; \ print product.dept; print # Go to the next page to keep on crawling. next_page = None page_node = tree.cssselect('div.pagination') if not page_node: return last_node =page_node[0].cssselect('.last') if last_node: next_page = page_node[0].cssselect('a')[-1].get('href') if next_page: match = re.search(r'https?://.+', next_page) if not match: next_page = '%s%s' % (HOST, next_page) print next_page self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
def _parse_product(self, event_id, asins, cAsins, prefix_url, product_data, ctx):
    """ no video info, list_info, summary

    Parses one myhabit product out of an event's JSON, probes its sold-out
    state via the getBuyableAsinInfo endpoint, then creates/updates the
    ``Product`` keyed by cAsin and emits ``common_saved``.

    :param event_id: this product belongs to the event's id
    :param asins: all asins info in this event
    :param cAsins: all casins info in this event
    :param prefix_url: image and js prefix_url, probably 'http://z-ecx.images-amazon.com/images/I/'
    :param product_data: product data in this product
    :param ctx: crawler context string forwarded to the signal.
    :returns: the product's cAsin key.
    """
    asin = product_data['asin']
    casin = product_data['cAsin']
    title = product_data['title'].encode('utf-8')  # color is in title
    # image_urls = [product_data['image']] + product_data['altImages'] # one picture, altImages is []
    if 'listPrice' in product_data:
        listprice = product_data['listPrice']['display']  # or 'amount', if isRange: True, don't know what 'amount' will be
    else:
        listprice = ''
    price = product_data['ourPrice']['display']
    sizes = []
    if product_data['teenagers']:  # no size it is {}
        # Collect the distinct size labels across all 'teenagers' entries.
        for k, v in product_data['teenagers'].iteritems():
            if v['size'] not in sizes:
                sizes.append(v['size'])
    # tag is not precision. e.g. a bag is in shoes
    # tag = product_data['productGL'] if 'productGL' in product_data else '' # 'apparel', 'home', 'jewelry', ''
    soldout_link = 'http://www.myhabit.com/request/getBuyableAsinInfo?asin={0}&saleId={1}&flavor=parent&sid=177-4704555-7345351'.format(asin, event_id)
    # one soldout link contains this asin's all color.
    ret = req.get(soldout_link)
    jsdata = json.loads(ret.content)
    key_list = sorted(jsdata['buyableAsin'].keys())
    len_sizes = len(sizes)
    soldout = False
    if len_sizes == 0:
        # Single-variant product: check the claimed counter of casin itself.
        if jsdata['buyableAsin'][casin]['stats']['remaining']['claimed'] == 0:
            soldout = True
        else:
            soldout = False
    else:  # more than one size.
        # Drop the non-variant bookkeeping keys before scanning sizes.
        if 'asin' in key_list:
            key_list.remove('asin')
        if 'privateSaleID' in key_list:
            key_list.remove('privateSaleID')
        # Walk a window of len_sizes keys starting at casin (key_list is
        # sorted, so the size variants are presumed adjacent — TODO confirm).
        # Sold out only if every scanned size has claimed == 0; one size in
        # stock short-circuits to soldout = False.
        count = 0
        for l in key_list:
            if l == casin or (count > 0 and count < len_sizes):
                count += 1
                if jsdata['buyableAsin'][l]['stats']['remaining']['claimed'] == 0:
                    soldout = True
                else:
                    soldout = False
                    break
    # if casin in cAsins and 'soldOut' in cAsins[casin] and cAsins[casin]['soldOut'] == 1:
    #     soldout = True
    # else: soldout = False
    jslink = prefix_url + asins[asin]['url'] if asin in asins else ''
    combine_url = 'http://www.myhabit.com/homepage#page=d&sale={0}&asin={1}&cAsin={2}'.format(event_id, asin, casin)
    is_new, is_updated = False, False
    product = Product.objects(key=casin).first()
    if not product:
        is_new = True
        product = Product(key=casin)
        product.combine_url = combine_url
        product.asin = asin
        product.title = title
        # product.image_urls = image_urls
        product.listprice = listprice
        product.price = price
        product.sizes = sizes
        product.soldout = soldout
        product.updated = False
    else:
        # NOTE(review): only the soldout transition flips is_updated; the
        # title/combine_url/listprice/price branches below record history but
        # leave is_updated False — confirm whether that is intentional.
        if soldout and product.soldout != soldout:
            product.soldout = True
            is_updated = True
            product.update_history.update({ 'soldout': datetime.utcnow() })
        if product.title != title:
            product.title = title
            product.update_history.update({ 'title': datetime.utcnow() })
        if product.combine_url != combine_url:
            product.combine_url = combine_url
            product.update_history.update({ 'combine_url': datetime.utcnow() })
        if product.listprice != listprice:
            product.listprice = listprice
            product.update_history.update({ 'listprice': datetime.utcnow() })
        if product.price != price:
            product.price = price
            product.update_history.update({ 'price': datetime.utcnow() })
    if event_id not in product.event_id:
        product.event_id.append(event_id)
    product.jslink = jslink
    product.list_update_time = datetime.utcnow()
    product.save()
    common_saved.send(sender=ctx, obj_type='Product', key=casin, url=product.combine_url, is_new=is_new, is_updated=is_updated)
    return casin
def crawl_listing(self, url, ctx='', **kwargs):
    """Crawl one Nordstrom listing page (sorted by sale) and upsert products.

    Falls back to ``crawl_listing_of_no_leaf`` for non-leaf/brand-index
    pages, stops early once consecutive non-discounted products show up,
    and recurses into the next pagination page when available.

    :param url: listing page URL to crawl.
    :param ctx: crawler context string forwarded to the signals.
    :param kwargs: must carry ``key`` — the ``Category`` key for this listing.
    """
    # Blog links are not product listings — nothing to crawl.
    if url.startswith('http://blogs.nordstrom.com'):
        return
    try:
        res = requests.get(url, params={'sort': 'sale'})
    except requests.exceptions.ConnectionError:
        # Best-effort: drop this page on network failure.
        return
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)
    listing_node = tree.cssselect('div.fashion-results')
    if listing_node:
        listing_node = listing_node[0]
    else:
        # Brand-index pages have no results container; delegate to the
        # non-leaf crawler instead.
        if tree.cssselect('div#brandsIndex'):
            return self.crawl_listing_of_no_leaf(tree, ctx=ctx, **kwargs)
        return
    product_nodes = listing_node.cssselect('div.row div.fashion-item')
    if not product_nodes:
        self.crawl_listing_of_no_leaf(tree, ctx=ctx, **kwargs)
        return
    category = Category.objects(key=kwargs.get('key')).first()
    no_discount_num = 0  # sometimes no discount product occurs between the discount ones ordered by sale.
    for product_node in product_nodes:
        key = product_node.get('id')
        if not key:
            common_failed.send(sender=ctx, url=url, reason='listing product has no id')
            continue
        try:
            info_node = product_node.cssselect('div.info')[0]
            a_node = info_node.cssselect('a')[0]
            title = a_node.text.strip()
            price = None; listprice = None
            # 'regular' price node -> listprice, 'sale' price node -> price.
            price_nodes = info_node.cssselect(".price")
            for price_node in price_nodes:
                if 'regular' in price_node.get('class'):
                    listprice = price_node.text
                elif 'sale' in price_node.get('class'):
                    price = price_node.text
            if price is None or listprice is None:
                # Results are sale-sorted: after 3 non-discounted items in a
                # row, assume the rest of the listing has no discounts.
                no_discount_num += 1
                if no_discount_num < 3:
                    continue
                # common_failed.send(sender=ctx, url=url, \
                #     reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice))
                return
            combine_url = a_node.get('href')
            if not combine_url:
                common_failed.send(sender=ctx, url=url, reason='listing product %s.%s cannot crawl combine_url' % (key, title))
                continue
            match = re.search(r'https?://.+', combine_url)
            if not match:
                combine_url = 'http://shop.nordstrom.com%s' % (combine_url)
        except IndexError:
            # Any missing sub-node means the product markup is unexpected;
            # report and move on to the next product.
            print traceback.format_exc()
            common_failed.send(sender=ctx, url=url, reason='listing product %s -> %s' % (key, traceback.format_exc()))
            continue
        is_new = False; is_updated = False
        product = Product.objects(key=key).first()
        if not product:
            is_new = True
            product = Product(key=key)
            product.updated = False
            product.event_type = False
        # Only touch fields that actually changed, so is_updated is meaningful.
        if combine_url and combine_url != product.combine_url:
            product.combine_url = combine_url
            is_updated = True
        if title and title != product.title:
            product.title = title
            is_updated = True
        if price and price != product.price:
            product.price = price
            is_updated = True
        if listprice and listprice != product.listprice:
            product.listprice = listprice
            is_updated = True
        if category.cats and set(category.cats).difference(product.dept):
            product.dept = list(set(category.cats) | set(product.dept or []))
            is_updated = True
        if category.key not in product.category_key:
            product.category_key.append(category.key)
            is_updated = True
        if is_updated:
            product.list_update_time = datetime.utcnow()
        # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
        selected = Picker(site='nordstrom').pick(product)
        if not selected:
            continue
        product.hit_time = datetime.utcnow()
        product.save()
        # print product.title
        # print product.combine_url
        # print product.listprice
        # print product.price
        # print is_new
        # print is_updated
        # print
        common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \
            is_new=is_new, is_updated=((not is_new) and is_updated) )
    # Go to the next page to keep on crawling.
    try:
        arrow_node = tree.cssselect('div.fashion-results-header div.fashion-results-pager ul.arrows li.next')[0]
    except IndexError:
        common_failed.send(sender=ctx, url=url, reason=traceback.format_exc())
        return
    # A 'disabled' class on the next-arrow means this is the last page.
    next_page = arrow_node.cssselect('a')[0].get('href') \
        if 'disabled' not in arrow_node.get('class') else None
    if next_page:
        print next_page
        self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
def crawl_listing(self, url, ctx='', **kwargs):
    """Crawl one Ashford listing page and upsert discounted products.

    :param url: listing page URL to crawl.
    :param ctx: crawler context string forwarded to the signals.
    :param kwargs: either carries a ``category`` object directly, or a
        ``key`` used to look the ``Category`` up in the database.
    """
    res = requests.get(url)
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)
    # Prefer a category object passed by the caller; fall back to a DB lookup.
    category = kwargs['category'] if kwargs.get('category') else Category.objects(key=kwargs.get('key')).first()
    if not category:
        common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
        return
    product_nodes = tree.cssselect('div#atg_store_prodList ul li')
    for product_node in product_nodes:
        info_node = product_node.cssselect('div.thumbnailInfo')[0]
        price = None; listprice = None
        # Price preference order: weekly special > sale price > plain text.
        price_node = info_node.cssselect('div.our_price')[0]
        weekly_price_node = price_node.cssselect('.newPrice_value')
        sale_price_node = price_node.cssselect('#salePrice')
        if weekly_price_node:
            price = weekly_price_node[0].text.strip()
        elif sale_price_node:
            price = sale_price_node[0].text.strip()
        else:
            price = ''.join(price_node.xpath('.//text()')).strip()
        listprice = info_node.cssselect('div.retail_price')[0].text.strip()
        listprice = re.sub('\n', '', listprice)
        # eliminate products of no discount
        if price is None or listprice is None:
            # common_failed.send(sender=ctx, url=url, \
            #     reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice))
            continue
        key = info_node.cssselect('div.product_id')[0].text.strip()
        brand = info_node.cssselect('a.sameBrandProduct')[0].text.strip()
        title_node = info_node.cssselect('a.product_gender_name')[0]
        # title = title_node.get('title')
        combine_url = title_node.get('href')
        match = re.search(r'https?://.+', combine_url)
        if not match:
            combine_url = '%s%s' % (HOST, combine_url)
        #
        is_new = False; is_updated = False
        product = Product.objects(key=key).first()
        if not product:
            is_new = True
            product = Product(key=key)
            product.updated = False
            product.event_type = False
        # Only touch fields that actually changed, so is_updated is meaningful.
        if brand and brand != product.brand:
            product.brand = brand
            is_updated = True
        if combine_url and combine_url != product.combine_url:
            product.combine_url = combine_url
            is_updated = True
        if price and price != product.price:
            product.price = price
            is_updated = True
        if listprice and listprice != product.listprice:
            product.listprice = listprice
            is_updated = True
        # if category.cats and set(category.cats).difference(product.dept):
        #     product.dept = list(set(category.cats) | set(product.dept or []))
        #     is_updated = True
        if category.key not in product.category_key:
            product.category_key.append(category.key)
            is_updated = True
        # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
        # NOTE(review): products not yet marked 'updated' go through
        # crawl_detail instead of the Picker — presumably crawl_detail also
        # returns a selected flag; confirm against its definition.
        try:
            selected = Picker(site='ashford').pick(product) if product.updated \
                else self.crawl_detail(ctx, is_new, is_updated, product)
        except:
            common_failed.send(sender=ctx, url=product.combine_url, reason=traceback.format_exc())
            continue
        if not selected:
            continue
        if is_updated:
            product.list_update_time = datetime.utcnow()
        product.hit_time = datetime.utcnow()
        product.save()
        common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \
            is_new=is_new, is_updated=((not is_new) and is_updated),
            ready=(product.ready if hasattr(product, 'ready') else False))
def crawl_listing(self, url, ctx='', **kwargs):
    """Crawl one Saks Fifth Avenue listing page (sale-filtered) and upsert products.

    Requests the page with the sale-flag sort parameter, extracts each
    product's prices/brand/title, updates the ``Product`` documents, and
    recurses into the next pagination page when one exists.

    :param url: listing page URL to crawl.
    :param ctx: crawler context string forwarded to the signals.
    :param kwargs: must carry ``key`` — the ``Category`` key for this listing.
    """
    # 'Ns': 'P_sale_flag|1' orders/filters results by the sale flag.
    res = requests.get(url, params={'Ns': 'P_sale_flag|1'})
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)
    category = Category.objects(key=kwargs.get('key')).first()
    if not category:
        print 'Category does not exist'
        common_failed.send(sender=ctx, url=url, reason='Category does not exist -> {0} .'.format(kwargs))
        return
    product_nodes = tree.cssselect('div#product-container div');
    no_discount_num = 0  # sometimes no discount product occurs between the discount ones ordered by sale.
    for product_node in product_nodes:
        # Product cells are identified by an id containing 'product'.
        if not product_node.get('id') or 'product' not in product_node.get('id').lower():
            continue
        key = product_node.get('id')
        info_node = product_node.cssselect('div.product-text a')[0]
        price = None; listprice = None
        listprice_node = info_node.cssselect('span.product-price')
        price_node = info_node.cssselect('span.product-sale-price')
        if listprice_node:
            listprice = ''.join(listprice_node[0].xpath('.//text()')).strip()
        if price_node:
            price = ''.join(price_node[0].xpath('.//text()')).strip()
        if price is None or listprice is None:
            # After 3 consecutive non-discounted items, assume the rest of
            # the sale-sorted listing has no discounts and stop.
            no_discount_num += 1
            if no_discount_num < 3:
                continue
            return
        # A discounted product resets the consecutive-miss counter.
        no_discount_num = 0
        brand = info_node.cssselect('p span.product-designer-name')[0].text
        if brand:
            brand = brand.strip()
        title = info_node.cssselect('p.product-description')[0].text.strip()
        combine_url = info_node.get('href')
        is_new = False; is_updated = False
        product = Product.objects(key=key).first()
        if not product:
            is_new = True
            product = Product(key=key)
            product.updated = False
            product.event_type = False
        # Only touch fields that actually changed, so is_updated is
        # meaningful; title/combine_url changes are also timestamped in
        # update_history.
        if title and title != product.title:
            product.title = title
            is_updated = True
            product.update_history['title'] = datetime.utcnow()
        if brand and brand != product.brand:
            product.brand = brand
            is_updated = True
        if combine_url and combine_url != product.combine_url:
            product.combine_url = combine_url
            is_updated = True
            product.update_history['combine_url'] = datetime.utcnow()
        if price and price != product.price:
            product.price = price
            is_updated = True
        if listprice and listprice != product.listprice:
            product.listprice = listprice
            is_updated = True
        if category.cats and set(category.cats).difference(product.dept):
            product.dept = list(set(category.cats) | set(product.dept or []))
            is_updated = True
        if category.key not in product.category_key:
            product.category_key.append(category.key)
            is_updated = True
        if is_updated:
            product.list_update_time = datetime.utcnow()
        # To pick the product which fit our needs, such as a certain discount, brand, dept etc.
        selected = Picker(site='saksfifthavenue').pick(product)
        if not selected:
            continue
        product.hit_time = datetime.utcnow()
        product.save()
        # print product.brand; print product.title; print product.combine_url; print product.listprice, ' / ', product.price; print is_new; print is_updated
        # print
        common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \
            is_new=is_new, is_updated=((not is_new) and is_updated) )
    # Go to the next page to keep on crawling.
    next_page = None
    page_nodes = tree.cssselect('div.pagination-container ol.pa-page-number li a')
    for page_node in page_nodes:
        if page_node.get('class') == 'next':
            href = page_node.get('href')
            match = re.search(r'https?://.+', href)
            next_page = href if match else '{0}/{1}'.format(HOST, href)
            break
    if next_page:
        print next_page
        self.crawl_listing(url=next_page, ctx=ctx, **kwargs)