# Nordstrom listing crawler: fetch the sale-sorted listing page and upsert discounted products.
def crawl_listing(self, url, ctx='', **kwargs):
    if url.startswith('http://blogs.nordstrom.com'):
        return
    try:
        res = requests.get(url, params={'sort': 'sale'})
    except requests.exceptions.ConnectionError:
        return
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)

    listing_node = tree.cssselect('div.fashion-results')
    if listing_node:
        listing_node = listing_node[0]
    else:
        if tree.cssselect('div#brandsIndex'):
            return self.crawl_listing_of_no_leaf(tree, ctx=ctx, **kwargs)
        return

    product_nodes = listing_node.cssselect('div.row div.fashion-item')
    if not product_nodes:
        self.crawl_listing_of_no_leaf(tree, ctx=ctx, **kwargs)
        return

    category = Category.objects(key=kwargs.get('key')).first()
    no_discount_num = 0  # sometimes a product without a discount appears among the discounted ones when sorted by sale.

    for product_node in product_nodes:
        key = product_node.get('id')
        if not key:
            common_failed.send(sender=ctx, url=url, reason='listing product has no id')
            continue

        try:
            info_node = product_node.cssselect('div.info')[0]
            a_node = info_node.cssselect('a')[0]
            title = a_node.text.strip()

            price = None; listprice = None
            price_nodes = info_node.cssselect('.price')
            for price_node in price_nodes:
                if 'regular' in price_node.get('class'):
                    listprice = price_node.text
                elif 'sale' in price_node.get('class'):
                    price = price_node.text
            if price is None or listprice is None:
                no_discount_num += 1
                if no_discount_num < 3:
                    continue
                # common_failed.send(sender=ctx, url=url, \
                #     reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice))
                return

            combine_url = a_node.get('href')
            if not combine_url:
                common_failed.send(sender=ctx, url=url,
                    reason='listing product %s.%s cannot crawl combine_url' % (key, title))
                continue
            match = re.search(r'https?://.+', combine_url)
            if not match:
                combine_url = 'http://shop.nordstrom.com%s' % combine_url
        except IndexError:
            print traceback.format_exc()
            common_failed.send(sender=ctx, url=url,
                reason='listing product %s -> %s' % (key, traceback.format_exc()))
            continue

        is_new = False; is_updated = False
        product = Product.objects(key=key).first()
        if not product:
            is_new = True
            product = Product(key=key)
            product.updated = False
            product.event_type = False
        if combine_url and combine_url != product.combine_url:
            product.combine_url = combine_url
            is_updated = True
        if title and title != product.title:
            product.title = title
            is_updated = True
        if price and price != product.price:
            product.price = price
            is_updated = True
        if listprice and listprice != product.listprice:
            product.listprice = listprice
            is_updated = True
        if category.cats and set(category.cats).difference(product.dept):
            product.dept = list(set(category.cats) | set(product.dept or []))
            is_updated = True
        if category.key not in product.category_key:
            product.category_key.append(category.key)
            is_updated = True
        if is_updated:
            product.list_update_time = datetime.utcnow()

        # Pick only the products that fit our needs, such as a certain discount, brand, dept, etc.
        selected = Picker(site='nordstrom').pick(product)
        if not selected:
            continue

        product.hit_time = datetime.utcnow()
        product.save()

        # print product.title
        # print product.combine_url
        # print product.listprice
        # print product.price
        # print is_new
        # print is_updated
        # print

        common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
            is_new=is_new, is_updated=((not is_new) and is_updated))

    # Go to the next page to keep on crawling.
    try:
        arrow_node = tree.cssselect('div.fashion-results-header div.fashion-results-pager ul.arrows li.next')[0]
    except IndexError:
        common_failed.send(sender=ctx, url=url, reason=traceback.format_exc())
        return
    next_page = arrow_node.cssselect('a')[0].get('href') \
        if 'disabled' not in arrow_node.get('class') else None
    if next_page:
        print next_page
        self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
# 6pm listing crawler: upsert discounted products from the search-results listing.
def crawl_listing(self, url, ctx='', **kwargs):
    res = requests.get(url)
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)

    category = Category.objects(key=kwargs.get('key')).first()
    if not category:
        common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key'))
        return

    product_nodes = tree.cssselect('div#searchResults a')
    for product_node in product_nodes:
        price = None; listprice = None
        try:
            price = product_node.cssselect('.price-6pm')[0].text
            listprice_node = product_node.cssselect('.discount')
            listprice = ''.join(listprice_node[0].xpath('text()')) if listprice_node else None
        except IndexError:
            pass
        # eliminate products with no discount
        if price is None or listprice is None:
            # common_failed.send(sender=ctx, url=url, \
            #     reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice))
            continue

        key = product_node.get('data-product-id')
        if not key:
            common_failed.send(sender=ctx, url=url, reason='listing product has no key')
            continue

        combine_url = product_node.get('href')
        key = '%s_%s' % (key, combine_url.split('/')[-1])
        match = re.search(r'https?://.+', combine_url)
        if not match:
            combine_url = '%s%s' % (HOST, combine_url)

        brand = product_node.cssselect('.brandName')[0].text.strip()
        title = product_node.cssselect('.productName')[0].text.strip()

        is_new = False; is_updated = False
        product = Product.objects(key=key).first()
        if not product:
            is_new = True
            product = Product(key=key)
            product.updated = False
            product.event_type = False
        if title and title != product.title:
            product.title = title
            is_updated = True
        if brand and brand != product.brand:
            product.brand = brand
            is_updated = True
        if combine_url and combine_url != product.combine_url:
            product.combine_url = combine_url
            is_updated = True
        if price and price != product.price:
            product.price = price
            is_updated = True
        if listprice and listprice != product.listprice:
            product.listprice = listprice
            is_updated = True
        if category.cats and set(category.cats).difference(product.dept):
            product.dept = list(set(category.cats) | set(product.dept or []))
            is_updated = True
        if category.key not in product.category_key:
            product.category_key.append(category.key)
            is_updated = True
        if is_updated:
            product.list_update_time = datetime.utcnow()

        # Pick only the products that fit our needs, such as a certain discount, brand, dept, etc.
        selected = Picker(site='6pm').pick(product)
        if not selected:
            continue

        product.hit_time = datetime.utcnow()
        product.save()
        common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
            is_new=is_new, is_updated=((not is_new) and is_updated))

        print product.key
        print product.brand
        print product.title
        print product.price, ' / ', product.listprice
        print product.combine_url
        print product.dept
        print

    # Go to the next page to keep on crawling.
    next_page = None
    page_node = tree.cssselect('div.pagination')
    if not page_node:
        return
    last_node = page_node[0].cssselect('.last')
    if last_node:
        next_page = page_node[0].cssselect('a')[-1].get('href')
    if next_page:
        match = re.search(r'https?://.+', next_page)
        if not match:
            next_page = '%s%s' % (HOST, next_page)
        print next_page
        self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
# Saks Fifth Avenue listing crawler: fetch the sale-filtered listing (Ns=P_sale_flag|1) and upsert discounted products.
def crawl_listing(self, url, ctx='', **kwargs):
    res = requests.get(url, params={'Ns': 'P_sale_flag|1'})
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)

    category = Category.objects(key=kwargs.get('key')).first()
    if not category:
        print 'Category does not exist'
        common_failed.send(sender=ctx, url=url, reason='Category does not exist -> {0}.'.format(kwargs))
        return

    product_nodes = tree.cssselect('div#product-container div')
    no_discount_num = 0  # sometimes a product without a discount appears among the discounted ones when sorted by sale.

    for product_node in product_nodes:
        if not product_node.get('id') or 'product' not in product_node.get('id').lower():
            continue
        key = product_node.get('id')
        info_node = product_node.cssselect('div.product-text a')[0]

        price = None; listprice = None
        listprice_node = info_node.cssselect('span.product-price')
        price_node = info_node.cssselect('span.product-sale-price')
        if listprice_node:
            listprice = ''.join(listprice_node[0].xpath('.//text()')).strip()
        if price_node:
            price = ''.join(price_node[0].xpath('.//text()')).strip()
        if price is None or listprice is None:
            no_discount_num += 1
            if no_discount_num < 3:
                continue
            return
        no_discount_num = 0

        brand = info_node.cssselect('p span.product-designer-name')[0].text
        if brand:
            brand = brand.strip()
        title = info_node.cssselect('p.product-description')[0].text.strip()
        combine_url = info_node.get('href')

        is_new = False; is_updated = False
        product = Product.objects(key=key).first()
        if not product:
            is_new = True
            product = Product(key=key)
            product.updated = False
            product.event_type = False
        if title and title != product.title:
            product.title = title
            is_updated = True
            product.update_history['title'] = datetime.utcnow()
        if brand and brand != product.brand:
            product.brand = brand
            is_updated = True
        if combine_url and combine_url != product.combine_url:
            product.combine_url = combine_url
            is_updated = True
            product.update_history['combine_url'] = datetime.utcnow()
        if price and price != product.price:
            product.price = price
            is_updated = True
        if listprice and listprice != product.listprice:
            product.listprice = listprice
            is_updated = True
        if category.cats and set(category.cats).difference(product.dept):
            product.dept = list(set(category.cats) | set(product.dept or []))
            is_updated = True
        if category.key not in product.category_key:
            product.category_key.append(category.key)
            is_updated = True
        if is_updated:
            product.list_update_time = datetime.utcnow()

        # Pick only the products that fit our needs, such as a certain discount, brand, dept, etc.
        selected = Picker(site='saksfifthavenue').pick(product)
        if not selected:
            continue

        product.hit_time = datetime.utcnow()
        product.save()

        # print product.brand; print product.title; print product.combine_url; print product.listprice, ' / ', product.price; print is_new; print is_updated
        # print

        common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url,
            is_new=is_new, is_updated=((not is_new) and is_updated))

    # Go to the next page to keep on crawling.
    next_page = None
    page_nodes = tree.cssselect('div.pagination-container ol.pa-page-number li a')
    for page_node in page_nodes:
        if page_node.get('class') == 'next':
            href = page_node.get('href')
            match = re.search(r'https?://.+', href)
            next_page = href if match else '{0}/{1}'.format(HOST, href)
            break
    if next_page:
        print next_page
        self.crawl_listing(url=next_page, ctx=ctx, **kwargs)