Esempio n. 1
0
def extract_ranks_names_urls(rank_page_lxml):
	"""Extract one [rank, name, url] entry per product on a rank page.

	rank_page_lxml is the parsed rank page; it must provide find_class()
	and base_url (presumably an lxml.html document -- confirm with caller).

	Returns a list with one [rank, name, url] list per element carrying
	the 's-result-item' class, in the order find_class() yields them:
	  rank -- the number captured from the element's `result_N` id, plus one
	  name -- stripped text of the first 'a-spacing-mini' descendant, passed
	          through helpers.remove_unsafe_chars()
	  url  -- stripped link (index 2 of the iterlinks() tuple) of the first
	          link inside that same descendant

	The three fields are extracted independently.  A field that cannot be
	extracted is logged through helpers.log() and set to the string
	'Error'; the remaining fields of that product are still attempted.
	"""
	if helpers.get_debug_config():
		# Single parenthesised argument: prints the same text under the
		# Python 2 print statement the rest of this file relies on.
		print("[DEBUG] Extracting ranks, names, urls from rank page: " +
			rank_page_lxml.base_url)
	ranks_names_urls = []
	for product in rank_page_lxml.find_class('s-result-item'):
		# `except Exception` (not a bare `except`) keeps the best-effort
		# behaviour for scraping failures while letting KeyboardInterrupt
		# and SystemExit propagate out of the loop.
		try:
			prod_html = etree.tostring(product)
			rank_search = re.search(r'\<li id=\"result\_(\d+)\"', prod_html)
			# A failed match leaves rank_search as None; the resulting
			# AttributeError is handled below like any other rank failure.
			rank = int(rank_search.group(1)) + 1
		except Exception:
			helpers.log('Rank error on page')
			rank = 'Error'

		try:
			title = product.find_class('a-spacing-mini')[0]
			name = unicode(title.text_content()).strip()
			name = helpers.remove_unsafe_chars(name)
		except Exception:
			helpers.log('Name error on page')
			name = 'Error'

		try:
			# Looked up again on purpose: a failure while cleaning the
			# name must not prevent the URL from being extracted.
			link_holder = product.find_class('a-spacing-mini')[0]
			# iterlinks() tuples carry the link itself at index 2.
			url = next(link_holder.iterlinks())[2].strip()
		except Exception:
			helpers.log('URL error on page')
			url = 'Error'

		ranks_names_urls.append([rank, name, url])
	return ranks_names_urls
Esempio n. 2
0
def extract_category(rank_page_lxml):
	"""Return the category name for a parsed rank page.

	rank_page_lxml is the parsed rank page; it must provide cssselect()
	and base_url (presumably an lxml.html document -- confirm with caller).

	The category is read from the first child of the first
	`h2#s-result-count` element (stripped text content).  If that fails,
	the failure is logged through helpers.log() and the name is looked up
	with dbdo.get_category_name() using the page's base_url instead.

	Errors raised by the dbdo fallback itself (including an empty result,
	which makes the [0][0] indexing fail) are not handled here and
	propagate to the caller.
	"""
	if helpers.get_debug_config():
		# Single parenthesised argument: prints the same text under the
		# Python 2 print statement the rest of this file relies on.
		print("[DEBUG] Extracting category from rank page: " +
			rank_page_lxml.base_url)
	try:
		heading = rank_page_lxml.cssselect('h2#s-result-count')
		category = heading[0][0].text_content().strip()
	except Exception:
		# `except Exception` rather than a bare `except`, so that
		# KeyboardInterrupt / SystemExit are not swallowed.
		helpers.log('Category error on page')
		category = 'Error'
	# 'Error' is an in-band sentinel: a heading whose text is exactly
	# 'Error' also takes the database fallback.
	if category == 'Error':
		category = dbdo.get_category_name(str(rank_page_lxml.base_url))[0][0]
	return category
Esempio n. 3
0
def process_product_page(product_url):
	"""Scrapes and returns all the data in the given product url"""
	if helpers.get_debug_config():
		print "[DEBUG] Extracting manufacturer, price, and sold by from \
		product page: " + product_url
	product_page_lxml = helpers.get_page(product_url)
	if product_page_lxml == False:
		print "Scrape of product page failed!!! URL: " + product_url
		helpers.log("Scrape of product page failed!!! URL: " + product_url)
		return 'Error', 'Error', 'Error'
	try:
		manufacturer = extract_manufacturer(product_page_lxml)
		price = extract_price(product_page_lxml)
		sold_by = extract_sold_by(product_page_lxml)
	except Exception, e:
		print e
		print traceback.print_exc()
		helpers.log('Error with url: ' + product_url)
		raise