Beispiel #1
0
def extract_ranks_names_urls(rank_page_lxml):
	"""Extract the rank, name and URL of every product on a rank page.

	Args:
		rank_page_lxml: parsed rank page. It must provide ``find_class()``
			and ``base_url``; presumably an lxml.html element -- confirm
			against the caller.

	Returns:
		A list of ``[rank, name, url]`` lists, one per element carrying the
		``s-result-item`` class, in the order ``find_class`` returns them.
		A field that cannot be extracted is reported through
		``helpers.log`` and set to the string ``'Error'``; the other fields
		of that product are still extracted.
	"""
	if helpers.get_debug_config():
		# %-formatting rather than "+": a page without a base_url (None)
		# must not crash the extraction just because debugging is on.
		print("[DEBUG] Extracting ranks, names, urls from rank page: %s"
			% rank_page_lxml.base_url)

	ranks_names_urls = []
	for product in rank_page_lxml.find_class('s-result-item'):
		# Each field has its own try block so that one failure does not
		# discard the remaining fields. ``except Exception`` (instead of a
		# bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
		try:
			prod_html = etree.tostring(product)
			rank_search = re.search(r'\<li id=\"result\_(\d+)\"', prod_html)
			# No match leaves rank_search as None; the resulting
			# AttributeError is handled below like any other rank failure.
			# The rank is the numeric suffix of the element id plus one.
			rank = int(rank_search.group(1)) + 1
		except Exception:
			helpers.log('Rank error on page')
			rank = 'Error'

		try:
			title = product.find_class('a-spacing-mini')[0]
			name = unicode(title.text_content()).strip()
			name = helpers.remove_unsafe_chars(name)
		except Exception:
			helpers.log('Name error on page')
			name = 'Error'

		try:
			title_links = product.find_class('a-spacing-mini')[0].iterlinks()
			# Index 2 of the first yielded item is used as the URL. An
			# exhausted iterator raises StopIteration, handled below.
			url = next(title_links)[2].strip()
		except Exception:
			helpers.log('URL error on page')
			url = 'Error'

		ranks_names_urls.append([rank, name, url])
	return ranks_names_urls
Beispiel #2
0
def extract_manufacturer(product_page_lxml):
	"""Extract the manufacturer from a parsed product page.

	Args:
		product_page_lxml: parsed product page. It must provide
			``cssselect()``; presumably an lxml.html element -- confirm
			against the caller.

	Returns:
		The text of the second ``span`` inside the first ``div.buying``
		that contains an ``h1.parseasinTitle``, with matches of the
		module-level ``regex_by`` pattern removed, stripped, and passed
		through ``helpers.remove_unsafe_chars``. If no such ``div`` exists
		the result is built from ``'Error'`` (nothing is logged); if the
		extraction raises, the failure is logged and ``'Error'`` is used.
	"""
	manufacturer = 'Error'
	try:
		for result in product_page_lxml.cssselect('div.buying'):
			# Only the buying block that holds the product title is used.
			if result.cssselect('h1.parseasinTitle'):
				byline = result.cssselect('span')[1].text_content()
				manufacturer = re.sub(regex_by, '', byline).strip()
				break
	except Exception:
		# ``except Exception`` (instead of a bare ``except``) lets
		# KeyboardInterrupt/SystemExit propagate.
		helpers.log('Manufacturer error on page')
		manufacturer = 'Error'
	return helpers.remove_unsafe_chars(manufacturer)