def extract_ranks_names_urls(rank_page_lxml):
    """Extract the rank, name and URL of every product on a rank page.

    Despite the historical wording "html in unicode", the argument must be
    an already-parsed lxml HTML element: this function calls its
    ``find_class`` method and reads its ``base_url`` attribute.

    Each field is extracted independently. When a field cannot be
    extracted, the failure is logged through ``helpers.log`` and the string
    ``'Error'`` is stored in its place, so one malformed product never
    aborts the whole page.

    Returns:
        A list holding one ``[rank, name, url]`` list per element carrying
        the ``s-result-item`` class. ``rank`` is an int (the number found
        in the ``result_<n>`` id, plus one) or ``'Error'``; ``name`` and
        ``url`` are strings.
    """
    if helpers.get_debug_config():
        # %-formatting instead of "+": a page parsed without a base URL has
        # base_url == None, and string concatenation would raise TypeError.
        print("[DEBUG] Extracting ranks, names, urls from rank page: %s"
              % (rank_page_lxml.base_url,))

    # The list literal is evaluated left to right, so errors are logged in
    # rank, name, url order for each product.
    return [
        [_product_rank(product), _product_name(product), _product_url(product)]
        for product in rank_page_lxml.find_class('s-result-item')
    ]


def _product_rank(product):
    """Return the rank parsed from the product's serialized HTML, or 'Error'."""
    try:
        prod_html = etree.tostring(product)
        rank_search = re.search(r'\<li id=\"result\_(\d+)\"', prod_html)
        # A missing match leaves rank_search as None; the AttributeError
        # raised by .group() is what routes us to the error branch.
        # NOTE(review): the +1 presumably converts a zero-based result
        # index into a 1-based rank -- confirm against the page markup.
        return int(rank_search.group(1)) + 1
    except Exception:
        # "except Exception" rather than a bare except, so Ctrl-C and
        # SystemExit are not swallowed and reported as a parse error.
        helpers.log('Rank error on page')
        return 'Error'


def _product_name(product):
    """Return the sanitized text of the product's title element, or 'Error'."""
    try:
        title = product.find_class('a-spacing-mini')[0]
        # ``unicode`` is the Python 2 builtin; this module targets Python 2.
        name = unicode(title.text_content()).strip()
        return helpers.remove_unsafe_chars(name)
    except Exception:
        helpers.log('Name error on page')
        return 'Error'


def _product_url(product):
    """Return the first link found inside the product's title element, or 'Error'."""
    try:
        title = product.find_class('a-spacing-mini')[0]
        # iterlinks() yields (element, attribute, link, pos) tuples;
        # index 2 is the link itself.
        return next(title.iterlinks())[2].strip()
    except Exception:
        helpers.log('URL error on page')
        return 'Error'
def extract_manufacturer(product_page_lxml):
    """Extract the manufacturer name from a parsed product page.

    Despite the historical wording "html in unicode", the argument must be
    an already-parsed lxml HTML element: this function calls ``cssselect``
    on it.

    The first ``div.buying`` element that contains an ``h1.parseasinTitle``
    is used. The text of that div's second ``span`` is taken, everything
    matched by the module-level ``regex_by`` pattern is removed, and the
    result is stripped of surrounding whitespace.
    NOTE(review): ``regex_by`` is defined outside this block; presumably it
    matches the leading "by" of the byline -- confirm.

    Returns:
        ``helpers.remove_unsafe_chars`` applied to the manufacturer text,
        or to the string ``'Error'`` when no matching element exists or
        extraction fails (failures are logged through ``helpers.log``).
    """
    manufacturer = 'Error'
    try:
        for result in product_page_lxml.cssselect('div.buying'):
            # Skip "buying" blocks that do not hold the product title.
            if not result.cssselect('h1.parseasinTitle'):
                continue
            byline = result.cssselect('span')[1].text_content()
            manufacturer = re.sub(regex_by, '', byline).strip()
            break
    except Exception:
        # "except Exception" rather than a bare except, so Ctrl-C and
        # SystemExit are not swallowed and reported as a parse error.
        helpers.log('Manufacturer error on page')
        manufacturer = 'Error'
    return helpers.remove_unsafe_chars(manufacturer)