def extract_ranks_names_urls(rank_page_lxml):
    """Extract the rank, name, and URL of every product on a rank page.

    Args:
        rank_page_lxml: Parsed rank page. This function uses its
            ``base_url`` attribute and ``find_class`` method (presumably an
            lxml.html element -- confirm against ``helpers.get_page``).

    Returns:
        A list of ``[rank, name, url]`` lists, one per element carrying the
        ``s-result-item`` class, in the order ``find_class`` yields them.
        A field that cannot be extracted is reported through
        ``helpers.log`` and replaced by the string ``'Error'``; the other
        fields of that product are still attempted.
    """
    if helpers.get_debug_config():
        print("[DEBUG] Extracting ranks, names, urls from rank page: " +
              rank_page_lxml.base_url)

    ranks_names_urls = []
    for product in rank_page_lxml.find_class('s-result-item'):
        # Each field is extracted best-effort.  ``except Exception`` (not a
        # bare ``except``) keeps that behaviour for parsing failures while
        # letting KeyboardInterrupt / SystemExit stop the scrape.
        try:
            prod_html = etree.tostring(product)
            # The serialized markup carries a zero-based "result_N" id;
            # the reported rank is one-based, hence the +1.
            rank_search = re.search(r'\<li id=\"result\_(\d+)\"', prod_html)
            rank = int(rank_search.group(1)) + 1
        except Exception:
            helpers.log('Rank error on page')
            rank = 'Error'

        try:
            title = product.find_class('a-spacing-mini')[0]
            name = unicode(title.text_content()).strip()
            name = helpers.remove_unsafe_chars(name)
        except Exception:
            helpers.log('Name error on page')
            name = 'Error'

        try:
            # iterlinks() yields tuples; index 2 is used as the URL of the
            # first link found inside the title element.
            first_link = next(
                product.find_class('a-spacing-mini')[0].iterlinks())
            url = first_link[2].strip()
        except Exception:
            helpers.log('URL error on page')
            url = 'Error'

        ranks_names_urls.append([rank, name, url])

    return ranks_names_urls
def extract_category(rank_page_lxml):
    """Extract the category name shown on a rank page.

    Args:
        rank_page_lxml: Parsed rank page. This function uses its
            ``base_url`` attribute and ``cssselect`` method (presumably an
            lxml.html element -- confirm against ``helpers.get_page``).

    Returns:
        The stripped text of the first child of the first
        ``h2#s-result-count`` element.  If that cannot be read, the failure
        is logged via ``helpers.log`` and the category is looked up with
        ``dbdo.get_category_name`` using the page's base URL instead.

    Raises:
        Whatever the ``dbdo.get_category_name(...)[0][0]`` fallback raises;
        it is not guarded here.
    """
    if helpers.get_debug_config():
        print("[DEBUG] Extracting category from rank page: " +
              rank_page_lxml.base_url)

    try:
        category = rank_page_lxml.cssselect('h2#s-result-count')
        category = category[0][0].text_content().strip()
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed as a page error.
        helpers.log('Category error on page')
        category = 'Error'

    if category == 'Error':
        # Fall back to the stored category for this URL.  [0][0] takes the
        # first element of the first entry of the result -- presumably the
        # first column of the first row; confirm against dbdo.
        category = dbdo.get_category_name(str(rank_page_lxml.base_url))[0][0]
    return category
def process_product_page(product_url): """Scrapes and returns all the data in the given product url""" if helpers.get_debug_config(): print "[DEBUG] Extracting manufacturer, price, and sold by from \ product page: " + product_url product_page_lxml = helpers.get_page(product_url) if product_page_lxml == False: print "Scrape of product page failed!!! URL: " + product_url helpers.log("Scrape of product page failed!!! URL: " + product_url) return 'Error', 'Error', 'Error' try: manufacturer = extract_manufacturer(product_page_lxml) price = extract_price(product_page_lxml) sold_by = extract_sold_by(product_page_lxml) except Exception, e: print e print traceback.print_exc() helpers.log('Error with url: ' + product_url) raise