Esempio n. 1
0
def calc_categories_to_scrape(days=5):
	"""Return the categories that are due for scraping.

	A category is queued when its latest scrape was left incomplete, when it
	has never been scraped, or when its last scrape is older than ``days``
	days.

	Args:
		days: Age threshold, in days, after which a completed scrape is
			considered stale and the category is queued again. Defaults to 5.

	Returns:
		A list of ``[category_url, start_rank]`` pairs, where ``start_rank``
		is the value returned by ``last_rank_scraped`` for that category
		(1 unless the latest scrape was incomplete).
	"""
	categories_to_scrape = []
	# Only the Health & Personal category is scraped for now; use the
	# commented-out query below instead to scrape every category.
	results = [['http://www.amazon.com/s/ref=sr_hi_1?rh=n%3A3760901&ie=UTF8']]
	#results = dbdo.get_all_category_urls()
	for line in results:
		category_url = line[0]
		start_rank = last_rank_scraped(category_url)
		# A start rank other than 1 means the last scrape attempt did not
		# complete, so queue the category regardless of its age.
		if start_rank != 1:
			categories_to_scrape.append([category_url, start_rank])
			continue
		# Queue categories that have never been scraped. Truthiness is used
		# so that any empty result (tuple or list) counts as "no rows",
		# whichever sequence type the DB layer returns.
		scrape_date_results = dbdo.get_last_scrape_date(category_url)
		if not scrape_date_results:
			categories_to_scrape.append([category_url, start_rank])
			continue
		last_scrape = scrape_date_results[0][0]
		# NOTE(review): this assumes helpers.day_delta returns a negative
		# number of days for dates in the past -- confirm against helpers.
		if helpers.day_delta(last_scrape) < -days:
			categories_to_scrape.append([category_url, start_rank])
	return categories_to_scrape
Esempio n. 2
0
def last_rank_scraped(category_url):
	"""Return the highest rank of the latest scrape if it was incomplete, else 1.

	The highest rank stored for the most recent scrape date is compared with
	the highest rank ever stored for the category. If the two differ, an
	earlier scrape got further than the latest one, so the latest run is
	treated as incomplete and its highest rank is returned.

	Args:
		category_url: URL identifying the category to look up.

	Returns:
		The highest rank reached by the latest scrape when that scrape was
		incomplete; 1 when the category was scraped completely or when any
		of the lookups returns no rows.
	"""
	# Date of the most recent scrape. Emptiness is tested by truthiness so
	# the check works whether the DB layer returns an empty tuple or list.
	scrape_date_results = dbdo.get_last_scrape_date(category_url)
	if not scrape_date_results:
		return 1
	last_scrape = scrape_date_results[0][0]
	# Highest rank stored for that most recent scrape.
	highest_rank_results = dbdo.get_highest_rank(category_url, last_scrape)
	if not highest_rank_results:
		return 1
	highest_rank = highest_rank_results[0][0]
	# Highest rank ever stored for the category, across all scrapes.
	max_rank_results = dbdo.get_highest_rank(category_url)
	if not max_rank_results:
		return 1
	max_rank = max_rank_results[0][0]
	if highest_rank != max_rank:
		return highest_rank
	return 1