def calc_categories_to_scrape(days=5):
    """Return the categories that are due to be scraped.

    A category is queued when any of the following holds:

    * ``last_rank_scraped`` returns something other than 1, meaning the
      latest scrape was not completed; that rank is queued as the start rank.
    * No last scrape date is recorded for it.
    * ``helpers.day_delta`` of its last scrape date is below ``-days``.

    Args:
        days: Age threshold compared against ``helpers.day_delta``
            (presumably a number of days -- confirm against ``helpers``).
            Defaults to 5.

    Returns:
        A list of ``[category_url, start_rank]`` pairs.
    """
    categories_to_scrape = []

    # Only the Health & Personal category is scraped for now.  To scrape
    # every category, build ``results`` from dbdo.get_all_category_urls().
    results = [['http://www.amazon.com/s/ref=sr_hi_1?rh=n%3A3760901&ie=UTF8']]

    for line in results:
        category_url = line[0]
        start_rank = last_rank_scraped(category_url)
        entry = [category_url, start_rank]

        # A start rank other than 1 means the last attempt was not completed.
        if start_rank != 1:
            categories_to_scrape.append(entry)
            continue

        # Queue categories that have never been scraped.  A truthiness check
        # covers any empty result (tuple, list or None), not just ``()``.
        scrape_date_results = dbdo.get_last_scrape_date(category_url)
        if not scrape_date_results:
            categories_to_scrape.append(entry)
            continue

        # Queue categories whose last scrape is older than ``days``.
        # NOTE(review): relies on day_delta being negative for past dates.
        last_scrape = scrape_date_results[0][0]
        if helpers.day_delta(last_scrape) < -days:
            categories_to_scrape.append(entry)

    return categories_to_scrape
def last_rank_scraped(category_url):
    """Return the rank reached by the latest scrape if it was incomplete.

    The highest rank stored for the latest scrape date is compared with the
    highest rank ever stored for the category.  If they differ, the latest
    run is considered incomplete and its highest rank is returned.
    Otherwise, or when any of the lookups returns no rows, 1 is returned.

    Args:
        category_url: URL identifying the category in the database.

    Returns:
        The highest rank of the latest scrape when that scrape looks
        incomplete, else 1.
    """
    # Date of the most recent scrape.  Truthiness checks are used throughout
    # so that any empty result (tuple, list or None) counts as "no rows".
    scrape_date_results = dbdo.get_last_scrape_date(category_url)
    if not scrape_date_results:
        return 1
    last_scrape = scrape_date_results[0][0]

    # Highest rank stored for that most recent scrape.
    highest_rank_results = dbdo.get_highest_rank(category_url, last_scrape)
    if not highest_rank_results:
        return 1
    highest_rank = highest_rank_results[0][0]

    # Highest rank ever stored for the category.
    max_rank_results = dbdo.get_highest_rank(category_url)
    if not max_rank_results:
        return 1
    max_rank = max_rank_results[0][0]

    # A mismatch means the latest run stopped before reaching the old maximum.
    return highest_rank if highest_rank != max_rank else 1