def calc_slope(asin, category_url):
    """Return the least-squares slope of rank-over-time for an asin.

    Steps:
    1. Build one [x, y] datapoint per scrape row: x is days since the
       first scrape (so the first point has x = 0); y is the integer rank.
    2. Accumulate n, sum_x, sum_y, sum_xy, sum_x_squared.
    3. If there are fewer than 3 datapoints, skip the asin (return None) —
       a 2-point "trend" is degenerate.
    4. slope = (n*sum_xy - sum_x*sum_y) / (n*sum_x_squared - sum_x**2)

    Raises:
        ZeroDivisionError: when every datapoint shares the same scrape day
            (denominator is zero); diagnostics are printed before re-raising.
    """
    results = dbdo.get_scrape_date_rank(category_url, asin)
    # Docstring step 3: the original only rejected 0 or 1 rows, which
    # contradicted its own spec by allowing a 2-point fit.
    if len(results) < 3:
        return None
    zero_date = results[0][0]  # first scrape date anchors x = 0
    datapoints = [[helpers.day_delta(row[0], zero_date), int(row[1])]
                  for row in results]
    num_of_points = len(datapoints)
    sum_of_x = 0.0
    sum_of_y = 0.0
    sum_of_xy = 0.0
    sum_of_x_squared = 0.0
    for x, y in datapoints:
        sum_of_x += x
        sum_of_y += y
        sum_of_xy += x * y
        sum_of_x_squared += x * x
    try:
        # Only the division can raise here, so catch exactly that.
        slope = ((num_of_points * sum_of_xy) - (sum_of_x * sum_of_y)) / \
                ((num_of_points * sum_of_x_squared) - (sum_of_x * sum_of_x))
    except ZeroDivisionError:
        # All scrapes fell on the same day: dump state for debugging,
        # then re-raise as the original did. Single-arg print(...) is
        # valid in both Python 2 and 3.
        print(datapoints)
        print(num_of_points)
        print(sum_of_xy)
        print(sum_of_x)
        print(sum_of_y)
        print(sum_of_x_squared)
        raise
    # BUG FIX: the original computed the slope but never returned it,
    # so every caller received None.
    return slope
def calc_categories_to_scrape(days=5):
    """Return [category_url, start_rank] pairs that need (re)scraping.

    A category qualifies when any of the following holds:
      * its last scrape stopped partway through (resume rank != 1),
      * it has no last-scrape date on record,
      * its last scrape is more than `days` days old.
    """
    # Hard-coded to the Health & Personal category; swap in the call
    # below to scrape every known category instead.
    candidates = [['http://www.amazon.com/s/ref=sr_hi_1?rh=n%3A3760901&ie=UTF8']]
    #candidates = dbdo.get_all_category_urls()

    def _needs_scrape(url, resume_rank):
        # An interrupted scrape is resumed immediately.
        if resume_rank != 1:
            return True
        date_rows = dbdo.get_last_scrape_date(url)
        # Never scraped before (empty-tuple sentinel from the DB layer).
        if date_rows == ():
            return True
        # Stale if the most recent scrape is older than `days` days.
        return helpers.day_delta(date_rows[0][0]) < -days

    queue = []
    for row in candidates:
        url = row[0]
        rank = last_rank_scraped(url)
        if _needs_scrape(url, rank):
            queue.append([url, rank])
    return queue