def test_RecognizerController():
    """Test ``RecognizerController`` class."""
    from clair.textprocessing import RecognizerController
    from clair.coredata import DataStore

    data_dir = relative("../../example-data")
    data = DataStore()
    data.read_data(data_dir)

    controller = RecognizerController()

    #Create new recognizers and train them
    controller.train_recognizers(data.products, data.listings)

    #Save and load the newly created recognizers to/from disk.
    controller.write_recognizers(data_dir)
    controller = RecognizerController()
    controller.read_recognizers(data_dir)
    #Save recognizers to disk, using the internal file name.
    controller.write_recognizers()

    #Iterate over all listings and recognize products
    controller.recognize_products(data.listings.index, data.listings)

    #TODO: assertions
#    data.write_listings()
    print "finished"

def test_FeatureExtractor():
    """Test ``FeatureExtractor`` class."""
    from clair.textprocessing import FeatureExtractor
    from clair.coredata import DataStore

    data_dir = relative("../../example-data")
    data = DataStore()
    data.read_data(data_dir)
    listing = data.listings.ix["eb-110685959294"]

    #Words to test different extraction functionality:
    #from title and description, test entities,
    feature_words = ["nikon", "photo", "d90", u"blitzgerät",
                     #seller, item specifics,
                     "photo-porst-memmingen", "mpn",
                     #and words that are not in the listing.
                     "foo", "bar"]
    extractor = FeatureExtractor(feature_words)

    features = extractor.extract_features(listing)
    print features

    assert features['contains-photo'] == True
    assert features['contains-photo-porst-memmingen'] == True
    assert features['contains-nikon'] == True
    assert features['contains-mpn'] == True
    assert features['contains-d90'] == True
    assert features[u'contains-blitzgerät'] == True
    assert features['contains-foo'] == False
    assert features['contains-bar'] == False

    print "finished"

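#A minimal sketch of the feature representation that the assertions above
#expect: ``extract_features`` presumably returns a dict mapping
#"contains-<word>" to a boolean for every feature word, by searching the
#listing's text fields. The helper below is illustrative only; the field
#names and the simple substring matching are assumptions, not the real
#implementation.
def sketch_extract_features(feature_words, listing):
    """Illustrative only: build 'contains-<word>' features from a listing."""
    text_fields = ["title", "description", "seller", "item_specifics"]
    text = u" ".join(unicode(listing.get(field, u"")) for field in text_fields)
    text = text.lower()
    return {u"contains-" + word: (word in text) for word in feature_words}
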
def test_PriceEstimator_compute_product_occurrence_matrix():
    "Test construction of matrix for linear least square algorithm."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    test_listings = data.listings.ix[0:20]
    print test_listings
    print test_listings.to_string(columns=["products", "price"])

    product_ids = [u'nikon-d70', u'nikon-d90', u'nikon-sb-24', u'nikon-sb-26',
                   u'nikon-18-70-f/3.5-4.5--1', u'nikon-18-105-f/3.5-5.6--1',
                   u'nikon-28-85-f/3.5-4.5--1']

    estimator = PriceEstimator()
    matrix, prices, listing_ids, product_ids = \
        estimator.compute_product_occurrence_matrix(test_listings, product_ids)

    print
    print "matrix:\n", matrix
    print "matrix rank:", np.linalg.matrix_rank(matrix)
    print "number products:", len(product_ids)
    print "prices:\n", prices
    print "listing_ids:\n", listing_ids
    print "product_ids:\n", product_ids

    #TODO: assertions
    print "finished"

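#A minimal sketch of the matrix that the test above inspects: one row per
#listing, one column per product, a 1.0 where the product occurs in the
#listing, and the listing's total price as the right hand side. The column
#names ("products", "price") are assumptions for illustration; this is not
#the real ``compute_product_occurrence_matrix`` implementation.
def sketch_occurrence_matrix(listings, product_ids):
    """Illustrative only: build the occurrence matrix A and price vector b."""
    matrix = np.zeros((len(listings), len(product_ids)))
    prices = np.zeros(len(listings))
    for irow, (_, listing) in enumerate(listings.iterrows()):
        for icol, prod in enumerate(product_ids):
            if prod in listing["products"]:
                matrix[irow, icol] = 1.0
        prices[irow] = listing["price"]
    return matrix, prices
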
def test_ProductRecognizer():
    """Test ``ProductRecognizer`` class."""
    from clair.textprocessing import ProductRecognizer, split_random
    from clair.coredata import DataStore

    print "start"
    data_dir = relative("../../example-data")
    data = DataStore()
    data.read_data(data_dir)

    finder = ProductRecognizer("nikon-d70")

    print "Test: filter_trainig_samples"
    samples, _, _ = finder.filter_trainig_samples(data.listings)
    train_samples = samples
    print "Number training samples:", len(samples)
#    print samples

    #Test if search for training samples worked
    assert len(samples) > 100
    assert all(samples["training_sample"] == 1.0)
    pe = samples["products"].map(lambda l: "nikon-d70" in l)
    pa = samples["products_absent"].map(lambda l: "nikon-d70" in l)
    assert all(pe | pa)

    print "\nTest: filter_candidate_listings"
    samples = cand_samples = finder.filter_candidate_listings(data.listings)
    print "Number candidate samples:", len(samples)

    #Test if filter for candidate samples worked
    assert len(samples) > 10
    assert all(samples["training_sample"] != 1.0)
    pe = samples["expected_products"].map(lambda l: "nikon-d70" in l)
    assert all(pe)

    print "\nTest: train_finder, compute_accuracy"
    train_set, test_set = split_random(train_samples, 0.8)
    finder.train_finder(train_set)
    finder.compute_accuracy(test_set)

    print "\nTest: contains_product"
    for i, (_, listing) in enumerate(cand_samples.iterrows()):
        if i >= 10:
            break
        contains = finder.contains_product(listing)
        print listing["title"]
        print "Contains", finder.product_id, ":", contains
        print

    print "finished"

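#A minimal sketch of what ``split_random`` is expected to do in the test
#above: shuffle the rows of a DataFrame and split them into two disjoint
#frames, the first holding roughly ``frac`` of the rows. This is an
#assumption about its behaviour, written for illustration only.
def sketch_split_random(frame, frac):
    """Illustrative only: random train/test split of a DataFrame."""
    index = np.random.permutation(frame.index)
    n_first = int(round(len(index) * frac))
    return frame.ix[index[:n_first]], frame.ix[index[n_first:]]
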
def test_PriceEstimator_create_prices_lstsq_soln_1():
    "Test creation of price records with real data."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    #Use all data as test data
#    listings = data.listings
    product_ids = [p.id for p in data.products
                   if not p.id.startswith("xxx-unknown")]

    #Take a small amount of test data.
    listings = data.listings.ix[0:200]
#    product_ids = [u'nikon-d70', u'nikon-d90', u'nikon-sb-24', u'nikon-sb-26',
#                   u'nikon-18-70-f/3.5-4.5--1', u'nikon-18-105-f/3.5-5.6--1',
#                   u'nikon-28-85-f/3.5-4.5--1']

    print listings
#    print listings.to_string(columns=["products", "price"])

    estimator = PriceEstimator()

    #Create matrix and vectors for linear least square
    matrix, listing_prices, listing_ids, product_ids = \
        estimator.compute_product_occurrence_matrix(listings, product_ids)
#    print
#    print "matrix:\n", matrix
#    print "matrix rank:", np.linalg.matrix_rank(matrix)
#    print "number products:", len(product_ids)
#    print "listing_prices:\n", listing_prices
#    print "listing_ids:\n", listing_ids
#    print "product_ids:\n", product_ids

    #Compute average product prices
    product_prices, good_rows, good_cols, problem_products = \
        estimator.solve_prices_lstsq(matrix, listing_prices,
                                     listing_ids, product_ids)

    #Create price records
    prices = estimator.create_prices_lstsq_soln(matrix, listing_prices,
                                                listing_ids, product_prices,
                                                product_ids, good_rows,
                                                good_cols, listings)
#    print prices.to_string()

    #TODO: assertions
    print "finished"

def test_PriceEstimator_find_observed_prices():
    "Test price computation for listings with only a single product."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    test_listings = data.listings.ix[0:20]
    print test_listings

    estimator = PriceEstimator()
    prices = estimator.find_observed_prices(test_listings)
    print prices.to_string()

    #TODO: assertions
    print "finished"

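#A minimal sketch of the idea behind ``find_observed_prices`` as described
#by the test above: a listing that contains exactly one product directly
#"observes" that product's price. The record layout below is an assumption
#for illustration; the real method returns a full price frame.
def sketch_observed_prices(listings):
    """Illustrative only: collect prices of single-product listings."""
    observed = []
    for _, listing in listings.iterrows():
        if len(listing["products"]) == 1:
            observed.append((listing["products"][0], listing["price"]))
    return observed
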
def test_PriceEstimator_compute_prices_1():
    "Test main method for creation of price records with real data."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    #Use all data as test data
    listings = data.listings
#    product_ids = [p.id for p in data.products
#                   if not p.id.startswith("xxx-unknown")]

    #Take a small amount of test data.
#    listings = data.listings.ix[0:50]
#    product_ids = [u'nikon-d70', u'nikon-d90', u'nikon-sb-24', u'nikon-sb-26',
#                   u'nikon-18-70-f/3.5-4.5--1', u'nikon-18-105-f/3.5-5.6--1',
#                   u'nikon-28-85-f/3.5-4.5--1']

#    print listings
    print listings.to_string(columns=["products", "price"])

    estimator = PriceEstimator()
    prices = estimator.compute_prices(listings, data.products,
                                      time_start=None, time_end=None,
                                      avg_period="week")
#    print prices.to_string()

    prices = prices.sort("time")

    prices_d90 = prices.ix[prices["product"] == "nikon-d90"]
    pl.plot(prices_d90["time"].tolist(), prices_d90["price"].tolist())

    prices_sb26 = prices.ix[prices["product"] == "nikon-sb-26"]
    prices_sb26.set_index("time", inplace=True, verify_integrity=False)
    prices_sb26["price"].plot()

    prices_sb24 = prices.ix[prices["product"] == "nikon-sb-24"]
    prices_sb24.set_index("time", inplace=True, verify_integrity=False)
    prices_sb24["price"].plot()
#    pl.plot(prices_sb24["time"], prices_d90["price"])

#    pl.show()

    #TODO: assertions
    print "finished"

def experiment_update_all_listings():
    """Update all listings."""
    from clair.coredata import DataStore
    from clair.network import EbayConnector

    print "===================================================================="
    print "                       Updating all listings!                       "
    print "===================================================================="

    ds = DataStore()
    ec = EbayConnector(relative("../../example-data/python-ebay.apikey"))
    ds.read_data(relative("../../example-data"))

#    print ds.listings["description"]["eb-150850751507"]

    print "Updating", len(ds.listings), "listings..."
    listings_upd = ec.update_listings(ds.listings)
    ds.merge_listings(listings_upd)
    ds.write_listings()

    print "finished"

def test_PriceEstimator_solve_prices_lstsq_1():
    "Test linear least square algorithm with real data."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    #Take a small amount of test data.
    listings = data.listings.ix[0:50]
#    listings = data.listings
#    product_ids = [p.id for p in data.products]
    product_ids = [u'nikon-d70', u'nikon-d90', u'nikon-sb-24', u'nikon-sb-26',
                   u'nikon-18-70-f/3.5-4.5--1', u'nikon-18-105-f/3.5-5.6--1',
                   u'nikon-28-85-f/3.5-4.5--1']

    print listings
    print listings.to_string(columns=["products", "price"])

    estimator = PriceEstimator()

    #Create matrix and vectors for linear least square
    matrix, listing_prices, listing_ids, product_ids = \
        estimator.compute_product_occurrence_matrix(listings, product_ids)

    print
    print "matrix:\n", matrix
    print "matrix rank:", np.linalg.matrix_rank(matrix)
    print "number products:", len(product_ids)
    print "listing_prices:\n", listing_prices
    print "listing_ids:\n", listing_ids
    print "product_ids:\n", product_ids

    product_prices, good_rows, good_cols, problem_products = \
        estimator.solve_prices_lstsq(matrix, listing_prices,
                                     listing_ids, product_ids)
    print "product_prices:\n", product_prices * 0.7

    #TODO: assertions
    print "finished"

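#A minimal sketch of the least squares step that ``solve_prices_lstsq`` is
#tested against above: with the occurrence matrix A (listings x products)
#and the listing price vector b, the product price vector x solves
#A * x ~ b in the least squares sense. The real method additionally returns
#usable rows/columns and problematic products; this sketch does not.
def sketch_solve_prices_lstsq(matrix, listing_prices):
    """Illustrative only: estimate product prices by linear least squares."""
    product_prices, residuals, rank, sing_vals = \
        np.linalg.lstsq(matrix, listing_prices)
    return product_prices
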
class DaemonMain(object):
    """Main object for operation without a GUI (daemon)."""
    def __init__(self, conf_dir, data_dir, data_store=None):
        self.data_dir = data_dir
        self.server = EbayConnector(path.join(conf_dir, "python-ebay.apikey"))
        self.data = DataStore() if data_store is None else data_store
        self.recognizers = RecognizerController()

    def compute_next_due_time(self, curr_time, recurrence_pattern,
                              add_random=False):
        """
        Compute next due time for recurrent tasks.

        Parameters
        ----------
        curr_time : datetime
            Start time of the recurrence. Current time should be used.

        recurrence_pattern: str
            How often should the task be executed? One of:
                * "m", "month", "monthly"
                * "w", "week", "weekly"
                * "d", "day", "daily"
                * "h", "hour", "hourly"

        add_random: bool
            If ``True``, add a random amount of time to the computed due
            time, to avoid load spikes.
            If ``False``, the computed times are at the start of the
            interval, for example at 00:00 o'clock for "daily" recurrence.

        Returns
        -------
        datetime
            The new due time
        """
        bymonth = None; bymonthday = None; byweekday = None; byhour = None
        byminute = 0; bysecond = 0
        recurrence_pattern = recurrence_pattern.lower()
        if recurrence_pattern in ["m", "month", "monthly"]:
            freq = dateutil.rrule.MONTHLY
            byhour = 0
            bymonthday = 1
            rand_max = 15 * 24 * 60 * 60 #sec - 15 days
        elif recurrence_pattern in ["w", "week", "weekly"]:
            freq = dateutil.rrule.WEEKLY
            byhour = 0
            byweekday = 0
            rand_max = 3.5 * 24 * 60 * 60 #sec - 3.5 days
        elif recurrence_pattern in ["d", "day", "daily"]:
            freq = dateutil.rrule.DAILY
            byhour = 0
            rand_max = 12 * 60 * 60 #sec - 12 hours
        elif recurrence_pattern in ["h", "hour", "hourly"]:
            freq = dateutil.rrule.HOURLY
            rand_max = 30 * 60 #sec - 30 minutes
        else:
            raise ValueError("Unknown recurrence_pattern: " +
                             str(recurrence_pattern))

        rrule = dateutil.rrule.rrule(freq=freq, dtstart=curr_time, count=2,
                                     bymonth=bymonth, bymonthday=bymonthday,
                                     byweekday=byweekday, byhour=byhour,
                                     byminute=byminute, bysecond=bysecond,
                                     cache=True)
        new_time = rrule.after(curr_time)

        #Add random component to the due time.
        if add_random:
            rand_secs = randint(0, rand_max)
            new_time += timedelta(seconds=rand_secs)

        return new_time

    def compute_next_wakeup_time(self):
        """
        Compute the time when the application needs to wake up to execute
        the next task.

        Loops over all tasks in ``self.data.tasks``.

        Returns
        -------
        datetime, float
            * Time when the next task is due
            * Number of seconds to sleep until the next task is due.
        """
        wakeup_time = datetime(9999, 12, 31) #The last representable date
        for task in self.data.tasks:
            wakeup_time = min(task.due_time, wakeup_time)
        sleep_interval = wakeup_time - datetime.utcnow()
        sleep_sec = max(sleep_interval.total_seconds(), 0.)
        return wakeup_time, sleep_sec

    def execute_search_task(self, task):
        """Search for new listings. Executes a search task."""
        assert isinstance(task, SearchTask)
        logging.debug("Executing search task: '{id}'".format(id=task.id))
        #Get new listings from the server
        lst_found = self.server.find_listings(
            keywords=task.query_string,
            n_listings=task.n_listings,
            price_min=task.price_min, price_max=task.price_max,
            currency=task.currency)
        #Fill in additional information, mainly for product recognition
        lst_found["search_tasks"].fill([task.id])
        lst_found["expected_products"].fill(task.expected_products)
        lst_found["server"] = task.server

        #Sane handling of listings that are found by multiple search tasks.
        #Get IDs of listings that have already been found by other tasks
        common_ids = list(set(lst_found.index).intersection(
                          set(self.data.listings.index)))
        for idx in common_ids:
            #Union of the "search_tasks" lists of existing and new listings
            tasks = lst_found["search_tasks"][idx] + \
                    self.data.listings["search_tasks"][idx]
            tasks = list(set(tasks))
            tasks.sort()
            lst_found["search_tasks"][idx] = tasks
            #Union of the "expected_products" lists of existing and new listings
            prods = lst_found["expected_products"][idx] + \
                    self.data.listings["expected_products"][idx]
            prods = list(set(prods))
            prods.sort()
            lst_found["expected_products"][idx] = prods

        self.data.merge_listings(lst_found)
        return list(lst_found["id"])

    def execute_update_task(self, task):
        """
        Download the complete information of known listings.
        Executes an update task.

        Tries to recognize products in the updated listings.
        """
        assert isinstance(task, UpdateTask)
        logging.debug("Executing update task: '{id}'".format(id=task.id))
        #Download the listings that belong to this task
        lst_update = self.data.listings.ix[task.listings]
        lst_update = self.server.update_listings(lst_update)
        lst_update["server"] = task.server
#        lst_update["final_price"] = True #Use as flag, just to be sure
        self.data.merge_listings(lst_update)

        #Recognize products
        self.recognizers.recognize_products(lst_update.index,
                                            self.data.listings)
        return list(lst_update["id"])

    def execute_tasks(self):
        """
        Execute the due tasks in ``self.data.tasks``.
        Removes single shot tasks.
        """
        logging.info("Executing due tasks.")
        now = datetime.utcnow()
        dead_tasks = []
        for itask, task in enumerate(self.data.tasks):
            #Test if the task is due
            if task.due_time > now:
                continue
            logging.info("Executing task: {}".format(task.id))
            #Search for new listings
            if isinstance(task, SearchTask):
                self.execute_search_task(task)
            #Update known listings
            elif isinstance(task, UpdateTask):
                self.execute_update_task(task)
            else:
                raise TypeError("Unknown task type:" + str(type(task)) +
                                "\ntask:\n" + str(task))
            #Mark non-recurrent tasks for removal
            if task.recurrence_pattern is None:
                dead_tasks.append(itask)
            #Compute new due time for recurrent tasks
            else:
                task.due_time = self.compute_next_due_time(
                    datetime.utcnow(), task.recurrence_pattern, True)

        #Remove dead (non-recurrent) tasks, after they have been executed.
        dead_tasks.reverse()
        for itask in dead_tasks:
            del self.data.tasks[itask]

    def create_final_update_tasks(self):
        """
        Create tasks that update the listing information shortly after the
        auctions end. We want to know the final price of each auction.

        20 auctions are updated at once.
        """
        logging.info("Creating update tasks, to get final prices.")
        if len(self.data.listings) == 0:
            return

        #Create administration information if it doesn't exist
        try:
            self.data.listings["final_update_pending"]
        except KeyError:
            self.data.listings["final_update_pending"] = 0.0

        #Get listings where the final price is unknown and
        #where no final update is pending.
        #Note! Three-valued logic: 1., 0., nan
        where_no_final = ((self.data.listings["final_price"] != True) &
                          (self.data.listings["final_update_pending"] != True))
        no_final = self.data.listings[where_no_final]
        no_final = no_final.sort("time")
        if len(no_final) == 0:
            return

        #Group listings into groups of 20 (max for Ebay get-items request)
        n_group = 20
        elem_nums = range(len(no_final))
        group_nums = [int(ne / n_group) for ne in elem_nums]
        groups = no_final.groupby(group_nums)

        #Create one update task for each group
        update_tasks = []
        id_start = "update-"
        for i, group in groups:
            latest_time = group["time"].max()
            due_time = latest_time + timedelta(minutes=30)
            listing_ids = group["id"]
            task = UpdateTask(id=id_start + due_time.isoformat() + "-" + str(i),
                              due_time=due_time,
                              server=None, recurrence_pattern=None,
                              listings=listing_ids)
#            print task
            update_tasks.append(task)

        self.data.add_tasks(update_tasks)

        #Remember the listings for which update tasks were just created
        self.data.listings["final_update_pending"][where_no_final] = True

    def run_daemon(self, nloops=-1):
        """
        Simple main loop that downloads listings.

        To run a daemon from the command line call::

            CommandLineHandler.daemon_main()

        Parameters
        ----------
        nloops : int
            Number of cycles in the main loop. -1 means: loop infinitely.
        """
        #Only load listings from one month in the past to one month in the future
        date_start = datetime.utcnow() - timedelta(days=30)
        date_end = datetime.utcnow() + timedelta(days=30)
        self.data.read_data(self.data_dir, date_start, date_end)
        self.recognizers.read_recognizers(self.data_dir)
        self.create_final_update_tasks()

        while nloops:
            #Sleep until a task is due
            next_due_time, sleep_secs = self.compute_next_wakeup_time()
            logging.info("Sleeping until: {}".format(next_due_time))
            time.sleep(sleep_secs)

            self.execute_tasks()
            self.create_final_update_tasks()
            self.data.write_listings()
            self.data.write_tasks()

            nloops -= 1
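
#A minimal usage sketch for ``DaemonMain.compute_next_due_time``. The
#configuration and data directories below are placeholders; the constructor
#expects a "python-ebay.apikey" file in the configuration directory. With
#``add_random=False`` each due time falls on the start of the next interval.
def sketch_next_due_times():
    """Illustrative only: expected due times for the recurrence patterns."""
    daemon = DaemonMain("example-conf", "example-data")   #placeholder paths
    curr = datetime(2013, 5, 2, 15, 30)
    print daemon.compute_next_due_time(curr, "hourly")    #-> 2013-05-02 16:00:00
    print daemon.compute_next_due_time(curr, "daily")     #-> 2013-05-03 00:00:00
    print daemon.compute_next_due_time(curr, "weekly")    #-> 2013-05-06 00:00:00
    print daemon.compute_next_due_time(curr, "monthly")   #-> 2013-06-01 00:00:00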