def test_RecognizerController():
    """Test ``RecognizerController`` class."""
    from clair.textprocessing import RecognizerController
    from clair.coredata import DataStore

    data_dir = relative("../../example-data")
    data = DataStore()
    data.read_data(data_dir)

    controller = RecognizerController()

    #Create new recognizers and train them
    controller.train_recognizers(data.products, data.listings)

    #Save and load the newly created recognizers to/from disk.
    controller.write_recognizers(data_dir)
    controller = RecognizerController()
    controller.read_recognizers(data_dir)
    #Save recognizers to disk, using the internal file name.
    controller.write_recognizers()

    #Iterate over all listings and recognize products
    controller.recognize_products(data.listings.index, data.listings)

    #TODO: assertions
#    data.write_listings()
    print "finished"

def test_FeatureExtractor():
    """Test ``FeatureExtractor`` class."""
    from clair.textprocessing import FeatureExtractor
    from clair.coredata import DataStore

    data_dir = relative("../../example-data")
    data = DataStore()
    data.read_data(data_dir)
    listing = data.listings.ix["eb-110685959294"]

    #Words to test different extraction functionality:
    #from title and description, test entities,
    feature_words = ["nikon", "photo", "d90", u"blitzgerät",
                     #seller, item specifics,
                     "photo-porst-memmingen", "mpn",
                     #and words that are not in the listing.
                     "foo", "bar"]
    extractor = FeatureExtractor(feature_words)

    features = extractor.extract_features(listing)
    print features

    assert features['contains-photo'] == True
    assert features['contains-photo-porst-memmingen'] == True
    assert features['contains-nikon'] == True
    assert features['contains-mpn'] == True
    assert features['contains-d90'] == True
    assert features[u'contains-blitzgerät'] == True
    assert features['contains-foo'] == False
    assert features['contains-bar'] == False

    print "finished"

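#A minimal sketch of the feature representation that the assertions above
#expect: ``extract_features`` presumably returns a dict mapping
#"contains-<word>" to a boolean for every feature word, by searching the
#listing's text fields. The helper below is illustrative only; the field
#names and the simple substring matching are assumptions, not the real
#implementation.
def sketch_extract_features(feature_words, listing):
    """Illustrative only: build 'contains-<word>' features from a listing."""
    text_fields = ["title", "description", "seller", "item_specifics"]
    text = u" ".join(unicode(listing.get(field, u"")) for field in text_fields)
    text = text.lower()
    return {u"contains-" + word: (word in text) for word in feature_words}
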
def test_PriceEstimator_compute_product_occurrence_matrix():
    "Test construction of matrix for linear least square algorithm."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    test_listings = data.listings.ix[0:20]
    print test_listings
    print test_listings.to_string(columns=["products", "price"])

    product_ids = [u'nikon-d70', u'nikon-d90', u'nikon-sb-24', u'nikon-sb-26',
                   u'nikon-18-70-f/3.5-4.5--1', u'nikon-18-105-f/3.5-5.6--1',
                   u'nikon-28-85-f/3.5-4.5--1']

    estimator = PriceEstimator()
    matrix, prices, listing_ids, product_ids = \
        estimator.compute_product_occurrence_matrix(test_listings, product_ids)

    print
    print "matrix:\n", matrix
    print "matrix rank:", np.linalg.matrix_rank(matrix)
    print "number products:", len(product_ids)
    print "prices:\n", prices
    print "listing_ids:\n", listing_ids
    print "product_ids:\n", product_ids

    #TODO: assertions
    print "finished"

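#A minimal sketch of the matrix that the test above inspects: one row per
#listing, one column per product, a 1.0 where the product occurs in the
#listing, and the listing's total price as the right hand side. The column
#names ("products", "price") are assumptions for illustration; this is not
#the real ``compute_product_occurrence_matrix`` implementation.
def sketch_occurrence_matrix(listings, product_ids):
    """Illustrative only: build the occurrence matrix A and price vector b."""
    matrix = np.zeros((len(listings), len(product_ids)))
    prices = np.zeros(len(listings))
    for irow, (_, listing) in enumerate(listings.iterrows()):
        for icol, prod in enumerate(product_ids):
            if prod in listing["products"]:
                matrix[irow, icol] = 1.0
        prices[irow] = listing["price"]
    return matrix, prices
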
def test_ProductRecognizer():
    """Test ``ProductRecognizer`` class."""
    from clair.textprocessing import ProductRecognizer, split_random
    from clair.coredata import DataStore

    print "start"
    data_dir = relative("../../example-data")
    data = DataStore()
    data.read_data(data_dir)

    finder = ProductRecognizer("nikon-d70")

    print "Test: filter_trainig_samples"
    samples, _, _ = finder.filter_trainig_samples(data.listings)
    train_samples = samples
    print "Number training samples:", len(samples)
#    print samples

    #Test if search for training samples worked
    assert len(samples) > 100
    assert all(samples["training_sample"] == 1.0)
    pe = samples["products"].map(lambda l: "nikon-d70" in l)
    pa = samples["products_absent"].map(lambda l: "nikon-d70" in l)
    assert all(pe | pa)

    print "\nTest: filter_candidate_listings"
    samples = cand_samples = finder.filter_candidate_listings(data.listings)
    print "Number candidate samples:", len(samples)

    #Test if filter for candidate samples worked
    assert len(samples) > 10
    assert all(samples["training_sample"] != 1.0)
    pe = samples["expected_products"].map(lambda l: "nikon-d70" in l)
    assert all(pe)

    print "\nTest: train_finder, compute_accuracy"
    train_set, test_set = split_random(train_samples, 0.8)
    finder.train_finder(train_set)
    finder.compute_accuracy(test_set)

    print "\nTest: contains_product"
    for i, (_, listing) in enumerate(cand_samples.iterrows()):
        if i >= 10:
            break
        contains = finder.contains_product(listing)
        print listing["title"]
        print "Contains", finder.product_id, ":", contains
        print

    print "finished"

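#A minimal sketch of what ``split_random`` is expected to do in the test
#above: shuffle the rows of a DataFrame and split them into two disjoint
#frames, the first holding roughly ``frac`` of the rows. This is an
#assumption about its behaviour, written for illustration only.
def sketch_split_random(frame, frac):
    """Illustrative only: random train/test split of a DataFrame."""
    index = np.random.permutation(frame.index)
    n_first = int(round(len(index) * frac))
    return frame.ix[index[:n_first]], frame.ix[index[n_first:]]
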
def test_PriceEstimator_create_prices_lstsq_soln_1():
    "Test creation of price records with real data."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    #Use all data as test data
#    listings = data.listings
    product_ids = [p.id for p in data.products
                   if not p.id.startswith("xxx-unknown")]

    #Take a small amount of test data.
    listings = data.listings.ix[0:200]
#    product_ids = [u'nikon-d70', u'nikon-d90', u'nikon-sb-24', u'nikon-sb-26',
#                   u'nikon-18-70-f/3.5-4.5--1', u'nikon-18-105-f/3.5-5.6--1',
#                   u'nikon-28-85-f/3.5-4.5--1']

    print listings
#    print listings.to_string(columns=["products", "price"])

    estimator = PriceEstimator()

    #Create matrix and vectors for linear least square
    matrix, listing_prices, listing_ids, product_ids = \
        estimator.compute_product_occurrence_matrix(listings, product_ids)
#    print
#    print "matrix:\n", matrix
#    print "matrix rank:", np.linalg.matrix_rank(matrix)
#    print "number products:", len(product_ids)
#    print "listing_prices:\n", listing_prices
#    print "listing_ids:\n", listing_ids
#    print "product_ids:\n", product_ids

    #Compute average product prices
    product_prices, good_rows, good_cols, problem_products = \
        estimator.solve_prices_lstsq(matrix, listing_prices,
                                     listing_ids, product_ids)

    #Create price records
    prices = estimator.create_prices_lstsq_soln(matrix, listing_prices,
                                                listing_ids, product_prices,
                                                product_ids, good_rows,
                                                good_cols, listings)
#    print prices.to_string()

    #TODO: assertions
    print "finished"

def test_PriceEstimator_find_observed_prices():
    "Test price computation for listings with only a single product."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    test_listings = data.listings.ix[0:20]
    print test_listings

    estimator = PriceEstimator()
    prices = estimator.find_observed_prices(test_listings)
    print prices.to_string()

    #TODO: assertions
    print "finished"

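#A minimal sketch of the idea behind ``find_observed_prices`` as described
#by the test above: a listing that contains exactly one product directly
#"observes" that product's price. The record layout below is an assumption
#for illustration; the real method returns a full price frame.
def sketch_observed_prices(listings):
    """Illustrative only: collect prices of single-product listings."""
    observed = []
    for _, listing in listings.iterrows():
        if len(listing["products"]) == 1:
            observed.append((listing["products"][0], listing["price"]))
    return observed
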
def test_PriceEstimator_compute_prices_1():
    "Test main method for creation of price records with real data."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    #Use all data as test data
    listings = data.listings
#    product_ids = [p.id for p in data.products
#                   if not p.id.startswith("xxx-unknown")]

    #Take a small amount of test data.
#    listings = data.listings.ix[0:50]
#    product_ids = [u'nikon-d70', u'nikon-d90', u'nikon-sb-24', u'nikon-sb-26',
#                   u'nikon-18-70-f/3.5-4.5--1', u'nikon-18-105-f/3.5-5.6--1',
#                   u'nikon-28-85-f/3.5-4.5--1']

#    print listings
    print listings.to_string(columns=["products", "price"])

    estimator = PriceEstimator()
    prices = estimator.compute_prices(listings, data.products,
                                      time_start=None, time_end=None,
                                      avg_period="week")
#    print prices.to_string()

    prices = prices.sort("time")

    prices_d90 = prices.ix[prices["product"] == "nikon-d90"]
    pl.plot(prices_d90["time"].tolist(), prices_d90["price"].tolist())

    prices_sb26 = prices.ix[prices["product"] == "nikon-sb-26"]
    prices_sb26.set_index("time", inplace=True, verify_integrity=False)
    prices_sb26["price"].plot()

    prices_sb24 = prices.ix[prices["product"] == "nikon-sb-24"]
    prices_sb24.set_index("time", inplace=True, verify_integrity=False)
    prices_sb24["price"].plot()
#    pl.plot(prices_sb24["time"], prices_d90["price"])

#    pl.show()

    #TODO: assertions
    print "finished"

def experiment_update_all_listings():
    """Update all listings."""
    from clair.coredata import DataStore
    from clair.network import EbayConnector

    print "===================================================================="
    print "                       Updating all listings!                       "
    print "===================================================================="

    ds = DataStore()
    ec = EbayConnector(relative("../../example-data/python-ebay.apikey"))
    ds.read_data(relative("../../example-data"))

#    print ds.listings["description"]["eb-150850751507"]

    print "Updating", len(ds.listings), "listings..."
    listings_upd = ec.update_listings(ds.listings)
    ds.merge_listings(listings_upd)
    ds.write_listings()

    print "finished"

def test_PriceEstimator_solve_prices_lstsq_1():
    "Test linear least square algorithm with real data."
    from clair.coredata import DataStore
    from clair.prices import PriceEstimator

    print "start"
    data = DataStore()
    data.read_data(relative("../../example-data"))

    #Take a small amount of test data.
    listings = data.listings.ix[0:50]
#    listings = data.listings
#    product_ids = [p.id for p in data.products]
    product_ids = [u'nikon-d70', u'nikon-d90', u'nikon-sb-24', u'nikon-sb-26',
                   u'nikon-18-70-f/3.5-4.5--1', u'nikon-18-105-f/3.5-5.6--1',
                   u'nikon-28-85-f/3.5-4.5--1']

    print listings
    print listings.to_string(columns=["products", "price"])

    estimator = PriceEstimator()

    #Create matrix and vectors for linear least square
    matrix, listing_prices, listing_ids, product_ids = \
        estimator.compute_product_occurrence_matrix(listings, product_ids)

    print
    print "matrix:\n", matrix
    print "matrix rank:", np.linalg.matrix_rank(matrix)
    print "number products:", len(product_ids)
    print "listing_prices:\n", listing_prices
    print "listing_ids:\n", listing_ids
    print "product_ids:\n", product_ids

    product_prices, good_rows, good_cols, problem_products = \
        estimator.solve_prices_lstsq(matrix, listing_prices,
                                     listing_ids, product_ids)
    print "product_prices:\n", product_prices * 0.7

    #TODO: assertions
    print "finished"

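#A minimal sketch of the least squares step that ``solve_prices_lstsq`` is
#tested against above: with the occurrence matrix A (listings x products)
#and the listing price vector b, the product price vector x solves
#A * x ~ b in the least squares sense. The real method additionally returns
#usable rows/columns and problematic products; this sketch does not.
def sketch_solve_prices_lstsq(matrix, listing_prices):
    """Illustrative only: estimate product prices by linear least squares."""
    product_prices, residuals, rank, sing_vals = \
        np.linalg.lstsq(matrix, listing_prices)
    return product_prices
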
class DaemonMain(object):
    """Main object for operation without a GUI (daemon)."""
    def __init__(self, conf_dir, data_dir, data_store=None):
        self.data_dir = data_dir
        self.server = EbayConnector(path.join(conf_dir, "python-ebay.apikey"))
        self.data = DataStore() if data_store is None else data_store
        self.recognizers = RecognizerController()

    def compute_next_due_time(self, curr_time, recurrence_pattern,
                              add_random=False):
        """
        Compute next due time for recurrent tasks.

        Parameters
        ----------
        curr_time : datetime
            Start time of the recurrence. Current time should be used.

        recurrence_pattern: str
            How often should the task be executed? One of:
                * "m", "month", "monthly"
                * "w", "week", "weekly"
                * "d", "day", "daily"
                * "h", "hour", "hourly"

        add_random: bool
            If ``True``, add a random amount of time to the computed due
            time, to avoid load spikes.
            If ``False``, the computed times are at the start of the
            interval, for example at 00:00 o'clock for "daily" recurrence.

        Returns
        -------
        datetime
            The new due time
        """
        bymonth = None; bymonthday = None; byweekday = None; byhour = None
        byminute = 0; bysecond = 0
        recurrence_pattern = recurrence_pattern.lower()
        if recurrence_pattern in ["m", "month", "monthly"]:
            freq = dateutil.rrule.MONTHLY
            byhour = 0
            bymonthday = 1
            rand_max = 15 * 24 * 60 * 60 #sec - 15 days
        elif recurrence_pattern in ["w", "week", "weekly"]:
            freq = dateutil.rrule.WEEKLY
            byhour = 0
            byweekday = 0
            rand_max = 3.5 * 24 * 60 * 60 #sec - 3.5 days
        elif recurrence_pattern in ["d", "day", "daily"]:
            freq = dateutil.rrule.DAILY
            byhour = 0
            rand_max = 12 * 60 * 60 #sec - 12 hours
        elif recurrence_pattern in ["h", "hour", "hourly"]:
            freq = dateutil.rrule.HOURLY
            rand_max = 30 * 60 #sec - 30 minutes
        else:
            raise ValueError("Unknown recurrence_pattern: " +
                             str(recurrence_pattern))

        rrule = dateutil.rrule.rrule(freq=freq, dtstart=curr_time, count=2,
                                     bymonth=bymonth, bymonthday=bymonthday,
                                     byweekday=byweekday, byhour=byhour,
                                     byminute=byminute, bysecond=bysecond,
                                     cache=True)
        new_time = rrule.after(curr_time)

        #Add random component to the due time.
        if add_random:
            rand_secs = randint(0, rand_max)
            new_time += timedelta(seconds=rand_secs)

        return new_time

    def compute_next_wakeup_time(self):
        """
        Compute the time when the application needs to wake up to execute
        the next task.

        Loops over all tasks in ``self.data.tasks``.

        Returns
        -------
        datetime, float
            * Time when the next task is due
            * Number of seconds to sleep until the next task is due.
        """
        wakeup_time = datetime(9999, 12, 31) #The last representable date
        for task in self.data.tasks:
            wakeup_time = min(task.due_time, wakeup_time)
        sleep_interval = wakeup_time - datetime.utcnow()
        sleep_sec = max(sleep_interval.total_seconds(), 0.)
        return wakeup_time, sleep_sec

    def execute_search_task(self, task):
        """Search for new listings. Executes a search task."""
        assert isinstance(task, SearchTask)
        logging.debug("Executing search task: '{id}'".format(id=task.id))
        #Get new listings from the server
        lst_found = self.server.find_listings(
            keywords=task.query_string,
            n_listings=task.n_listings,
            price_min=task.price_min, price_max=task.price_max,
            currency=task.currency)
        #Fill in additional information, mainly for product recognition
        lst_found["search_tasks"].fill([task.id])
        lst_found["expected_products"].fill(task.expected_products)
        lst_found["server"] = task.server

        #Sane handling of listings that are found by multiple search tasks.
        #Get IDs of listings that have already been found by other tasks
        common_ids = list(set(lst_found.index).intersection(
                          set(self.data.listings.index)))
        for idx in common_ids:
            #Union of the "search_tasks" lists of existing and new listings
            tasks = lst_found["search_tasks"][idx] + \
                    self.data.listings["search_tasks"][idx]
            tasks = list(set(tasks))
            tasks.sort()
            lst_found["search_tasks"][idx] = tasks
            #Union of the "expected_products" lists of existing and new listings
            prods = lst_found["expected_products"][idx] + \
                    self.data.listings["expected_products"][idx]
            prods = list(set(prods))
            prods.sort()
            lst_found["expected_products"][idx] = prods

        self.data.merge_listings(lst_found)
        return list(lst_found["id"])

    def execute_update_task(self, task):
        """
        Download the complete information of known listings.
        Executes an update task.

        Tries to recognize products in the updated listings.
        """
        assert isinstance(task, UpdateTask)
        logging.debug("Executing update task: '{id}'".format(id=task.id))
        #Download the listings that belong to this task
        lst_update = self.data.listings.ix[task.listings]
        lst_update = self.server.update_listings(lst_update)
        lst_update["server"] = task.server
#        lst_update["final_price"] = True #Use as flag, just to be sure
        self.data.merge_listings(lst_update)

        #Recognize products
        self.recognizers.recognize_products(lst_update.index,
                                            self.data.listings)
        return list(lst_update["id"])

    def execute_tasks(self):
        """
        Execute the due tasks in ``self.data.tasks``.
        Removes single shot tasks.
        """
        logging.info("Executing due tasks.")
        now = datetime.utcnow()
        dead_tasks = []
        for itask, task in enumerate(self.data.tasks):
            #Test if the task is due
            if task.due_time > now:
                continue
            logging.info("Executing task: {}".format(task.id))
            #Search for new listings
            if isinstance(task, SearchTask):
                self.execute_search_task(task)
            #Update known listings
            elif isinstance(task, UpdateTask):
                self.execute_update_task(task)
            else:
                raise TypeError("Unknown task type:" + str(type(task)) +
                                "\ntask:\n" + str(task))
            #Mark non-recurrent tasks for removal
            if task.recurrence_pattern is None:
                dead_tasks.append(itask)
            #Compute new due time for recurrent tasks
            else:
                task.due_time = self.compute_next_due_time(
                    datetime.utcnow(), task.recurrence_pattern, True)

        #Remove dead (non-recurrent) tasks, after they have been executed.
        dead_tasks.reverse()
        for itask in dead_tasks:
            del self.data.tasks[itask]

    def create_final_update_tasks(self):
        """
        Create tasks that update the listing information shortly after the
        auctions end. We want to know the final price of each auction.

        20 auctions are updated at once.
        """
        logging.info("Creating update tasks, to get final prices.")
        if len(self.data.listings) == 0:
            return

        #Create administration information if it doesn't exist
        try:
            self.data.listings["final_update_pending"]
        except KeyError:
            self.data.listings["final_update_pending"] = 0.0

        #Get listings where the final price is unknown and
        #where no final update is pending.
        #Note! Three-valued logic: 1., 0., nan
        where_no_final = ((self.data.listings["final_price"] != True) &
                          (self.data.listings["final_update_pending"] != True))
        no_final = self.data.listings[where_no_final]
        no_final = no_final.sort("time")
        if len(no_final) == 0:
            return

        #Group listings into groups of 20 (max for Ebay get-items request)
        n_group = 20
        elem_nums = range(len(no_final))
        group_nums = [int(ne / n_group) for ne in elem_nums]
        groups = no_final.groupby(group_nums)

        #Create one update task for each group
        update_tasks = []
        id_start = "update-"
        for i, group in groups:
            latest_time = group["time"].max()
            due_time = latest_time + timedelta(minutes=30)
            listing_ids = group["id"]
            task = UpdateTask(id=id_start + due_time.isoformat() + "-" + str(i),
                              due_time=due_time,
                              server=None, recurrence_pattern=None,
                              listings=listing_ids)
#            print task
            update_tasks.append(task)

        self.data.add_tasks(update_tasks)

        #Remember the listings for which update tasks were just created
        self.data.listings["final_update_pending"][where_no_final] = True

    def run_daemon(self, nloops=-1):
        """
        Simple main loop that downloads listings.

        To run a daemon from the command line call::

            CommandLineHandler.daemon_main()

        Parameters
        ----------
        nloops : int
            Number of cycles in the main loop. -1 means: loop infinitely.
        """
        #Only load listings from one month in the past to one month in the future
        date_start = datetime.utcnow() - timedelta(days=30)
        date_end = datetime.utcnow() + timedelta(days=30)
        self.data.read_data(self.data_dir, date_start, date_end)
        self.recognizers.read_recognizers(self.data_dir)
        self.create_final_update_tasks()

        while nloops:
            #Sleep until a task is due
            next_due_time, sleep_secs = self.compute_next_wakeup_time()
            logging.info("Sleeping until: {}".format(next_due_time))
            time.sleep(sleep_secs)

            self.execute_tasks()
            self.create_final_update_tasks()
            self.data.write_listings()
            self.data.write_tasks()

            nloops -= 1
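
#A minimal usage sketch for ``DaemonMain.compute_next_due_time``. The
#configuration and data directories below are placeholders; the constructor
#expects a "python-ebay.apikey" file in the configuration directory. With
#``add_random=False`` each due time falls on the start of the next interval.
def sketch_next_due_times():
    """Illustrative only: expected due times for the recurrence patterns."""
    daemon = DaemonMain("example-conf", "example-data")   #placeholder paths
    curr = datetime(2013, 5, 2, 15, 30)
    print daemon.compute_next_due_time(curr, "hourly")    #-> 2013-05-02 16:00:00
    print daemon.compute_next_due_time(curr, "daily")     #-> 2013-05-03 00:00:00
    print daemon.compute_next_due_time(curr, "weekly")    #-> 2013-05-06 00:00:00
    print daemon.compute_next_due_time(curr, "monthly")   #-> 2013-06-01 00:00:00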