Exemple #1
0
    def find_observed_prices(self, listings_frame):
        """
        Create 'observed' prices from sold listings that contain one product.

        A listing that was sold and contains exactly one product directly
        tells the price of that product. Every such listing is converted
        into one price record; all other listings are ignored.

        Parameters
        ----------
        listings_frame : pd.DataFrame
            The listings that are searched. The columns "products" and
            "sold" are read from every row; "price", "currency",
            "condition", "time" and "id" are read from the rows that are
            converted into prices.

        Returns
        -------
        price_frame : pd.DataFrame
            One row per matching listing. The price ID is the index and is
            also kept as column "id". If no listing matches, the frame is
            empty but still contains all price columns.

        Raises
        ------
        ValueError
            If two of the created prices have the same ID
            (``verify_integrity=True``).
        """
        # Columns of a price record, in the order in which they are filled
        # below. Only needed to give an empty result the right columns.
        price_columns = [
            "price",
            "currency",
            "condition",
            "time",
            "product",
            "listing",
            "type",
            "avg_period",
            "avg_num_listings",
            "id",
        ]

        # Price data is first collected in list of dicts, that is later
        # converted to a ``DataFrame``. Each dict is a row of the ``DataFrame``.
        price_data = []
        for _, listing in listings_frame.iterrows():
            # Select sold listings with only one product
            curr_prods = listing["products"]
            if len(curr_prods) != 1 or listing["sold"] != 1.0:
                continue
            single_price_data = {
                "price": listing["price"],
                "currency": listing["currency"],
                "condition": listing["condition"],
                "time": listing["time"],
                "product": curr_prods[0],
                "listing": listing["id"],
                "type": "observed",
                # An observed price is not an average: one listing, no period.
                "avg_period": "none",
                "avg_num_listings": 1,
            }
            # The ID is computed from the other fields, so it is added last.
            single_price_data["id"] = make_price_id(single_price_data)
            price_data.append(single_price_data)

        if price_data:
            price_frame = pd.DataFrame(price_data)
        else:
            # ``pd.DataFrame([])`` has no columns at all, so selecting the
            # "id" column as index would fail. Create the columns explicitly.
            price_frame = pd.DataFrame(columns=price_columns)
        return price_frame.set_index("id", drop=False, verify_integrity=True)
Exemple #2
0
    def create_prices_lstsq_soln(
        self, matrix, listing_prices, listing_ids, product_prices, product_ids, good_rows, good_cols, listings=None
    ):
        """
        Create product prices from the results of the linear least
        squares algorithm.

        Two groups of prices are created:

        * One "average" price for every product whose price could be
          computed (see `good_cols`).
        * One price for every such product in every listing. The listing's
          price is split among its products in proportion to their average
          prices. These prices are "observed" if the listing contains only
          one product, and "estimated" otherwise.

        Parameters
        ----------
        matrix : np.array[float]
            System matrix of linear least squares problem. Each row represents
            one listing. Each column represents one product. Each entry
            represents the condition of a product in a listing. Conditions
            range from 1...0.; 1: new, 0.7: used, 0: unusable.

        listing_prices : np.array[float]
            Prices of listings, constant (known) term of equation system.

        listing_ids : np.array[str]
            Listing ID of each matrix's row.

        product_prices : np.array[float]
            Average price of each product. The solution of the equation system.

        product_ids : np.array[str]
            IDs of the products, represented by elements of `product_prices`
            and columns of `matrix`.

        good_rows : np.array[bool]
            Where `True` listings contain only products whose prices could be
            computed by the solution algorithm.
            Currently not used by this method.

        good_cols : np.array[bool]
            Where `True` prices could be computed by least squares algorithm.

        listings : pd.DataFrame
            The listings from which the system of equations was generated.
            Will usually contain additional listings. The columns "currency"
            and "time" are looked up by listing ID. Can be `None` (for
            easier testing), then placeholder values are used instead.

        Returns
        -------
        prices : pd.DataFrame
            The computed prices as a `pd.DataFrame`. The price ID is the
            index and is also kept as column "id".

        Raises
        ------
        ValueError
            If two of the created prices have the same ID
            (``verify_integrity=True``).
        """
        assert matrix.shape[0] == len(listing_prices) == len(listing_ids)
        assert matrix.shape[1] == len(product_prices) == len(product_ids)

        # Indices of the products whose prices could be computed. They are
        # stored in a set, because membership is tested inside nested loops.
        good_prod_idxs = set(np.argwhere(good_cols)[:, 0].tolist())

        # Create the average prices
        # Price data is first collected in list of dicts, that is later
        # converted to a ``DataFrame``. Each dict is a row of the ``DataFrame``.
        average_listing_id = u"{}-average".format(self.average_mid_time)
        avg_price_data = []
        for iprod, product_price in enumerate(product_prices):
            if iprod not in good_prod_idxs:
                continue
            single_price_data = {}
            # Multiply with condition, solver returns prices for condition "new".
            single_price_data["price"] = product_price * self.default_condition
            single_price_data["currency"] = self.default_currency
            single_price_data["condition"] = self.default_condition
            single_price_data["time"] = self.average_mid_time
            single_price_data["product"] = product_ids[iprod]
            single_price_data["listing"] = average_listing_id
            single_price_data["type"] = "average"
            single_price_data["avg_period"] = self.avg_period
            # Get number of listings that were used for this average price, from
            # the system matrix. Count positive entries in the price's column.
            # The comparison ``> 0`` is False for NaN, so NaNs are not counted.
            prod_col = np.where(matrix[:, iprod] > 0, 1, 0)
            single_price_data["avg_num_listings"] = np.sum(prod_col)
            # The ID is computed from the other fields, so it is added last.
            single_price_data["id"] = make_price_id(single_price_data)
            avg_price_data.append(single_price_data)

        # Create prices for each item of each listing
        # Protect against prices that are NaN
        good_prod_prices = np.where(np.isnan(product_prices), 0, product_prices)
        n_listings_total = len(listing_prices)
        # Price data is first collected in list of dicts. Each dict is a price.
        list_price_data = []
        for ilist, listing_price in enumerate(listing_prices):
            # Each row of `matrix` represents a listing
            row = matrix[ilist, :]

            # Compute percentage of each product on total listing price
            # from average prices.
            # NOTE: If the sum is zero, the division yields NaN or inf.
            virt_prod_prices = row * good_prod_prices
            list_prod_percent = virt_prod_prices / virt_prod_prices.sum()
            # Compute price of each item in listing based on these percentages
            list_prod_prices = list_prod_percent * listing_price

            list_id = listing_ids[ilist]
            # `listings` data frame can be `None` for more easy testing.
            if listings is not None:
                # Label based lookup; ``.ix`` no longer exists in pandas.
                list_currency = listings.loc[list_id, "currency"]
                list_time = listings.loc[list_id, "time"]
            else:
                list_currency = "Unknown Currency"
                list_time = datetime(2000, 1, 1)

            # A listing with a single product shows the product's price
            # directly, otherwise the price of each product is an estimate.
            prod_idxs = np.argwhere(row > 0)[:, 0]
            if len(prod_idxs) == 1:
                price_type = "observed"
                avg_period = "none"
            else:
                price_type = "estimated"
                avg_period = self.avg_period

            # Create a price record for each of the estimated product prices
            for iprod in prod_idxs:
                if iprod not in good_prod_idxs:
                    continue
                single_price_data = {}
                single_price_data["price"] = list_prod_prices[iprod]
                single_price_data["currency"] = list_currency
                single_price_data["condition"] = row[iprod]
                single_price_data["time"] = list_time
                single_price_data["product"] = product_ids[iprod]
                single_price_data["listing"] = list_id
                single_price_data["type"] = price_type
                single_price_data["avg_period"] = avg_period
                # TODO: Better algorithm, analogous to algorithm above for average prices.
                single_price_data["avg_num_listings"] = n_listings_total
                single_price_data["id"] = make_price_id(single_price_data)
                list_price_data.append(single_price_data)

        # Start with an empty price frame, then add the average prices and
        # the prices of the individual listings. ``pd.concat`` is used because
        # ``DataFrame.append`` no longer exists in pandas.
        prices = pd.concat(
            [make_price_frame(0), pd.DataFrame(avg_price_data), pd.DataFrame(list_price_data)],
            ignore_index=True,
            verify_integrity=False,
        )
        return prices.set_index("id", drop=False, verify_integrity=True)