def compute_time_decay(self, df, decay_column):
        """Apply exponential time decay to a column and aggregate per user-item pair.

        The values in ``decay_column`` are multiplied by the result of
        ``exponential_decay`` evaluated on the timestamp column, then the frame
        is grouped by user and item and the groups are summed.

        Note that ``df[decay_column]`` is overwritten in place on the frame that
        is passed in, and that ``self.time_now`` is set to the latest timestamp
        in ``df`` when it was ``None``.

        Args:
            df (pd.DataFrame): DataFrame of users and items
            decay_column (str): column to decay

        Returns:
            DataFrame: grouped by user and item with the decayed column summed
        """

        # without a configured reference time, fall back to the newest timestamp in the data
        if self.time_now is None:
            self.time_now = df[self.col_timestamp].max()

        # one decay weight per row, derived from that row's timestamp
        decay_weights = exponential_decay(
            value=df[self.col_timestamp],
            max_val=self.time_now,
            half_life=self.time_decay_half_life,
        )
        df[decay_column] *= decay_weights

        # the user-item affinity is the sum of the decayed values over each user-item pair
        group_keys = [self.col_user, self.col_item]
        summed = df.groupby(group_keys).sum()
        return summed.reset_index()
Ejemplo n.º 2
0
def test_exponential_decay():
    """Check exponential_decay against reference values for max_val=5, half_life=2."""
    half_life = 2
    reference_time = 5
    timestamps = np.array([1, 2, 3, 4, 5, 6])
    # the last input lies past max_val and is expected to give 1.0, same as max_val itself
    want = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1.0, 1.0])
    got = exponential_decay(
        value=timestamps, max_val=reference_time, half_life=half_life
    )
    assert np.allclose(got, want, atol=TOL)
Ejemplo n.º 3
0
def test_exponential_decay():
    """Compare exponential_decay output with precomputed values within TOL."""
    decay_kwargs = {
        "value": np.array([1, 2, 3, 4, 5, 6]),
        "max_val": 5,
        "half_life": 2,
    }
    # expected weights, one per input value; inputs at or beyond max_val map to 1.0
    reference = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1., 1.])
    result = exponential_decay(**decay_kwargs)
    assert np.allclose(result, reference, atol=TOL)
Ejemplo n.º 4
0
    def fit(self, df):
        """Main fit method for SAR.

        Builds the user affinity matrix and the item similarity matrix from the
        interactions in ``df``. The results are stored on the instance as
        ``user_affinity``, ``item_similarity``, ``item_frequencies`` and
        ``seen_items``. When time decay is enabled and ``time_now`` is ``None``,
        ``time_now`` is set to the latest timestamp found in ``df``.

        Args:
            df (pd.DataFrame): User item rating dataframe

        Raises:
            TypeError: if the rating column does not have a numeric dtype
            ValueError: if ``similarity_type`` is not a supported similarity
        """

        # generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        rating_dtype = df[self.col_rating].dtype
        if not np.issubdtype(rating_dtype, np.number):
            raise TypeError("Rating column data type must be numeric")

        # copy the DataFrame to avoid modification of the input
        temp_df = df[[self.col_user, self.col_item, self.col_rating]].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            # if time_now is None use the latest time; an explicit None check is
            # required because a truthiness test would also discard time_now == 0
            if self.time_now is None:
                self.time_now = df[self.col_timestamp].max()

            # apply time decay to each rating (timestamps are read from the
            # original frame since temp_df does not carry that column)
            temp_df[self.col_rating] *= exponential_decay(
                value=df[self.col_timestamp],
                max_val=self.time_now,
                half_life=self.time_decay_half_life,
            )

            # group time decayed ratings by user-item and take the sum as the user-item affinity
            temp_df = (
                temp_df.groupby([self.col_user, self.col_item]).sum().reset_index()
            )
        else:
            # without time decay use the latest user-item rating in the dataset as the affinity score
            logger.info("De-duplicating the user-item counts")
            temp_df = temp_df.drop_duplicates(
                [self.col_user, self.col_item], keep="last"
            )

        logger.info("Creating index columns")
        # map users and items according to the two dicts. Add the two new columns to temp_df.
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

        # retain seen items for removal at prediction time
        self.seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

        # affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            temp_df, self.n_users, self.n_items
        )

        # calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_coocurrence_matrix(
            temp_df, self.n_users, self.n_items
        )

        # free up some space
        del temp_df

        self.item_frequencies = item_cooccurrence.diagonal()

        logger.info("Calculating item similarity")
        # Compare by value, not identity: an equal similarity type held in a
        # different object must still select the matching branch.
        # NOTE(review): the similarity is cast to the rating dtype below; since
        # any numeric rating dtype is accepted above, an integer rating column
        # would truncate fractional similarities - confirm this is intended.
        if self.similarity_type == COOCCUR:
            logger.info("Using co-occurrence based similarity")
            self.item_similarity = item_cooccurrence
        elif self.similarity_type == JACCARD:
            logger.info("Using jaccard based similarity")
            self.item_similarity = jaccard(item_cooccurrence).astype(rating_dtype)
        elif self.similarity_type == LIFT:
            logger.info("Using lift based similarity")
            self.item_similarity = lift(item_cooccurrence).astype(rating_dtype)
        else:
            raise ValueError("Unknown similarity type: {}".format(self.similarity_type))

        # free up some space
        del item_cooccurrence

        logger.info("Done training")
Ejemplo n.º 5
0
    def fit(self, df):
        """Main fit method for SAR.

        Builds the user affinity matrix and the item similarity matrix from the
        interactions in ``df``, multiplies them into the recommendation scores
        and stores the results on the instance as ``user_affinity``,
        ``item_similarity`` and ``scores``. When ``remove_seen`` is set, the
        scores of user-item pairs present in ``df`` are set to ``-inf``. When
        time decay is enabled and ``time_now`` is ``None``, ``time_now`` is set
        to the latest timestamp found in ``df``.

        Args:
            df (pd.DataFrame): User item rating dataframe

        Raises:
            TypeError: if the rating column does not have a floating point dtype
            ValueError: if ``similarity_type`` is not a supported similarity
        """

        # Generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        if not np.issubdtype(df[self.col_rating].dtype, np.floating):
            raise TypeError("Rating column data type must be floating point")

        # Copy the DataFrame to avoid modification of the input
        temp_df = df[[self.col_user, self.col_item, self.col_rating]].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            # if time_now is None use the latest time; an explicit None check is
            # required because a truthiness test would also discard time_now == 0
            if self.time_now is None:
                self.time_now = df[self.col_timestamp].max()

            # apply time decay to each rating (timestamps are read from the
            # original frame since temp_df does not carry that column)
            temp_df[self.col_rating] *= exponential_decay(
                value=df[self.col_timestamp],
                max_val=self.time_now,
                half_life=self.time_decay_half_life,
            )

            # group time decayed ratings by user-item and take the sum as the user-item affinity
            temp_df = (temp_df.groupby([self.col_user,
                                        self.col_item]).sum().reset_index())
        else:
            # without time decay use the latest user-item rating in the dataset as the affinity score
            logger.info("De-duplicating the user-item counts")
            temp_df = temp_df.drop_duplicates([self.col_user, self.col_item],
                                              keep="last")

        logger.info("Creating index columns")
        # Map users and items according to the two dicts. Add the two new columns to temp_df.
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(
            self.item2index)
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(
            self.user2index)

        seen_items = None
        if self.remove_seen:
            # retain the (user index, item index) pairs so they can be masked
            # out of the scores once those have been computed
            seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

        # Affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            temp_df, self.n_users, self.n_items)

        # Calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_coocurrence_matrix(
            temp_df, self.n_users, self.n_items)

        # Free up some space
        del temp_df

        logger.info("Calculating item similarity")
        if self.similarity_type == sar.SIM_COOCCUR:
            # the co-occurrence matrix is used directly, so it must not be deleted here
            self.item_similarity = item_cooccurrence
        elif self.similarity_type == sar.SIM_JACCARD:
            logger.info("Calculating jaccard")
            self.item_similarity = jaccard(item_cooccurrence)
            # Free up some space
            del item_cooccurrence
        elif self.similarity_type == sar.SIM_LIFT:
            logger.info("Calculating lift")
            self.item_similarity = lift(item_cooccurrence)
            # Free up some space
            del item_cooccurrence
        else:
            raise ValueError("Unknown similarity type: {0}".format(
                self.similarity_type))

        # Calculate raw scores with a matrix multiplication
        logger.info("Calculating recommendation scores")
        self.scores = self.user_affinity.dot(self.item_similarity)

        # Remove items in the train set so recommended items are always novel
        if self.remove_seen:
            logger.info("Removing seen items")
            self.scores[seen_items[:, 0], seen_items[:, 1]] = -np.inf

        logger.info("Done training")