def test_python_lift(cooccurrence1, cooccurrence2, target_matrices):
    L1 = lift(cooccurrence1)
    assert type(L1) == np.ndarray
    # == on arrays is element-wise and cannot be truth-tested in an assert
    assert np.allclose(L1, target_matrices["lift1"])

    L2 = lift(cooccurrence2)
    assert type(L2) == np.ndarray
    assert np.allclose(L2, target_matrices["lift2"])
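Both tests exercise the lift transformation of an item co-occurrence matrix. A minimal sketch of the standard lift formula, lift(i, j) = c(i, j) / (c(i, i) * c(j, j)), is shown below; lift_sketch is an illustrative name, not the library function under test.

import numpy as np

def lift_sketch(cooccurrence):
    # The diagonal of a co-occurrence matrix holds each item's occurrence count.
    diag = cooccurrence.diagonal()
    # lift(i, j) = cooccur(i, j) / (occur(i) * occur(j)); silence 0/0 warnings
    with np.errstate(invalid="ignore", divide="ignore"):
        result = cooccurrence / np.outer(diag, diag)
    return np.array(result)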
Example 2
def test_python_lift(python_data, target_matrices):
    cooccurrence1, cooccurrence2 = python_data
    L1 = lift(cooccurrence1)
    assert type(L1) == np.ndarray
    assert np.allclose(L1, target_matrices["lift1"])

    L2 = lift(cooccurrence2)
    assert type(L2) == np.ndarray
    assert np.allclose(L2, target_matrices["lift2"])
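The cooccurrence1/cooccurrence2 fixtures are item co-occurrence matrices. For reference, such a matrix can be built from a binary user-item interaction matrix; the data below is made up for illustration.

import numpy as np

# Hypothetical binary interaction matrix: 3 users (rows) x 3 items (columns).
user_item = np.array([
    [1, 1, 0],
    [1, 0, 1],
    [1, 1, 1],
])

# Entry (i, j) counts the users who interacted with both items i and j;
# the diagonal counts each item's total occurrences.
cooccurrence = user_item.T @ user_item
print(cooccurrence)  # -> [[3 2 2], [2 2 1], [2 1 2]]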
Example 3
    def fit(self, df, features, col_itemid, col_weights, demo=False):
        """Main fit method for SAR.

        Args:
            df (pd.DataFrame): User item rating dataframe
            features (pd.DataFrame): Item feature dataframe
            col_itemid (str): Name of the item id column of the feature dataframe
            col_weights (dict): Maps each feature column name to the
                (weight, similarity_function) pair used in the similarity
                metric. Must contain the key 'ratings', holding the weight of
                the similarity based on user ratings; the weights of the
                non-'ratings' features should sum to 1.
            demo (bool): If True, load precomputed matrices from the demo
                directory instead of the experiment directory
        """
        num_items = len(features)
        load = False
        experiment_path = DATA_DIR + 'experiment/'
        demo_path = DATA_DIR + 'demo/'
        path = experiment_path
        if os.path.exists(experiment_path+'item_feature_similarity_{}.npy'.format(num_items)):
            load = True
            
        if demo:
            load = True
            path = demo_path

        if load:
            self.load_file(path, num_items, df) #set features_sim_matrix, index2item, item2index
        
        elif self.index2item is None:
            # generate continuous indices if this hasn't been done
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        if not np.issubdtype(df[self.col_rating].dtype, np.number):
            raise TypeError("Rating column data type must be numeric")

        # copy the DataFrame to avoid modification of the input
        select_columns = [self.col_user, self.col_item, self.col_rating]
        if self.time_decay_flag:
            select_columns += [self.col_timestamp]
        temp_df = df[select_columns].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)
        else:
            # without time decay use the latest user-item rating in the dataset as the affinity score
            logger.info("De-duplicating the user-item counts")
            temp_df = temp_df.drop_duplicates(
                [self.col_user, self.col_item], keep="last"
            )

        logger.info("Creating index columns")
        # add mapping of user and item ids to indices
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

        if self.normalize:
            logger.info("Calculating normalization factors")
            temp_df[self.col_unity_rating] = 1.0
            if self.time_decay_flag:
                temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_unity_rating)
            self.unity_user_affinity = self.compute_affinity_matrix(df=temp_df, rating_col=self.col_unity_rating)

        # affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(df=temp_df, rating_col=self.col_rating)
        
        # calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)

        # free up some space
        del temp_df

        self.item_frequencies = item_cooccurrence.diagonal()

        logger.info("Calculating item similarity")
        if self.similarity_type == COOCCUR:
            logger.info("Using co-occurrence based similarity")
            self.item_similarity = item_cooccurrence
        elif self.similarity_type == JACCARD:
            logger.info("Using jaccard based similarity")
            self.item_similarity = jaccard(item_cooccurrence).astype(
                df[self.col_rating].dtype
            )
        elif self.similarity_type == LIFT:
            logger.info("Using lift based similarity")
            self.item_similarity = lift(item_cooccurrence).astype(
                df[self.col_rating].dtype
            )
        elif self.similarity_type == "custom":
            self.item_similarity = col_weights['ratings'] * jaccard(item_cooccurrence).astype(df[self.col_rating].dtype)
            if not load:
                self.features_sim_matrix = self.compute_feature_sim_matrix(col_weights, features, col_itemid)
                self.save_to_file(path)
            # NOTE: assumes self.features_sim_matrix is on scale 1, i.e. the
            # non-'ratings' col_weights sum to 1
            self.item_similarity += (1 - col_weights['ratings']) * self.features_sim_matrix
        else:
            raise ValueError("Unknown similarity type: {}".format(self.similarity_type))

        # free up some space
        del item_cooccurrence

        logger.info("Done training")
Example 4
    def fit(self, df):
        """Main fit method for SAR

        Args:
            df (pd.DataFrame): User item rating dataframe
        """

        # generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        if not np.issubdtype(df[self.col_rating].dtype, np.number):
            raise TypeError("Rating column data type must be numeric")

        # copy the DataFrame to avoid modification of the input
        temp_df = df[[self.col_user, self.col_item, self.col_rating]].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            # if time_now is None use the latest time
            if self.time_now is None:
                self.time_now = df[self.col_timestamp].max()

            # apply time decay to each rating
            temp_df[self.col_rating] *= exponential_decay(
                value=df[self.col_timestamp],
                max_val=self.time_now,
                half_life=self.time_decay_half_life,
            )

            # group time decayed ratings by user-item and take the sum as the user-item affinity
            temp_df = (
                temp_df.groupby([self.col_user, self.col_item]).sum().reset_index()
            )
        else:
            # without time decay use the latest user-item rating in the dataset as the affinity score
            logger.info("De-duplicating the user-item counts")
            temp_df = temp_df.drop_duplicates(
                [self.col_user, self.col_item], keep="last"
            )

        logger.info("Creating index columns")
        # map users and items according to the two dicts. Add the two new columns to temp_df.
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

        # retain seen items for removal at prediction time
        self.seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

        # affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            temp_df, self.n_users, self.n_items
        )

        # calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_coocurrence_matrix(
            temp_df, self.n_users, self.n_items
        )

        # free up some space
        del temp_df

        self.item_frequencies = item_cooccurrence.diagonal()

        logger.info("Calculating item similarity")
        if self.similarity_type == COOCCUR:
            logger.info("Using co-occurrence based similarity")
            self.item_similarity = item_cooccurrence
        elif self.similarity_type == JACCARD:
            logger.info("Using jaccard based similarity")
            self.item_similarity = jaccard(item_cooccurrence).astype(
                df[self.col_rating].dtype
            )
        elif self.similarity_type == LIFT:
            logger.info("Using lift based similarity")
            self.item_similarity = lift(item_cooccurrence).astype(
                df[self.col_rating].dtype
            )
        else:
            raise ValueError("Unknown similarity type: {}".format(self.similarity_type))

        # free up some space
        del item_cooccurrence

        logger.info("Done training")
Example 5
    def fit(self, df):
        """Main fit method for SAR

        Args:
            df (pd.DataFrame): User item rating dataframe
        """

        # generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        if not np.issubdtype(df[self.col_rating].dtype, np.number):
            raise TypeError("Rating column data type must be numeric")

        # copy the DataFrame to avoid modification of the input
        select_columns = [self.col_user, self.col_item, self.col_rating]
        if self.time_decay_flag:
            select_columns += [self.col_timestamp]
        temp_df = df[select_columns].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            temp_df = self.compute_time_decay(df=temp_df,
                                              decay_column=self.col_rating)
        else:
            # without time decay use the latest user-item rating in the dataset as the affinity score
            logger.info("De-duplicating the user-item counts")
            temp_df = temp_df.drop_duplicates([self.col_user, self.col_item],
                                              keep="last")

        logger.info("Creating index columns")
        # add mapping of user and item ids to indices
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(
            self.item2index)
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(
            self.user2index)

        if self.normalize:
            logger.info("Calculating normalization factors")
            temp_df[self.col_unity_rating] = 1.0
            if self.time_decay_flag:
                temp_df = self.compute_time_decay(
                    df=temp_df, decay_column=self.col_unity_rating)
            self.unity_user_affinity = self.compute_affinity_matrix(
                df=temp_df, rating_col=self.col_unity_rating)

        # retain seen items for removal at prediction time
        self.seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

        # affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            df=temp_df, rating_col=self.col_rating)

        # calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)

        # free up some space
        del temp_df

        self.item_frequencies = item_cooccurrence.diagonal()

        logger.info("Calculating item similarity")
        if self.similarity_type == COOCCUR:
            logger.info("Using co-occurrence based similarity")
            self.item_similarity = item_cooccurrence
        elif self.similarity_type == JACCARD:
            logger.info("Using jaccard based similarity")
            self.item_similarity = jaccard(item_cooccurrence).astype(
                df[self.col_rating].dtype)
        elif self.similarity_type == LIFT:
            logger.info("Using lift based similarity")
            self.item_similarity = lift(item_cooccurrence).astype(
                df[self.col_rating].dtype)
        else:
            raise ValueError("Unknown similarity type: {}".format(
                self.similarity_type))

        # free up some space
        del item_cooccurrence

        logger.info("Done training")
Example 6
    def fit(self, df):
        """Main fit method for SAR

        Args:
            df (pd.DataFrame): User item rating dataframe
        """

        # Generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        if not np.issubdtype(df[self.col_rating].dtype, np.floating):
            raise TypeError("Rating column data type must be floating point")

        # Copy the DataFrame to avoid modification of the input
        temp_df = df[[self.col_user, self.col_item, self.col_rating]].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            # if time_now is None use the latest time
            if self.time_now is None:
                self.time_now = df[self.col_timestamp].max()

            # apply time decay to each rating
            temp_df[self.col_rating] *= exponential_decay(
                value=df[self.col_timestamp],
                max_val=self.time_now,
                half_life=self.time_decay_half_life,
            )

            # group time decayed ratings by user-item and take the sum as the user-item affinity
            temp_df = (temp_df.groupby([self.col_user,
                                        self.col_item]).sum().reset_index())
        else:
            # without time decay use the latest user-item rating in the dataset as the affinity score
            logger.info("De-duplicating the user-item counts")
            temp_df = temp_df.drop_duplicates([self.col_user, self.col_item],
                                              keep="last")

        logger.info("Creating index columns")
        # Map users and items according to the two dicts. Add the two new columns to temp_df.
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(
            self.item2index)
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(
            self.user2index)

        seen_items = None
        if self.remove_seen:
            # retain seen items for removal at prediction time
            seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

        # Affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            temp_df, self.n_users, self.n_items)

        # Calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_coocurrence_matrix(
            temp_df, self.n_users, self.n_items)

        # Free up some space
        del temp_df

        logger.info("Calculating item similarity")
        if self.similarity_type == sar.SIM_COOCCUR:
            self.item_similarity = item_cooccurrence
        elif self.similarity_type == sar.SIM_JACCARD:
            logger.info("Calculating jaccard")
            self.item_similarity = jaccard(item_cooccurrence)
            # Free up some space
            del item_cooccurrence
        elif self.similarity_type == sar.SIM_LIFT:
            logger.info("Calculating lift")
            self.item_similarity = lift(item_cooccurrence)
            # Free up some space
            del item_cooccurrence
        else:
            raise ValueError("Unknown similarity type: {0}".format(
                self.similarity_type))

        # Calculate raw scores with a matrix multiplication
        logger.info("Calculating recommendation scores")
        self.scores = self.user_affinity.dot(self.item_similarity)

        # Remove items in the train set so recommended items are always novel
        if self.remove_seen:
            logger.info("Removing seen items")
            self.scores[seen_items[:, 0], seen_items[:, 1]] = -np.inf

        logger.info("Done training")