def test_python_jaccard(cooccurrence1, cooccurrence2, target_matrices):
    J1 = jaccard(cooccurrence1)
    assert type(J1) == np.ndarray
    # elementwise == on arrays is ambiguous in an assert; compare with np.array_equal
    assert np.array_equal(J1, target_matrices["jaccard1"])

    J2 = jaccard(cooccurrence2)
    assert type(J2) == np.ndarray
    assert np.array_equal(J2, target_matrices["jaccard2"])
def test_python_jaccard(python_data, target_matrices):
    cooccurrence1, cooccurrence2 = python_data

    J1 = jaccard(cooccurrence1)
    assert type(J1) == np.ndarray
    assert np.array_equal(J1, target_matrices["jaccard1"])

    J2 = jaccard(cooccurrence2)
    assert type(J2) == np.ndarray
    assert np.array_equal(J2, target_matrices["jaccard2"])
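# For context, minimal sketches of the `jaccard` and `lift` similarities these
# tests and the fit methods below rely on, assuming a dense, symmetric item
# co-occurrence matrix whose diagonal holds per-item occurrence counts. These
# are sketches under that assumption, not necessarily the library's exact code.
import numpy as np


def jaccard_sketch(cooccurrence):
    """Jaccard: c(i, j) / (c(i, i) + c(j, j) - c(i, j))."""
    diag = cooccurrence.diagonal()
    diag_rows = np.expand_dims(diag, axis=0)  # shape (1, n_items)
    diag_cols = np.expand_dims(diag, axis=1)  # shape (n_items, 1)
    with np.errstate(invalid="ignore", divide="ignore"):
        result = cooccurrence / (diag_rows + diag_cols - cooccurrence)
    return np.array(np.nan_to_num(result))  # never-seen items -> 0 similarity


def lift_sketch(cooccurrence):
    """Lift: c(i, j) / (c(i, i) * c(j, j)), which de-emphasizes popular items."""
    diag = cooccurrence.diagonal()
    with np.errstate(invalid="ignore", divide="ignore"):
        result = cooccurrence / np.outer(diag, diag)
    return np.array(np.nan_to_num(result))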
def fit(self, df, features, col_itemid, col_weights, demo=False):
    """Main fit method for SAR.

    Args:
        df (pd.DataFrame): User item rating dataframe
        features (pd.DataFrame): item feature dataframe
        col_itemid (str): name of the item id column of the feature dataframe
        col_weights (dict): maps feature column names to their
            (weight, similarity_function) in the similarity metric.
            Required to contain the key 'ratings' with the weight of the
            similarity based on user ratings; the col_weights of features
            other than 'ratings' should sum to 1.
    """
    num_items = len(features)
    load = False
    experiment_path = DATA_DIR + 'experiment/'
    demo_path = DATA_DIR + 'demo/'
    path = experiment_path
    if os.path.exists(experiment_path + 'item_feature_similarity_{}.npy'.format(num_items)):
        load = True
    if demo:
        load = True
        path = demo_path

    if load:
        self.load_file(path, num_items, df)  # set features_sim_matrix, index2item, item2index
    elif self.index2item is None:
        # generate continuous indices if this hasn't been done
        self.set_index(df)

    logger.info("Collecting user affinity matrix")
    if not np.issubdtype(df[self.col_rating].dtype, np.number):
        raise TypeError("Rating column data type must be numeric")

    # copy the DataFrame to avoid modification of the input
    select_columns = [self.col_user, self.col_item, self.col_rating]
    if self.time_decay_flag:
        select_columns += [self.col_timestamp]
    temp_df = df[select_columns].copy()

    if self.time_decay_flag:
        logger.info("Calculating time-decayed affinities")
        temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)
    else:
        # without time decay use the latest user-item rating in the dataset as the affinity score
        logger.info("De-duplicating the user-item counts")
        temp_df = temp_df.drop_duplicates(
            [self.col_user, self.col_item], keep="last"
        )

    logger.info("Creating index columns")
    # add mapping of user and item ids to indices
    temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
    temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

    if self.normalize:
        logger.info("Calculating normalization factors")
        temp_df[self.col_unity_rating] = 1.0
        if self.time_decay_flag:
            temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_unity_rating)
        self.unity_user_affinity = self.compute_affinity_matrix(df=temp_df, rating_col=self.col_unity_rating)

    # affinity matrix
    logger.info("Building user affinity sparse matrix")
    self.user_affinity = self.compute_affinity_matrix(df=temp_df, rating_col=self.col_rating)

    # calculate item co-occurrence
    logger.info("Calculating item co-occurrence")
    item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)

    # free up some space
    del temp_df

    self.item_frequencies = item_cooccurrence.diagonal()

    logger.info("Calculating item similarity")
    if self.similarity_type == COOCCUR:
        logger.info("Using co-occurrence based similarity")
        self.item_similarity = item_cooccurrence
    elif self.similarity_type == JACCARD:
        logger.info("Using jaccard based similarity")
        self.item_similarity = jaccard(item_cooccurrence).astype(
            df[self.col_rating].dtype
        )
    elif self.similarity_type == LIFT:
        logger.info("Using lift based similarity")
        self.item_similarity = lift(item_cooccurrence).astype(
            df[self.col_rating].dtype
        )
    elif self.similarity_type == "custom":
        # 'is' comparison with a string literal is unreliable; use == instead
        self.item_similarity = col_weights['ratings'] * jaccard(item_cooccurrence).astype(df[self.col_rating].dtype)
        if not load:
            self.features_sim_matrix = self.compute_feature_sim_matrix(col_weights, features, col_itemid)
            self.save_to_file(path)  # !!!
        # assuming self.features_sim_matrix has scale 1 (i.e. col_weights[features] all sum to 1)
        self.item_similarity += (1 - col_weights['ratings']) * self.features_sim_matrix
    else:
        raise ValueError("Unknown similarity type: {}".format(self.similarity_type))

    # free up some space
    del item_cooccurrence

    logger.info("Done training")
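# Hypothetical usage of the feature-weighted variant above. `model`,
# `ratings_df`, `items_df`, and the column names are illustrative placeholders,
# not part of the original code; the dict shape follows the docstring, with
# 'ratings' mapping to a bare weight and each feature column mapping to a
# (weight, similarity_function) pair whose weights sum to 1.
col_weights = {
    "ratings": 0.7,           # weight of the rating-based jaccard similarity
    "genre": (0.6, jaccard),  # feature weights (0.6 + 0.4) sum to 1
    "year": (0.4, lift),
}
model.fit(ratings_df, features=items_df, col_itemid="itemID", col_weights=col_weights)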
def fit(self, df):
    """Main fit method for SAR.

    Args:
        df (pd.DataFrame): User item rating dataframe
    """
    # generate continuous indices if this hasn't been done
    if self.index2item is None:
        self.set_index(df)

    logger.info("Collecting user affinity matrix")
    if not np.issubdtype(df[self.col_rating].dtype, np.number):
        raise TypeError("Rating column data type must be numeric")

    # copy the DataFrame to avoid modification of the input
    temp_df = df[[self.col_user, self.col_item, self.col_rating]].copy()

    if self.time_decay_flag:
        logger.info("Calculating time-decayed affinities")
        # if time_now is None use the latest time
        if not self.time_now:
            self.time_now = df[self.col_timestamp].max()
        # apply time decay to each rating
        temp_df[self.col_rating] *= exponential_decay(
            value=df[self.col_timestamp],
            max_val=self.time_now,
            half_life=self.time_decay_half_life,
        )
        # group time-decayed ratings by user-item and take the sum as the user-item affinity
        temp_df = (
            temp_df.groupby([self.col_user, self.col_item]).sum().reset_index()
        )
    else:
        # without time decay use the latest user-item rating in the dataset as the affinity score
        logger.info("De-duplicating the user-item counts")
        temp_df = temp_df.drop_duplicates(
            [self.col_user, self.col_item], keep="last"
        )

    logger.info("Creating index columns")
    # map users and items according to the two dicts; add the two new columns to temp_df
    temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
    temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

    # retain seen items for removal at prediction time
    self.seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

    # affinity matrix
    logger.info("Building user affinity sparse matrix")
    self.user_affinity = self.compute_affinity_matrix(
        temp_df, self.n_users, self.n_items
    )

    # calculate item co-occurrence
    logger.info("Calculating item co-occurrence")
    item_cooccurrence = self.compute_coocurrence_matrix(
        temp_df, self.n_users, self.n_items
    )

    # free up some space
    del temp_df

    self.item_frequencies = item_cooccurrence.diagonal()

    logger.info("Calculating item similarity")
    if self.similarity_type == COOCCUR:
        logger.info("Using co-occurrence based similarity")
        self.item_similarity = item_cooccurrence
    elif self.similarity_type == JACCARD:
        logger.info("Using jaccard based similarity")
        self.item_similarity = jaccard(item_cooccurrence).astype(
            df[self.col_rating].dtype
        )
    elif self.similarity_type == LIFT:
        logger.info("Using lift based similarity")
        self.item_similarity = lift(item_cooccurrence).astype(
            df[self.col_rating].dtype
        )
    else:
        raise ValueError("Unknown similarity type: {}".format(self.similarity_type))

    # free up some space
    del item_cooccurrence

    logger.info("Done training")
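# A minimal sketch of the `exponential_decay` helper called above, assuming the
# conventional half-life form w(t) = 0.5 ** ((t_now - t) / half_life), capped at
# 1 so timestamps newer than `max_val` are not up-weighted. This is a sketch
# under that assumption, not necessarily the library's exact implementation.
import numpy as np


def exponential_decay_sketch(value, max_val, half_life):
    """Halve the weight of a rating for every `half_life` units of elapsed time."""
    return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life))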
def fit(self, df):
    """Main fit method for SAR.

    Args:
        df (pd.DataFrame): User item rating dataframe
    """
    # generate continuous indices if this hasn't been done
    if self.index2item is None:
        self.set_index(df)

    logger.info("Collecting user affinity matrix")
    if not np.issubdtype(df[self.col_rating].dtype, np.number):
        raise TypeError("Rating column data type must be numeric")

    # copy the DataFrame to avoid modification of the input
    select_columns = [self.col_user, self.col_item, self.col_rating]
    if self.time_decay_flag:
        select_columns += [self.col_timestamp]
    temp_df = df[select_columns].copy()

    if self.time_decay_flag:
        logger.info("Calculating time-decayed affinities")
        temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)
    else:
        # without time decay use the latest user-item rating in the dataset as the affinity score
        logger.info("De-duplicating the user-item counts")
        temp_df = temp_df.drop_duplicates([self.col_user, self.col_item], keep="last")

    logger.info("Creating index columns")
    # add mapping of user and item ids to indices
    temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
    temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

    if self.normalize:
        logger.info("Calculating normalization factors")
        temp_df[self.col_unity_rating] = 1.0
        if self.time_decay_flag:
            temp_df = self.compute_time_decay(
                df=temp_df, decay_column=self.col_unity_rating
            )
        self.unity_user_affinity = self.compute_affinity_matrix(
            df=temp_df, rating_col=self.col_unity_rating
        )

    # retain seen items for removal at prediction time
    self.seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

    # affinity matrix
    logger.info("Building user affinity sparse matrix")
    self.user_affinity = self.compute_affinity_matrix(
        df=temp_df, rating_col=self.col_rating
    )

    # calculate item co-occurrence
    logger.info("Calculating item co-occurrence")
    item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)

    # free up some space
    del temp_df

    self.item_frequencies = item_cooccurrence.diagonal()

    logger.info("Calculating item similarity")
    if self.similarity_type == COOCCUR:
        logger.info("Using co-occurrence based similarity")
        self.item_similarity = item_cooccurrence
    elif self.similarity_type == JACCARD:
        logger.info("Using jaccard based similarity")
        self.item_similarity = jaccard(item_cooccurrence).astype(
            df[self.col_rating].dtype
        )
    elif self.similarity_type == LIFT:
        logger.info("Using lift based similarity")
        self.item_similarity = lift(item_cooccurrence).astype(
            df[self.col_rating].dtype
        )
    else:
        raise ValueError("Unknown similarity type: {}".format(self.similarity_type))

    # free up some space
    del item_cooccurrence

    logger.info("Done training")
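# A plausible sketch of what `compute_affinity_matrix` does: scatter the indexed
# (user_id, item_id, rating) triples into a sparse users-by-items matrix. The
# function name matches the calls above, but this body is an assumption; the real
# method likely reads n_users, n_items, and the id column names from self.
from scipy import sparse


def compute_affinity_matrix_sketch(df, rating_col, n_users, n_items,
                                   col_user_id="uid", col_item_id="iid"):
    """Build a CSR users-by-items affinity matrix from indexed ratings."""
    return sparse.coo_matrix(
        (df[rating_col], (df[col_user_id], df[col_item_id])),
        shape=(n_users, n_items),
    ).tocsr()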
def fit(self, df):
    """Main fit method for SAR.

    Args:
        df (pd.DataFrame): User item rating dataframe
    """
    # Generate continuous indices if this hasn't been done
    if self.index2item is None:
        self.set_index(df)

    logger.info("Collecting user affinity matrix")
    if not np.issubdtype(df[self.col_rating].dtype, np.floating):
        raise TypeError("Rating column data type must be floating point")

    # Copy the DataFrame to avoid modification of the input
    temp_df = df[[self.col_user, self.col_item, self.col_rating]].copy()

    if self.time_decay_flag:
        logger.info("Calculating time-decayed affinities")
        # If time_now is None use the latest time
        if not self.time_now:
            self.time_now = df[self.col_timestamp].max()
        # Apply time decay to each rating
        temp_df[self.col_rating] *= exponential_decay(
            value=df[self.col_timestamp],
            max_val=self.time_now,
            half_life=self.time_decay_half_life,
        )
        # Group time-decayed ratings by user-item and take the sum as the user-item affinity
        temp_df = (
            temp_df.groupby([self.col_user, self.col_item]).sum().reset_index()
        )
    else:
        # Without time decay use the latest user-item rating in the dataset as the affinity score
        logger.info("De-duplicating the user-item counts")
        temp_df = temp_df.drop_duplicates([self.col_user, self.col_item], keep="last")

    logger.info("Creating index columns")
    # Map users and items according to the two dicts; add the two new columns to temp_df
    temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
    temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)

    seen_items = None
    if self.remove_seen:
        # Retain seen items for removal at prediction time
        seen_items = temp_df[[self.col_user_id, self.col_item_id]].values

    # Affinity matrix
    logger.info("Building user affinity sparse matrix")
    self.user_affinity = self.compute_affinity_matrix(
        temp_df, self.n_users, self.n_items
    )

    # Calculate item co-occurrence
    logger.info("Calculating item co-occurrence")
    item_cooccurrence = self.compute_coocurrence_matrix(
        temp_df, self.n_users, self.n_items
    )

    # Free up some space
    del temp_df

    logger.info("Calculating item similarity")
    if self.similarity_type == sar.SIM_COOCCUR:
        self.item_similarity = item_cooccurrence
    elif self.similarity_type == sar.SIM_JACCARD:
        logger.info("Calculating jaccard")
        self.item_similarity = jaccard(item_cooccurrence)
        # Free up some space
        del item_cooccurrence
    elif self.similarity_type == sar.SIM_LIFT:
        logger.info("Calculating lift")
        self.item_similarity = lift(item_cooccurrence)
        # Free up some space
        del item_cooccurrence
    else:
        raise ValueError("Unknown similarity type: {0}".format(self.similarity_type))

    # Calculate raw scores with a matrix multiplication
    logger.info("Calculating recommendation scores")
    self.scores = self.user_affinity.dot(self.item_similarity)

    # Remove items in the train set so recommended items are always novel
    if self.remove_seen:
        logger.info("Removing seen items")
        self.scores[seen_items[:, 0], seen_items[:, 1]] = -np.inf

    logger.info("Done training")
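# Hypothetical follow-up once the variant above has populated `self.scores`:
# take the top-k item columns per user row. `model` and `top_k` are illustrative
# placeholders, and the real class may expose this through a dedicated
# recommend-k-items method instead; seen items already sit at -inf, so they sort
# last and never appear in the top-k.
import numpy as np

top_k = 10
scores = np.asarray(model.scores)  # densify in case the dot product stayed sparse
top_items = np.argsort(-scores, axis=1)[:, :top_k]  # item ids, best first, per user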