def test_sample_weight():
    model = LightFM()
    train = sp.coo_matrix(np.array([[0, 1], [0, 1]]))

    with pytest.raises(ValueError):
        # Wrong number of weights
        sample_weight = sp.coo_matrix(np.zeros((2, 2)))
        model.fit(train, sample_weight=sample_weight)

    with pytest.raises(ValueError):
        # Wrong shape
        sample_weight = sp.coo_matrix(np.zeros(2))
        model.fit(train, sample_weight=sample_weight)

    with pytest.raises(ValueError):
        # Wrong order of entries
        sample_weight = sp.coo_matrix((train.data, (train.row[::-1], train.col[::-1])))
        model.fit(train, sample_weight=sample_weight)

    # A weight matrix matching the interactions entry-for-entry is accepted.
    sample_weight = sp.coo_matrix((train.data, (train.row, train.col)))
    model.fit(train, sample_weight=sample_weight)

    # Sample weights are not supported for the k-OS loss.
    model = LightFM(loss="warp-kos")
    with pytest.raises(NotImplementedError):
        model.fit(train, sample_weight=np.ones(1))
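# For context on the constraints the test above enforces: sample_weight must be
# a COO matrix whose entries line up one-to-one, in the same order, with the
# interaction matrix. A minimal sketch of a valid weighting (toy data; the
# per-interaction weights are made up for illustration):
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

rows = np.array([0, 0, 1])
cols = np.array([1, 2, 0])
interactions = sp.coo_matrix((np.ones(3), (rows, cols)), shape=(2, 3))
# Weights share the interactions' (row, col) coordinates, entry for entry.
weights = sp.coo_matrix((np.array([1.0, 0.5, 2.0]), (rows, cols)), shape=(2, 3))

LightFM().fit(interactions, sample_weight=weights)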
def test_state_reset():
    model = LightFM()
    model.fit(train, epochs=1)

    # After training, the accumulated AdaGrad gradients should have grown.
    assert np.mean(model.user_embedding_gradients) > 1.0

    # Calling fit (rather than fit_partial) resets the state to its initial value.
    model.fit(train, epochs=0)
    assert np.all(model.user_embedding_gradients == 1.0)
def test_movielens_accuracy_fit():
    model = LightFM()
    model.fit(train, epochs=10)

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_movielens_accuracy_pickle():
    model = LightFM(random_state=SEED)
    model.fit(train, epochs=10)

    # Accuracy should survive a pickle round-trip.
    model = pickle.loads(pickle.dumps(model))

    train_predictions = model.predict(train.row, train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_return_self():
    no_users, no_items = (10, 100)
    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    model = LightFM()
    assert model.fit_partial(train) is model
    assert model.fit(train) is model
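# Because fit and fit_partial return the model itself (as the test above
# asserts), training and prediction can be chained. A one-line sketch, assuming
# a COO interactions matrix named train with at least one non-zero entry:
predictions = LightFM().fit(train, epochs=10).predict(train.row, train.col)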
LEARNING_RATE = 1e-4
LOSS = 'warp'

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss=LOSS,
                item_alpha=ITEM_ALPHA,
                no_components=COMPONENTS,
                learning_schedule=LEARNING,
                learning_rate=LEARNING_RATE)

print("Currently using LOSS:{0}, COMPONENTS:{1}, LEARNING:{2}, RATE:{3}".format(
    LOSS, COMPONENTS, LEARNING, LEARNING_RATE))

# Train for NUM_EPOCHS epochs.
model = model.fit(URM_train, epochs=NUM_EPOCHS, verbose=True)

'''
train_precision = precision_at_k(model, URM_train, k=10).mean()
test_precision = precision_at_k(model, URM_test, k=10).mean()

train_auc = auc_score(model, URM_train).mean()
test_auc = auc_score(model, URM_test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
'''

recommender = Recommender(URM_train, model)

cumulative_precision = 0.0
cumulative_recall = 0.0
# Set the number of threads; you can increase this if more physical
# cores are available. Note that macOS systems default to a single
# thread if OpenMP is not supported.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Fit a WARP model - this is generally the model with the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

# Compute and print the AUC score.
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

# We pass in the train interactions to exclude them from predictions.
# This is to simulate a recommender system where we do not
# re-recommend things the user has already interacted with in the train set.
test_auc = auc_score(model, test,
                     train_interactions=train,
                     num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

# Set the biases to zero to get rid of the pre-estimated per-item biases.
model.item_biases *= 0.0

test_auc = auc_score(model, test,
                     train_interactions=train,
                     num_threads=NUM_THREADS).mean()
class LFM(BaseRecommender, DataLoaderSaver):
    """
    Wrapper over the LightFM model
    """

    def __init__(
        self,
        no_components=30,
        k=5,
        n=10,
        learning_schedule="adagrad",
        loss="logistic",
        learning_rate=0.05,
        rho=0.95,
        epsilon=1e-06,
        item_alpha=0.0,
        user_alpha=0.0,
        max_sampled=10,
        random_state=42,
        epochs=20,
        show_progress=True,
    ):
        """
        Source of descriptions:
        https://making.lyst.com/lightfm/docs/_modules/lightfm/lightfm.html#LightFM

        A hybrid latent representation recommender model.

        The model learns embeddings (latent representations in a high-dimensional
        space) for users and items in a way that encodes user preferences over
        items. When multiplied together, these representations produce scores for
        every item for a given user; items scored highly are more likely to be
        interesting to the user.

        The user and item representations are expressed in terms of representations
        of their features: an embedding is estimated for every feature, and these
        features are then summed together to arrive at representations for users
        and items. For example, if the movie 'Wizard of Oz' is described by the
        following features: 'musical fantasy', 'Judy Garland', and 'Wizard of Oz',
        then its embedding will be given by taking the features' embeddings and
        adding them together. The same applies to user features.

        The embeddings are learned through `stochastic gradient descent
        <http://cs231n.github.io/optimization-1/>`_ methods.

        Four loss functions are available:

        - logistic: useful when both positive (1) and negative (-1) interactions
          are present.
        - BPR: Bayesian Personalised Ranking [1]_ pairwise loss. Maximises the
          prediction difference between a positive example and a randomly chosen
          negative example. Useful when only positive interactions are present
          and optimising ROC AUC is desired.
        - WARP: Weighted Approximate-Rank Pairwise [2]_ loss. Maximises the rank
          of positive examples by repeatedly sampling negative examples until a
          rank-violating one is found. Useful when only positive interactions are
          present and optimising the top of the recommendation list (precision@k)
          is desired.
        - k-OS WARP: k-th order statistic loss [3]_. A modification of WARP that
          uses the k-th positive example for any given user as a basis for
          pairwise updates.

        Two learning rate schedules are available:

        - adagrad: [4]_
        - adadelta: [5]_

        Parameters
        ----------

        no_components: int, optional
            the dimensionality of the feature latent embeddings.
        k: int, optional
            for k-OS training, the k-th positive example will be selected from
            the n positive examples sampled for every user.
        n: int, optional
            for k-OS training, maximum number of positives sampled for each
            update.
        learning_schedule: string, optional
            one of ('adagrad', 'adadelta').
        loss: string, optional
            one of ('logistic', 'bpr', 'warp', 'warp-kos'): the loss function.
        learning_rate: float, optional
            initial learning rate for the adagrad learning schedule.
        rho: float, optional
            moving average coefficient for the adadelta learning schedule.
        epsilon: float, optional
            conditioning parameter for the adadelta learning schedule.
        item_alpha: float, optional
            L2 penalty on item features. Tip: setting this number too high can
            slow down training. One good way to check is to see whether the
            final weights in the embeddings turned out to be mostly zero. The
            same idea applies to the user_alpha parameter.
        user_alpha: float, optional
            L2 penalty on user features.
        max_sampled: int, optional
            maximum number of negative samples used during WARP fitting.
            It requires a lot of sampling to find negative triplets for users
            that are already well represented by the model; this can lead to
            very long training times and overfitting. Setting this to a higher
            number will generally lead to longer training times, but may in some
            cases improve accuracy.
        random_state: int seed, RandomState instance, or None
            The seed of the pseudo random number generator to use when shuffling
            the data and initializing the parameters.
        epochs: int, optional
            number of epochs to run
        """

        super().__init__()

        self.model = LightFM(
            no_components=no_components,
            k=k,
            n=n,
            learning_schedule=learning_schedule,
            loss=loss,
            learning_rate=learning_rate,
            rho=rho,
            epsilon=epsilon,
            item_alpha=item_alpha,
            user_alpha=user_alpha,
            max_sampled=max_sampled,
            random_state=random_state,
        )
        self.epochs = epochs

        # data
        self.interactions = None
        self.train_ui = None
        self.user_id_code = None
        self.user_code_id = None
        self.item_code_id = None

        self.show_progress = show_progress

    def preprocess(self):
        """
        Prepare interactions dataset for training model
        """
        data = self.interactions.copy()
        data["event_value"] = 1

        self.user_code_id = dict(enumerate(data["user"].unique()))
        self.user_id_code = {v: k for k, v in self.user_code_id.items()}
        data["user_code"] = data["user"].apply(self.user_id_code.get)

        self.item_code_id = dict(enumerate(data["item"].unique()))
        item_id_code = {v: k for k, v in self.item_code_id.items()}
        data["item_code"] = data["item"].apply(item_id_code.get)

        self.train_ui = sparse.csr_matrix(
            (data["event_value"], (data["user_code"], data["item_code"]))
        )

    def fit(self):
        """
        Fit the model
        """
        self.model.fit(
            self.train_ui,
            epochs=self.epochs,
            num_threads=multiprocessing.cpu_count(),
            verbose=self.show_progress,
        )

    def recommend(
        self,
        target_users,
        n_recommendations,
        filter_out_interacted_items=True,
        show_progress=True,
    ) -> pd.DataFrame:
        """
        Recommends n_recommendations items for target_users

        :return:
            pd.DataFrame (user, item_1, item_2, ..., item_n)
        """
        items_to_recommend = np.arange(len(self.item_code_id))

        with ThreadPool() as thread_pool:
            recommendations = list(
                tqdm(
                    thread_pool.imap(
                        partial(
                            self.recommend_per_user,
                            n_recommendations=n_recommendations,
                            items_to_recommend=items_to_recommend,
                        ),
                        target_users,
                    ),
                    disable=not self.show_progress,
                )
            )

        return pd.DataFrame(recommendations)

    def recommend_per_user(self, user, n_recommendations, items_to_recommend):
        """
        Recommends n items per user

        :param user: User id
        :param n_recommendations: Number of recommendations
        :return: list of format [user_id, item1, item2 ...]
        """
        u_code = self.user_id_code.get(user)

        if u_code is not None:
            interacted_items = self.train_ui.indices[
                self.train_ui.indptr[u_code] : self.train_ui.indptr[u_code + 1]
            ]
            scores = self.model.predict(int(u_code), items_to_recommend)

            # Over-fetch by the number of already-seen items, then filter them out.
            item_recommendations = items_to_recommend[np.argsort(-scores)][
                : n_recommendations + len(interacted_items)
            ]
            item_recommendations = [
                self.item_code_id[item]
                for item in item_recommendations
                if item not in interacted_items
            ]

            return (
                [user]
                + item_recommendations
                + [None] * (n_recommendations - len(item_recommendations))
            )
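# A minimal usage sketch of the wrapper above, assuming an interactions
# DataFrame with the "user" and "item" columns that preprocess() expects; the
# data here is hypothetical:
import pandas as pd

interactions = pd.DataFrame(
    {"user": ["u1", "u1", "u2", "u3"], "item": ["i1", "i2", "i2", "i3"]}
)

recommender = LFM(no_components=16, loss="warp", epochs=5, show_progress=False)
recommender.interactions = interactions
recommender.preprocess()  # builds the sparse user-item matrix and id mappings
recommender.fit()         # trains the underlying LightFM model
top = recommender.recommend(target_users=["u1", "u2"], n_recommendations=2)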
def recommendations():
    """
    Render the trail_recommendations.html page

    Args: Nothing

    Returns: the trail_recommendations.html template; this includes hiking trail
        recommendations based on user input. Up to 10 trails are provided. Trail
        options are presented in cards that include a photo taken of the trail,
        a short description of the trail, and a link to the trail profile page
        on AllTrails.com
    """
    # Gather user input from ideal hike text selection
    user_input = request.form.getlist('user_feature_options[]')
    input_user_features = pd.DataFrame([" ".join(user_input)])

    # Gather user filters - location, feature1, feature2
    user_location = request.form['user_location']
    trail_feature_select1 = request.form['trail_feature_select1']
    trail_feature_select2 = request.form['trail_feature_select2']

    # Parse user input
    user_feature_new = parse_input_descriptors(input_user_features)

    # Make connection to database
    # Database name
    dbname = 'pnw_hike'
    # Set postgres username
    username = '******'

    ## Using an engine to connect to the postgres db
    engine = create_engine('postgres://%s:insight@localhost/%s' % (username, dbname),
                           paramstyle="format")

    # Connect to make queries using psycopg2
    con = psycopg2.connect(database=dbname, user=username,
                           password='******', port=5432)

    # User features
    user_features_query = """
    SELECT * FROM user_features;
    """
    user_features_from_sql = pd.read_sql_query(user_features_query, con,
                                               index_col='review_author')

    # Trail features raw
    trail_reviews_raw_query = """
    SELECT * FROM trail_reviews_raw;
    """
    trail_reviews_raw_from_sql = pd.read_sql_query(trail_reviews_raw_query, con,
                                                   index_col="index")

    # Trail urls and filtering info
    trail_urls_info_query = """
    SELECT * FROM trail_urls_info;
    """
    trail_urls_info = pd.read_sql_query(trail_urls_info_query, con,
                                        index_col="index")

    # User features
    user_features_df = user_features_from_sql.drop(
        ["index", "review_text", "clean_review"], axis=1)
    user_features = user_features_df.fillna(0)

    # Trail features, filling blanks with 0
    trail_features = trail_reviews_raw_from_sql.fillna(0)

    # Convert user-feature space to a sparse matrix
    user_features = sparse.csr_matrix(user_features.values)

    # Create a large sparse dataframe of extant user reviews/ratings
    interactions = create_interaction_matrix(trail_reviews_raw_from_sql,
                                             user_col='review_author',
                                             item_col='trail_name',
                                             rating_col='review_rating',
                                             norm=False,
                                             threshold=None)

    # Align users in the interaction and user matrices due to dropping some trails.
    # Identify which users are in the interaction matrix and not in user feature space
    key_diff = set(interactions.index).difference(user_features_from_sql.index)
    where_diff = interactions.index.isin(key_diff)

    # Filter interactions based on users present in user features
    interactions = interactions.loc[~interactions.index.isin(
        interactions[where_diff].index)]

    # Convert the sparse dataframe into a sparse matrix
    interactions_matrix = sparse.csr_matrix(interactions.values)

    # Prep for trail dict
    trail_urls = trail_urls_info[['trail_name', 'trail_url']]

    # Convert new user features to a sparse matrix
    user_feature_new_sparse = sparse.csr_matrix(user_feature_new.values)

    ## Combine new user-feature sparse matrix with current users' sparse matrix
    new_user_features = concatenate_csc_matrices_by_columns(user_feature_new_sparse,
                                                            user_features)

    # Incorporate the new user's selections into the interaction matrix
    interactions_new_user_df = pd.DataFrame().reindex_like(interactions).iloc[0:0]
    interactions_new_user_df.loc["new_user"] = 0
    new_interactions_df = pd.concat([interactions_new_user_df, interactions])
    interactions_new_user = sparse.csr_matrix(interactions_new_user_df.values)
    new_interactions_matrix = concatenate_csc_matrices_by_columns(
        interactions_new_user, interactions_matrix)

    # Make trail dict
    trails_in_interaction_matrix = pd.DataFrame(interactions_new_user_df.columns.T)
    trail_dict_prep = trails_in_interaction_matrix.merge(trail_urls, on='trail_name')

    # Add unique identifier to trail dict
    trail_dict_prep['trail_id'] = trail_dict_prep.index + 1

    # Make trail dict
    trails_dict = create_trail_dict(trail_dict_prep, id_col='trail_name',
                                    name_col='trail_id')

    # With the new interactions df we can define a user dictionary
    user_dict = create_user_dict(interactions=new_interactions_df)

    # Run model with new user features and interactions
    NUM_THREADS = 4  # The t2.xlarge instance supports up to 4 cores; we'll use all 4 here
    NUM_COMPONENTS = 30
    NUM_EPOCHS = 5
    ITEM_ALPHA = 1e-6

    # Let's train a WARP model: these generally have the best performance.
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS,
                    random_state=15)

    # Fit model
    model = model.fit(interactions=new_interactions_matrix,
                      user_features=new_user_features,
                      epochs=NUM_EPOCHS,
                      num_threads=NUM_THREADS)

    # Run the model
    trail_names, trail_overviews, trail_urls, card_image_urls = new_user_recommendation(
        model,
        new_interactions_df,
        user_id="new_user",
        trail_urls_info=trail_urls_info,
        user_location=user_location,
        trail_feature_select1=trail_feature_select1,
        trail_feature_select2=trail_feature_select2,
        user_dict=user_dict,
        trail_dict=trails_dict,
        nrec_items=1500,
        threshold=4)

    # Change 'e' if selected
    if user_location == 'e':
        user_location = "all of the Pacific Northwest"

    return render_template('trail_recommendations.html',
                           trail_names=trail_names,
                           trail_overviews=trail_overviews,
                           trail_urls=trail_urls,
                           card_image_urls=card_image_urls,
                           trail_feature_select1=trail_feature_select1,
                           trail_feature_select2=trail_feature_select2,
                           user_location=user_location,
                           input_user_features=user_input)
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM

data = fetch_movielens(min_rating=4.0)

# print(repr(data['train']))
# print(repr(data['test']))

model = LightFM(loss='warp')
model.fit(data['train'], epochs=50, num_threads=4)


def sample_recommendation(model, data, user_ids):
    n_users, n_items = data['train'].shape

    for user_id in user_ids:
        # movies they already like
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        # movies our model predicts they will like
        scores = model.predict(user_id, np.arange(n_items))

        # rank them in order of most liked to least
        top_items = data['item_labels'][np.argsort(-scores)]

        # print out the results
        print("User %s" % user_id)
        print("     Known positives:")
        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")
        for x in top_items[:3]:
            print("        %s" % x)
for x in qd.getProfiles()))
print(user_features)
'''

# Creating a user feature
# Split the set into train and test
train, test = random_train_test_split(interactions, test_percentage=0.2,
                                      random_state=None)

# Start training the model
print("--- Start model training ---")
model = LightFM(no_components=1, learning_rate=0.027, loss='warp')
model.fit(train, item_features=item_features, epochs=100, num_threads=4,
          verbose=False)
# model.fit(train, epochs=12, num_threads=4)

modelnofeatures = LightFM(no_components=1, learning_rate=0.027, loss='warp')
modelnofeatures.fit(train, epochs=100, num_threads=4, verbose=False)
# model.fit(train, epochs=12, num_threads=4)

'''
with open('saved_model', 'wb') as f:
    saved_model = {'model': model}
    pickle.dump(saved_model, f)
'''
print(test.shape)

(train_interactions, train_weights) = dataset.build_interactions(train[[3, 1]].values)
(test_interactions, test_weights) = dataset.build_interactions(test[[3, 1]].values)

# arr = sparse.coo_matrix(np.tile(list(range(2, 10)), (len(items), 1)))
# items['features'] = arr.toarray().tolist()
#
# item_features = dataset.build_item_features()
#
# items2 = items.to_dict('records')

from lightfm import LightFM

model = LightFM(loss='warp', random_state=0)
model.fit(train_interactions, epochs=100, num_threads=1)

from lightfm.evaluation import recall_at_k
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import reciprocal_rank

print("Train recall@7: %.2f" % recall_at_k(model, train_interactions, k=7).mean())
print("Test recall@7: %.2f" % recall_at_k(model, test_interactions, train_interactions, k=7).mean())

print("Train precision@7: %.2f" % precision_at_k(model, train_interactions, k=7).mean())
print("Test precision@7: %.2f" % precision_at_k(model, test_interactions, train_interactions, k=7).mean())

print("Train reciprocal rank: %.2f" % reciprocal_rank(model, train_interactions).mean())
print("Test reciprocal rank: %.2f" % reciprocal_rank(model, test_interactions, train_interactions).mean())
#pip install numpy
#pip install scipy
#pip install lightfm (lets us use a number of popular recommendation algorithms)

import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM
from fetch_amazonratingonly import fetch_amazonratingonly

#fetch data and format it
data = fetch_amazonratingonly(min_rating=3.0)

#create WARP model
model = LightFM(loss='warp')  #Weighted Approximate-Rank Pairwise

#train model
model.fit(data['matrix'], epochs=30, num_threads=2)

#other available losses: 'logistic', 'warp', 'bpr', 'warp-kos'
# model2 = LightFM(loss='warp-kos')  #a modification of WARP that uses the k-th positive
#                                    #example for any given user as a basis for pairwise updates
# model2.fit(data['matrix'], epochs=30, num_threads=2)

# model3 = LightFM(loss='bpr')  #Bayesian Personalised Ranking pairwise loss
# model3.fit(data['matrix'], epochs=30, num_threads=2)

# model4 = LightFM(loss='logistic')  #logistic loss, for explicit +1/-1 feedback
# model4.fit(data['matrix'], epochs=30, num_threads=2)
# from lightfm.datasets import fetch_movielens
# data = fetch_movielens(min_rating=5.0)
# plt.imshow(data['item_features'].toarray())

# data_dict = {'train': train_mat,
#              'test': test_mat,
#              'item_features': ,
#              'item_feature_labels': ,
#              'item_labels': }

## Create a model instance with the desired latent dimensionality:
model = LightFM(no_components=30)

## Assuming train is a (no_users, no_items) sparse matrix (with 1s denoting
## positive, and -1s negative interactions), you can fit a traditional matrix
## factorization model by calling:
model.fit(train_mat, epochs=20)

print("Train precision: %.2f" % precision_at_k(model, train_mat, k=5).mean())
print("Test precision: %.2f" % precision_at_k(model, test_mat, k=5).mean())

## This will train a traditional MF model, as no user or item features have
## been supplied. To get predictions, call model.predict:
predictions = model.predict(test_user_ids, test_item_ids)

model = LightFM(loss='warp', random_state=2016)
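# A self-contained sketch of the traditional-MF flow described above, on toy
# data (the matrix below is made up for illustration):
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

# Toy (no_users, no_items) matrix: 1 = positive, -1 = negative interaction.
toy_train = sp.coo_matrix(np.array([[1, -1, 1],
                                    [-1, 1, 1]]))

mf_model = LightFM(no_components=30)
mf_model.fit(toy_train, epochs=20)

# Score user 0 against all items; higher scores mean stronger predicted preference.
toy_scores = mf_model.predict(0, np.arange(toy_train.shape[1]))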
# free memory
del job_embeddings
del resume_embeddings
del interaction_sparse
gc.collect()

##### create and train LightFM model ######
NUM_THREADS = 4
NUM_COMPONENTS = 30
NUM_EPOCHS = 50
ITEM_ALPHA = 1e-6
K_num = 5

model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

%time model = model.fit(interactions=train, user_features=job_features_sparse, item_features=resume_features_sparse, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

%time test_precision = precision_at_k(model, test, user_features=job_features_sparse, item_features=resume_features_sparse, k=K_num).mean()
print('test precision at k: %s' % test_precision)

%time train_precision = precision_at_k(model, train, user_features=job_features_sparse, item_features=resume_features_sparse, k=K_num).mean()
print('train precision at k: %s' % train_precision)

%time test_auc = auc_score(model, test, user_features=job_features_sparse, item_features=resume_features_sparse, num_threads=NUM_THREADS).mean()
print('test AUC: %s' % test_auc)

%time train_auc = auc_score(model, train, user_features=job_features_sparse, item_features=resume_features_sparse, num_threads=NUM_THREADS).mean()
print('train AUC: %s' % train_auc)
import numpy
from lightfm import LightFM
from ratingsData import fetch_ratings

#fetch dataset using our own fetch_ratings method
data = fetch_ratings()

#create a model using the LightFM class of the lightfm module
model = LightFM(loss='warp')
model.fit(data['ratings'], epochs=30, num_threads=2)


def recommend_match(model, data, user_ids):
    n_user, n_matches = data['ratings'].shape

    for user_id in user_ids:
        scores = model.predict(user_id, numpy.arange(n_matches))
        topScores = numpy.argsort(-scores)[:3]

        print('recommendation for user : %s' % user_id)
        for x in topScores[:3]:
            print(" %s" % x)


recommend_match(model, data, [1])
def runMF(self, interactions, n_components, learning_rate, loss, k, epoch, n_jobs):
    from lightfm import LightFM

    model = LightFM(no_components=n_components,
                    learning_rate=learning_rate,
                    loss=loss,
                    k=k)
    model.fit(interactions, epochs=epoch, num_threads=n_jobs)
    return model
games_df.columns = ['user_id', 'title', 'action', 'hours', 'hz']
games_df = games_df[games_df['action'] == 'purchase']

# Encode titles as integer labels.
le = LabelEncoder()
le.fit(games_df['title'])
games_df['title'] = le.transform(games_df['title'])

games_df_pivot = games_df.pivot_table(columns=['title'],
                                      index=['user_id'],
                                      values=['hours'])
games_df_pivot.fillna(value=0, inplace=True)

# 80/20 user split for train and test.
games_df_pivot_train = games_df_pivot.sample(frac=0.8)
games_df_pivot_test = games_df_pivot.loc[games_df_pivot.index.difference(
    games_df_pivot_train.index)]

games_df_pivot_train_sparse = coo_matrix(games_df_pivot_train.values)
games_df_pivot_test_sparse = coo_matrix(games_df_pivot_test.values)

model = LightFM(loss='warp', random_state=42)
model.fit(games_df_pivot_train_sparse, epochs=150, num_threads=2)

return model.predict([3], [1])

# print("Train precision: %.2f" % precision_at_k(model, games_df_pivot_train_sparse, k=5).mean())
# print("Test precision: %.2f" % precision_at_k(model, games_df_pivot_test_sparse, k=5).mean())
# pickle.dump(model, open('model.pickle', 'wb'))
import numpy as np
import pandas as pd
from lightfm.datasets import fetch_movielens
from lightfm import LightFM
from movies import fetch_movies
import random

movies = fetch_movies()

model = LightFM(loss="warp")
model.fit(movies, epochs=30, num_threads=2)

# The '::' separator requires the python parsing engine.
users = pd.read_csv('users.dat', sep='::', engine='python')
movies_data = pd.read_csv('movies.dat', sep='::', engine='python')

user1 = random.choice(users['UserID'])
user2 = random.choice(users['UserID'])
user3 = random.choice(users['UserID'])


def get_recommendation(users, model, movies_matrix, movies_data):
    n_items = movies_matrix.shape[1]

    for user in users:
        scores = model.predict(user, np.arange(n_items))
        topscore = np.argsort(-scores)[:3]

        print('For User ', user)
        print('\t Recommended Movies :')
        for movie in topscore:
            movie_index = np.where(movie == movies_data['MovieID'])[0]
            movie_title = movies_data['Title'][movie_index[0]]
            print('\t\t', movie_title)
def process_mpd(playlists_path, target_playlists, output_file, prev_songs_window):
    max_prev_song = 0
    previous_tracks = defaultdict(lambda: defaultdict(int))
    playlists_tracks = []
    playlists = []
    playlists_extra = {'name': []}

    filenames = os.listdir(playlists_path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((playlists_path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()

            mpd_slice = json.loads(js)

            for playlist in mpd_slice['playlists']:
                nname = normalize_name(playlist['name'])
                playlists_extra['name'].append(nname)

                tracks = defaultdict(int)
                sorted_tracks = sorted(playlist['tracks'], key=lambda k: k['pos'])
                prev_track = []

                for track in sorted_tracks:
                    tracks[track['track_uri']] += 1

                    # Weight co-occurrence by proximity within the window.
                    curr_prev_tracks = len(prev_track)
                    for i, song_in_window in enumerate(prev_track):
                        previous_tracks[song_in_window][
                            track['track_uri']] += (i + 1) / curr_prev_tracks
                        previous_tracks[track['track_uri']][
                            song_in_window] += (i + 1) / curr_prev_tracks
                        #previous_tracks[song_in_window][track['track_uri']] += 1
                        #previous_tracks[track['track_uri']][song_in_window] += 1

                        max_prev_song = max(
                            max_prev_song,
                            previous_tracks[track['track_uri']][song_in_window])
                        max_prev_song = max(
                            max_prev_song,
                            previous_tracks[song_in_window][track['track_uri']])

                    if len(prev_track) == prev_songs_window:
                        prev_track.pop(0)
                    prev_track.append(track['track_uri'])

                playlists_tracks.append(tracks)
                playlists.append(str(playlist['pid']))

    top_pop = []
    for i in previous_tracks.keys():
        top_pop.append((i, np.sum(list(previous_tracks[i].values()))))
    top_pop = sorted(top_pop, key=lambda x: x[1], reverse=True)[:10000]
    top_pop = [t[0] for t in top_pop]

    # Add playlists on the testing set
    test_playlists = []
    target = json.load(open(target_playlists))
    train_playlists_count = len(playlists)
    test_playlists_recommended_sum = []

    for playlist in target["playlists"]:
        nname = ""
        if 'name' in playlist:
            nname = normalize_name(playlist['name'])
        playlists_extra['name'].append(nname)
        playlists.append(str(playlist['pid']))
        test_playlists.append(str(playlist['pid']))

        # Cold-start playlists fall back to the most popular tracks.
        if len(playlist['tracks']) == 0:
            test_playlists_recommended_sum.append(top_pop)
            playlists_tracks.append({})
            continue

        tracks = defaultdict(int)
        for track in playlist['tracks']:
            tracks[track['track_uri']] += 1
        playlists_tracks.append(tracks)

        recommended_pop = defaultdict(list)
        for t in tracks.keys():
            for pt in previous_tracks[t].keys():
                if pt not in tracks:
                    recommended_pop[pt].append(previous_tracks[t][pt] / max_prev_song)

        recommended_pop_sum = [(t, np.sum(recommended_pop[t]))
                               for t in recommended_pop.keys()]
        recommended_pop_sum = sorted(recommended_pop_sum, key=lambda x: x[1],
                                     reverse=True)
        recommended_pop_sum = [t[0] for t in recommended_pop_sum]
        test_playlists_recommended_sum.append(recommended_pop_sum)

    print("Data loaded. Creating features matrix")
    dv = DictVectorizer()
    interaction_matrix = dv.fit_transform(playlists_tracks)

    lb = LabelBinarizer(sparse_output=True)
    pfeat = lb.fit_transform(playlists_extra['name'])
    playlist_features = pfeat

    # Need to hstack playlist_features
    eye = sparse.eye(playlist_features.shape[0], playlist_features.shape[0]).tocsr()
    playlist_features_concat = sparse.hstack((eye, playlist_features))

    item_prev = []
    highlevel = []
    for track in dv.feature_names_:
        try:
            f = get_audio_features_dict(track.replace('spotify:track:', ''), False)
        except ValueError:
            print("Failed loading json", track)
            f = None

        curr_highlevel = {}
        if f is not None:
            curr_highlevel = {k: v for k, v in f.items() if 'class_f' in k}
        highlevel.append(curr_highlevel)

    ifeat_highlevel = DictVectorizer().fit_transform(highlevel)
    item_prev = ifeat_highlevel

    eye = sparse.eye(item_prev.shape[0], item_prev.shape[0]).tocsr()
    item_feat = sparse.hstack((eye, item_prev))

    print("Features matrix created. Training model")
    model = LightFM(loss='warp', no_components=200, max_sampled=30,
                    item_alpha=1e-06, user_alpha=1e-06, random_state=SEED)
    model = model.fit(interaction_matrix,
                      user_features=playlist_features_concat,
                      item_features=item_feat,
                      epochs=150,
                      num_threads=32)
    print("Model Trained")

    user_biases, user_embeddings = model.get_user_representations(
        playlist_features_concat)
    item_biases, item_embeddings = model.get_item_representations(item_feat)

    # Fuse LightFM scores with the popularity-based scores.
    fuse_perc = 0.7

    with open(output_file, 'w') as fout:
        print('team_info,cocoplaya,creative,[email protected]', file=fout)

        for i, playlist in enumerate(test_playlists):
            playlist_pos = train_playlists_count + i
            y_pred = user_embeddings[playlist_pos].dot(item_embeddings.T) + item_biases

            topn = np.argsort(-y_pred)[:len(playlists_tracks[playlist_pos]) + 4000]
            rets = [(dv.feature_names_[t], float(y_pred[t])) for t in topn]

            songids = [
                s for s, _ in rets if s not in playlists_tracks[playlist_pos]
            ]
            songids_dict = {s: 1 for s in songids}

            max_score = max(len(songids), len(test_playlists_recommended_sum[i]))
            pop_sum = {
                s: (max_score - p)
                for p, s in enumerate(test_playlists_recommended_sum[i])
            }

            fuse_sum = []
            for p, s in enumerate(songids):
                pop_val_sum = 0
                if s in pop_sum:
                    pop_val_sum = pop_sum[s]
                fuse_sum.append(
                    (s, ((max_score - p) * fuse_perc + pop_val_sum * (1 - fuse_perc)) / 2))

            for s in pop_sum.keys():
                if s not in songids_dict:
                    fuse_sum.append((s, (pop_sum[s] * (1 - fuse_perc)) / 2))

            fuse_sum = sorted(fuse_sum, key=lambda x: x[1], reverse=True)

            print(' , '.join([playlist] + [x[0] for x in fuse_sum[:500]]),
                  file=fout)
# Percentage of the ratings matrix that is filled in.
sparsity = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100

X = csr_matrix(ratings)

n_users, n_items = ratings_df.shape
user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()

Xcoo = X.tocoo()
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data))

train, test = random_train_test_split(interactions)

model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

# Generating the list of artists at start-up:
artIDs = ap['artistID'].unique()
numarts = len(ap['artistID'].unique())
listart = ""
for it, artName in enumerate(ap['name'].unique()):
    listart = listart + '<input type="checkbox" name="' + str(
        artIDs[it]) + '" value="' + str(artName) + '">' + artName + '<br>'

# get_recommendation from Jupyter notebook:
def get_recommendation(userid, ratings=ratings):
    X = csr_matrix(ratings)
    svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=0)
    X_matrix_svd = svd.fit_transform(X)
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM

data = fetch_movielens(min_rating=4.0)

print(repr(data["train"]))
print(repr(data["test"]))

# creating a model
model = LightFM(loss="warp")

# training the model
model.fit(data["train"], epochs=30, num_threads=2)


def recommendations(model, data, user_ids):
    # num of users and movies in the matrix
    number_users, number_items = data["train"].shape

    for user_id in user_ids:
        # movies they already like
        liked_movies = data["item_labels"][data["train"].tocsr()[user_id].indices]

        # movies we predict they will like
        M_list = model.predict(user_id, np.arange(number_items))

        # rank them in order of most liked to least
        top_movies = data["item_labels"][np.argsort(-M_list)]
def main(spark, train_data, validation_data):
    spark_session = SparkSession.builder.appName('extension1').master('yarn') \
        .config('spark.executor.memory', '15g') \
        .config('spark.driver.memory', '15g').getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    #####################################################################
    # LIGHTFM Model

    # Read data from parquet
    train_df = spark.read.parquet('hdfs:/user/smt570/small_train.parquet')
    train_df.createOrReplaceTempView('train')
    train_df = train_df.select('user_id', 'book_id', 'rating')

    val = spark.read.parquet('hdfs:/user/smt570/small_val.parquet')
    val.createOrReplaceTempView('val')
    val_df = val.select('user_id', 'book_id', 'rating')

    # Remove ratings less than 3 from the ground truth
    val_df = val_df.filter(val_df.rating >= 3)

    # All positive instances for training (rating >= 3) keep their values;
    # anything else becomes 0.
    eq = udf(lambda x: x if x >= 3 else 0, IntegerType())
    train_df = train_df.withColumn('rating', eq(train_df.rating))

    # Need to sort first
    train_df = train_df.orderBy('user_id')

    print('Building input sparse matrices...')

    # Convert to pandas for pre-processing
    train_df = train_df.toPandas()
    val_df = val_df.toPandas()

    # Initialize dicts
    transf_train = dict()
    transf_val = dict()
    enc = preprocessing.LabelEncoder()

    # Transform data values for train and val
    transf_train['user_id'] = enc.fit_transform(train_df['user_id'].values)
    transf_train['book_id'] = enc.fit_transform(train_df['book_id'].values)
    transf_train['rating'] = enc.fit_transform(train_df['rating'].values)
    transf_val['user_id'] = enc.fit_transform(val_df['user_id'].values)
    transf_val['book_id'] = enc.fit_transform(val_df['book_id'].values)
    transf_val['rating'] = enc.fit_transform(val_df['rating'].values)

    # Get size of the COO matrix
    n_users = len(np.unique(transf_train['user_id']))
    n_items = len(np.unique(transf_train['book_id']))

    # Create COO matrices
    train = coo_matrix((transf_train['rating'],
                        (transf_train['user_id'], transf_train['book_id'])),
                       shape=(n_users, n_items))
    val = coo_matrix((transf_val['rating'],
                      (transf_val['user_id'], transf_val['book_id'])),
                     shape=(n_users, n_items))

    # Build LightFM model
    print('Building LightFM model...')
    model = LightFM(loss='warp', no_components=30)

    # Train LightFM model and check time to fit
    print('Training LightFM model...')
    start_time = time.time()
    model.fit(train)
    print('Run time: {} mins'.format((time.time() - start_time) / 60))

    # Get data ready for evaluation; use top-k predictions for metrics
    print('Evaluating...')
    pak_train = precision_at_k(model, train, k=125).mean()
    pak_val = precision_at_k(model, val, k=125).mean()
    print('Train precision@K = {}:'.format(pak_train))
    print('Test precision@K = {}:'.format(pak_val))

    auc_train = auc_score(model, train).mean()
    auc_test = auc_score(model, val).mean()
    print("Train AUC Score: {}".format(auc_train))
    print("Test AUC Score: {}".format(auc_test))

    ###################################################################
    # ALS Model

    # Read data from parquet
    train = spark.read.parquet(train_data)
    train.createOrReplaceTempView('train')
    train_data = train.select('user_id', 'book_id', 'rating')
    train_data = train_data.filter(train_data.rating != 0)

    val = spark.read.parquet(validation_data)
    val.createOrReplaceTempView('val')
    val_data = val.filter(val.rating >= 3)
    val_data = val_data.select('user_id', 'book_id', 'rating')

    # Creating the ground-truth df
    w = Window.partitionBy('user_id').orderBy(col('rating').desc())
    actual = val_data.withColumn("sorted_vals_by_rating",
                                 F.collect_list('book_id').over(w))
    actual = actual.groupBy('user_id').agg(
        F.max('sorted_vals_by_rating').alias('items'))

    # Go through parameters
    # Build ALS model
    print('Building ALS model...')
    als = ALS(maxIter=5, regParam=0.1, rank=2, userCol="user_id",
              itemCol="book_id", ratingCol="rating",
              coldStartStrategy="drop", nonnegative=True)

    # Train ALS model
    print('Training ALS model...')
    start_time = time.time()
    model = als.fit(train_data)
    print('Run time: {} mins'.format((time.time() - start_time) / 60))

    # Make predictions on val_data
    print('Making predictions...')
    predictions = model.transform(val_data)

    ####
    # MAP (Method 1)
    predictions = model.transform(val_data)

    # The model makes top-k predictions for all users
    preds = model.recommendForAllUsers(125)

    # Remove StructType
    preds = preds.withColumn('recommendations', explode('recommendations')).select('*')
    preds = preds.select('user_id', 'recommendations.*')

    # Build predictions df: group books by user_id, store as a single array of
    # books in the rating column
    w = Window.partitionBy('user_id').orderBy(col('rating').desc())
    perUserPredictedItemsDF = preds.select(
        'user_id', 'book_id', 'rating',
        F.rank().over(w).alias('rank')
    ).where('rank <= 500').groupBy('user_id').agg(
        expr('collect_list(book_id) as books'))

    windowSpec = Window.partitionBy('user_id').orderBy(col('rating').desc())
    perUserActualItemsDF = val.select(
        'user_id', 'book_id', 'rating',
        F.rank().over(windowSpec).alias('rank')
    ).groupBy('user_id').agg(expr('collect_list(book_id) as books'))

    # Build a df of predictions and ground truth, then convert to an RDD
    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user_id'
    ).rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    pak = rankingMetrics.precisionAt(125)
    print('Precision at k is {}'.format(pak))
pos1_train, pos1_test = random_train_test_split(pos1_spr,
                                                test_percentage=0.25,
                                                random_state=None)

### create and train LightFM model ###
NUM_THREADS = 4
NUM_COMPONENTS = 5
NUM_EPOCHS = 30
ITEM_ALPHA = 1e-6

pos1_model = LightFM(loss='warp',
                     item_alpha=ITEM_ALPHA,
                     no_components=NUM_COMPONENTS)

%time pos1_model = pos1_model.fit(pos1_train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

train_auc = auc_score(pos1_model, pos1_train, num_threads=NUM_THREADS).mean()
print('train AUC: %s' % train_auc)
test_auc = auc_score(pos1_model, pos1_test, num_threads=NUM_THREADS).mean()
print('test AUC: %s' % test_auc)

train_precision = precision_at_k(pos1_model, pos1_train, k=10).mean()
print('train precision at k: %s' % train_precision)
test_precision = precision_at_k(pos1_model, pos1_test, k=10).mean()
print('test precision at k: %s' % test_precision)

#################### 4 original resume and FULL job description with tf-idf embeddings ####################
stages = []
for one_job_id in jobIDs:
    pos_tfidf = GenerateTfidfEmbedding(one_job_id, job_text, resume_text)
from lightfm import LightFM

# Fetch data and format it
data = fetch_movielens(min_rating=4.0)

# Print training and testing data
print(repr(data['train']))
print(repr(data['test']))

# Create one model per loss function
model1 = LightFM(loss='warp')
model2 = LightFM(loss='logistic')
model3 = LightFM(loss='bpr')
model4 = LightFM(loss='warp-kos')

# Train the models
model1.fit(data['train'], epochs=30, num_threads=2)
model2.fit(data['train'], epochs=30, num_threads=2)
model3.fit(data['train'], epochs=30, num_threads=2)
model4.fit(data['train'], epochs=30, num_threads=2)


def sample_recommendation(model, data, user_ids):
    # Number of users and movies in training data
    n_users, n_items = data['train'].shape

    # Generate recommendations for each user we input
    for user_id in user_ids:
        # Movies they already like. CSR stands for Compressed Sparse Row format.
        # We find all the movies in the training dataset that user_id likes
        # (>= 4.0 rating) and use their indices to index the item labels dataset
        # to get the actual movie names.
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]
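# Since the comment above leans on the CSR layout, here is a tiny illustration
# (toy matrix, hypothetical ratings) of how .tocsr()[user_id].indices yields a
# user's known positives:
import numpy as np
import scipy.sparse as sp

toy = sp.coo_matrix(np.array([[5, 0, 4, 0],
                              [0, 3, 0, 0]]))

# Row 0 of the CSR matrix stores only its non-zero columns: the liked items.
print(toy.tocsr()[0].indices)  # -> [0 2]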
'book_id', 'rating')

# train_csr = scipy.sparse.csr_matrix((train_df['rating'].values, (train_df['user_id'].values, train_df['book_id'].values)))
# val_csr = scipy.sparse.csr_matrix((val_df['rating'].values, (val_df['user_id'].values, val_df['book_id'].values)), shape=train_csr.shape)

assert train_csr.shape == val_csr.shape

for epoch in epochs:
    for rank in ranks:
        model = LightFM(no_components=rank, loss='warp', learning_rate=0.05)

        start = time.time()
        model.fit(train_coo, epochs=epoch, num_threads=10)
        time_taken_to_fit = time.time() - start

        total = 0
        avg_precision = None
        if calculate_precision_at_k:
            for k in ks:
                _p = precision_at_k(model,
                                    test_interactions=val_csr,
                                    train_interactions=train_csr,
                                    k=k)
                avg_precision = _p.sum() / len(_p)
features_generator = ((item_id, ele)
                      for item_id in list_features.keys()
                      for ele in list_features[item_id])
item_features = train.build_item_features(features_generator, normalize=False)
print('End Loading Features.')

# Train the Model
print('Training...')
start = time()
model = LightFM(no_components=args.emb_K,
                loss=args.loss,
                learning_rate=args.lr,
                random_state=0)
model.fit(train_interactions,
          item_features=item_features,
          epochs=args.epoch,
          num_threads=args.num_threads,
          verbose=True)
print('End Training in {0}.'.format(time() - start))

with open(weight_directory + '_step{0}_LFM.pickle'.format(args.epoch), 'wb') as dump:
    pickle.dump(model, dump, protocol=pickle.HIGHEST_PROTOCOL)

# Evaluation
print("Evaluation...")
with open(result_directory + '_top{0}_ep{1}_LFM.tsv'.format(args.topk, args.epoch),
          'w') as out:
    for user_id in range(df_train[0].nunique()):
missing_n = 100
epoch = 5

data = np.load(
    os.path.join(tensorflow_data_3_dir, str(missing_n),
                 'ori_matrix_sample_{}.npy'.format(missing_n)))
a = np.where(data == -1)
data[a[0], a[1]] = 0
print(np.sum(data))

data = coo_matrix(data)
# print(data.toarray())
# repr() converts an object into a form that the interpreter can read.

result = np.zeros(data.shape)

# create model
model = LightFM(no_components=30, loss='bpr')  # bpr = Bayesian Personalised Ranking

print(datetime.datetime.now())
model.fit(data, epochs=epoch, num_threads=2, verbose=True)
print(datetime.datetime.now())

n_users, n_items = data.shape
for i in range(n_users):
    scores = model.predict(i, np.arange(n_items))
    result[i] = scores

np.save(
    os.path.join(baseline_output_dir, 'bpr_{}_wtreview.npy'.format(missing_n)),
    result)
#fetch data and format it
data = fetch_movielens(min_rating=4.0)

#print training and testing data
print(repr(data['train']))
print(repr(data['test']))

#CHALLENGE part 2 of 3 - use 3 different loss functions (so 3 different models),
#compare results, and print results for the best one. Available loss functions
#are warp, logistic, bpr, and warp-kos; a comparison sketch follows this snippet.

#create model
model = LightFM(loss='warp')

#train model
model.fit(data['train'], epochs=30, num_threads=2)

#CHALLENGE part 3 of 3 - Modify this function so that it parses your dataset
#correctly to retrieve the necessary variables (products, songs, tv shows, etc.),
#then print out the recommended results.
def sample_recommendation(model, data, user_ids):
    #number of users and movies in training data
    n_users, n_items = data['train'].shape

    #generate recommendations for each user we input
    for user_id in user_ids:
        #movies they already like
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]
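# One way to tackle the comparison the challenge asks for - a sketch using
# precision_at_k from lightfm.evaluation; the loss list and k=10 are choices
# made here, not part of the original challenge code:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

best_loss, best_precision = None, 0.0
for loss in ('warp', 'logistic', 'bpr', 'warp-kos'):
    candidate = LightFM(loss=loss)
    candidate.fit(data['train'], epochs=30, num_threads=2)
    precision = precision_at_k(candidate, data['test'], k=10).mean()
    print('%s test precision@10: %.3f' % (loss, precision))
    if precision > best_precision:
        best_loss, best_precision = loss, precision

print('Best loss: %s (precision@10: %.3f)' % (best_loss, best_precision))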
from lightfm.data import Dataset

print(get_ratings())

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))

(interactions, weights) = dataset.build_interactions(
    (x['User-ID'], x['ISBN']) for x in get_ratings())
print(repr(interactions))

item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))
print(repr(item_features))

from lightfm import LightFM

model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)

from lightfm.evaluation import precision_at_k

print("Train precision: %.2f" % precision_at_k(model, interactions, k=5).mean())
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score

data = fetch_movielens(min_rating=4.0)

print(repr(data['train']))
print(repr(data['test']))

# model with warp
model_warp = LightFM(loss='warp')
model_warp.fit(data['train'], epochs=30, num_threads=2)

# model with bpr
model_bpr = LightFM(loss='bpr')
model_bpr.fit(data['train'], epochs=30, num_threads=2)


def recommender(model, data, user_ids):
    n_users, n_items = data['train'].shape

    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        scores = model.predict(user_id, np.arange(n_items))
        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")
        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")
        for x in top_items[:3]:
            print("        %s" % x)
# print('Optimal parameters:')
# params = ['epochs', 'learning_rate', 'no_components', 'alpha', 'max_sampled']
# for (p, x_) in zip(params, res_fm.x):
#     print('{}: {}'.format(p, x_))

######## train the model ########
model = LightFM(
    loss='warp',
    learning_rate=0.036281404040243825,
    no_components=29,
    user_alpha=0.00048625731451155697,
    item_alpha=0.00048625731451155697,
    max_sampled=37,
)
# model.fit(train_data, user_features, food_features, epochs=10, num_threads=20)
model.fit(all_data, epochs=197, num_threads=10)

# patks = evaluation.precision_at_k(model, val_data,
#                                   train_interactions=None,
#                                   # user_features=user_features,
#                                   # item_features=food_features,
#                                   k=20, num_threads=20)
# mapatk = np.mean(patks)
# print(mapatk)

######## predict ########
preds = []
food_ids_vocab = np.array(list(food_ids_set))
usr_ids_vocab = np.array(list(usr_ids_set))
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM

# fetch data and format it
data = fetch_movielens(min_rating=4.0)

# print training and testing data
print(repr(data['train']))
print(repr(data['test']))

# create model
model = LightFM(loss='warp')

# train model
model.fit(data['train'], epochs=30, num_threads=2)


def sample_recommendation(model, data, user_ids):
    # number of users and movies in training datasets
    n_users, n_items = data['train'].shape

    # generate recommendations for each user we input
    for user_id in user_ids:
        # movies they already like
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        # movies our model predicts they will like
        scores = model.predict(user_id, np.arange(n_items))

        # rank them in order of most liked to least
        top_items = data['item_labels'][np.argsort(-scores)]
train = data['train']
user_idxs = data['user_idxs']
idx_to_userid = data['idx_to_userid']
userid_to_idx = data['userid_to_idx']
idx_to_itemid = data['idx_to_itemid']
itemid_to_idx = data['itemid_to_idx']

fundid_names_df = pd.read_csv('./funds-dataset/fundid_to_name.csv', encoding='cp950')
fundid_to_names = {}
for d in fundid_names_df.to_dict('records'):
    fundid_to_names[d['基金代碼']] = d['基金中文名稱']  # fund code -> fund name

#%%
t1 = time.time()
model_lr = LightFM(learning_rate=0.01, loss='warp')
model_lr.fit(train, epochs=10)
t2 = time.time()
print('model built (lightfm) cost :{:.1f} s'.format(t2 - t1))

train_precision = precision_at_k(model_lr, train, k=10).mean()
test_precision = precision_at_k(model_lr, test, k=10).mean()

train_recall = recall_at_k(model_lr, train, k=10).mean()
test_recall = recall_at_k(model_lr, test, k=10).mean()

train_auc = auc_score(model_lr, train).mean()
test_auc = auc_score(model_lr, test).mean()

## on test : Recall- 19.30%, Precision- 1.93%, (AUC-0.91)
print('Recall: train {:.2f}%, test {:.2f}%'.format(100 * train_recall, 100 * test_recall))
print('Precision: train {:.2f}%, test {:.2f}%.'.format(100 * train_precision, 100 * test_precision))
print('AUC: train {:.2f}, test {:.2f}.'.format(train_auc, test_auc))
def build_model(self) -> None:
    """
    Fits model for user-variant recommendations and similar variant recommendations.
    """
    if hasattr(self, 'input_file'):
        logging.info(f'Training the main model with dataset {self.input_file}...')
    else:
        logging.info('Training the model...')

    train_validation, test = train_test_split(
        self.dataset.interactions, **self.config.VALIDATION_PARAMS
    )
    train, validation = train_test_split(
        train_validation, **self.config.VALIDATION_PARAMS
    )
    logging.info(f'train: Type; {type(train)}, Shape; {train.shape}')
    logging.info(f'validation: Type; {type(validation)}, Shape; {validation.shape}')
    logging.info(f'test: Type; {type(test)}, Shape; {test.shape}')

    model = LightFM(**self.config.LIGHTFM_PARAMS)

    warp_auc: List[float] = []
    no_improvement_rounds = 0
    best_auc = 0.0
    epochs = self.config.FIT_PARAMS['epochs']
    early_stopping_rounds = self.config.FIT_PARAMS['early_stopping_rounds']

    logging.info(
        f'Training model until validation AUC has not improved in {early_stopping_rounds} epochs...'
    )
    for epoch in range(epochs):
        logging.info(f'Epoch {epoch}...')
        if no_improvement_rounds >= early_stopping_rounds:
            break

        # fit_partial resumes training from the previous round; plain fit would
        # re-initialise the model on every iteration and defeat early stopping.
        model.fit_partial(
            interactions=train,
            item_features=self.dataset.item_features,
            epochs=self.config.FIT_PARAMS['epochs_per_round'],
            num_threads=self.config.FIT_PARAMS['core_count'],
        )
        warp_auc.append(
            auc_score(
                model=model,
                test_interactions=validation,
                item_features=self.dataset.item_features,
            ).mean()
        )

        if warp_auc[-1] > best_auc:
            best_auc = warp_auc[-1]
            no_improvement_rounds = 0
        else:
            no_improvement_rounds += 1

        logging.info(f'[{epoch}]\tvalidation_warp_auc: {warp_auc[-1]}')

    self.num_epochs = len(warp_auc) - early_stopping_rounds
    logging.info('Stopping. Best Iteration:')
    logging.info(
        f'[{self.num_epochs - 1}]\tvalidation_warp_auc: {warp_auc[self.num_epochs - 1]}'
    )

    logging.info('Calculating AUC score on test set...')
    test_score = auc_score(
        model=model,
        test_interactions=test,
        item_features=self.dataset.item_features,
    ).mean()
    logging.info(f'Test Set AUC Score: {test_score}')

    self.model = model
    self.test_score = test_score
row = tbl3['UserId'].values - 1
col = tbl3['movie_id_index'].values
shape = (10000, len(movie_id))

sparse_matrix = coo_matrix((data, (row, col)), shape=shape)
print(repr(sparse_matrix))

#tbl2
#
#data2 = tbl2['IfExists'].values
#row2 = tbl2['MovieId'].values - 1
#col2 = tbl2['GenreId'].values - 1
#
#sparse_matrix2 = coo_matrix((data2, (row2, col2)), shape=(max(row2) + 1, max(col2) + 1))
#print(repr(sparse_matrix2))
#print(str(sparse_matrix2.getrow(1)))

from lightfm.datasets import fetch_movielens
from lightfm import LightFM

model = LightFM(loss='warp')
model.fit(sparse_matrix, epochs=30, num_threads=2)

n_users, n_items = sparse_matrix.shape
scores = model.predict(0, np.arange(n_items))
top_items = np.argsort(-scores)
top_items
from lightfm import LightFM

# Set the number of threads; you can increase this
# if you have more physical cores available.
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

# We pass in the train interactions to exclude them from predictions.
# This is to simulate a recommender system where we do not
# re-recommend things the user has already interacted with in the train
# set.
test_auc = auc_score(model, test,
                     train_interactions=train,
                     num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)
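# The same train-interaction exclusion applies to the ranking metrics in
# lightfm.evaluation; a brief sketch (k=10 is an arbitrary choice here):
from lightfm.evaluation import precision_at_k

# Excluding train interactions keeps already-seen items out of the top-k lists.
test_precision = precision_at_k(model, test,
                                train_interactions=train,
                                k=10,
                                num_threads=NUM_THREADS).mean()
print('Collaborative filtering test precision@10: %s' % test_precision)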
def do_fiber_training(visualization=False):
    if not os.path.isfile(rc.RECOMMENDER_TRAINING) or not os.path.isfile(rc.RECOMMENDER_MODEL):
        yarn_data_matrix = pickle.load(open(rc.YARN_DATA_MATRIX, "rb"))
        yarn_data_train = sps.coo_matrix(yarn_data_matrix[:int(len(yarn_data_matrix) * 0.5)]) > 0
        yarn_data_test = sps.coo_matrix(yarn_data_matrix[int(len(yarn_data_matrix) * 0.5):]) > 0
        if visualization:
            print(yarn_data_train.shape[0], yarn_data_test.shape[0], len(yarn_data_matrix))

        # Taken from: https://github.com/lyst/lightfm/blob/master/examples/stackexchange/hybrid_crossvalidated.ipynb
        # Set the number of threads; you can increase this
        # if you have more physical cores available.
        NUM_THREADS = 2
        NUM_COMPONENTS = 30
        NUM_EPOCHS = 3
        ITEM_ALPHA = 1e-6

        # Let's fit a WARP model: these generally have the best performance.
        model = LightFM(loss='warp',
                        item_alpha=ITEM_ALPHA,
                        no_components=NUM_COMPONENTS)

        # Run 3 epochs and time it.
        model = model.fit(yarn_data_train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

        # Compute and print the AUC score
        train_auc = auc_score(model, yarn_data_train, num_threads=NUM_THREADS).mean()
        print('Collaborative filtering train AUC: %s' % train_auc)

        # We pass in the train interactions to exclude them from predictions.
        # This is to simulate a recommender system where we do not
        # re-recommend things the user has already interacted with in the train
        # set.
        test_auc = auc_score(model, yarn_data_test,
                             train_interactions=yarn_data_train,
                             num_threads=NUM_THREADS).mean()
        print('Collaborative filtering test AUC: %s' % test_auc)

        pickle.dump(yarn_data_matrix, open(rc.RECOMMENDER_TRAINING, 'wb'))
        pickle.dump(model, open(rc.RECOMMENDER_MODEL, 'wb'))
    else:
        yarn_data_matrix = pickle.load(open(rc.RECOMMENDER_TRAINING, 'rb'))
        model = pickle.load(open(rc.RECOMMENDER_MODEL, 'rb'))

    translation_dict = pickle.load(open(rc.YARN_TRANSLATION_DATA, 'rb'))
    print(len(yarn_data_matrix))

    for matrix_id in range(len(yarn_data_matrix)):
        print(matrix_id)
        predictions = model.predict(matrix_id, yarn_data_matrix[matrix_id])
        matches = []

        predictions += abs(np.min(predictions))  # make non-negative
        _max = np.max(predictions)               # find max for normalization
        predictions /= _max                      # normalize predictions

        for prediction in range(len(predictions)):
            if predictions[prediction] > 0.9:
                matches.append([translation_dict[prediction], prediction,
                                predictions[prediction]])

        print(translation_dict[matrix_id], matches)