def test_feature_inference_fails():
    """Predicting with out-of-range ids and no feature matrices must fail.

    The model is fitted with explicit user and item feature matrices that
    have ``n_features`` columns. ``predict`` is then called without feature
    matrices and with ids equal to ``n_features``; an ``AssertionError`` is
    expected.
    """
    n_users, n_items = 10, 100
    n_features = 20

    interactions = sp.coo_matrix((n_users, n_items), dtype=np.int32)
    user_feats = sp.csr_matrix((n_users, n_features), dtype=np.int32)
    item_feats = sp.csr_matrix((n_items, n_features), dtype=np.int32)

    model = LightFM()
    model.fit_partial(interactions,
                      user_features=user_feats,
                      item_features=item_feats)

    # Feature columns are indexed 0 .. n_features - 1, so this id is one
    # past the last column seen during fitting.
    bad_user_ids = np.array([n_features], dtype=np.int32)
    bad_item_ids = np.array([n_features], dtype=np.int32)

    with pytest.raises(AssertionError):
        model.predict(bad_user_ids, bad_item_ids)
def test_input_dtypes():
    """Fit and predict with matrices of several integer and float dtypes.

    The test passes when no call raises; the scores are not inspected.
    """
    n_users, n_items = 10, 100
    n_features = 20

    for dtype in (np.int32, np.int64, np.float32, np.float64):
        interactions = sp.coo_matrix((n_users, n_items), dtype=dtype)
        user_feats = sp.coo_matrix((n_users, n_features), dtype=dtype)
        item_feats = sp.coo_matrix((n_items, n_features), dtype=dtype)

        model = LightFM()
        model.fit_partial(interactions,
                          user_features=user_feats,
                          item_features=item_feats)

        # User ids are drawn before item ids, keeping the order of the
        # random draws unchanged.
        sampled_users = np.random.randint(0, n_users, 10).astype(np.int32)
        sampled_items = np.random.randint(0, n_items, 10).astype(np.int32)

        model.predict(sampled_users,
                      sampled_items,
                      user_features=user_feats,
                      item_features=item_feats)
def test_predict(num_threads=2):
    """Check that ``predict`` agrees with itself across calling conventions.

    For every user the scores must match between: an array of user ids and
    a scalar user id; the default call and a call with ``num_threads``
    threads; and calls with ``precompute_representations=False`` in both
    threaded and single-threaded mode.
    """
    n_users, n_items = 10, 100
    interactions = sp.coo_matrix((n_users, n_items), dtype=np.int32)

    model = LightFM()
    model.fit_partial(interactions)

    for uid in range(n_users):
        baseline = model.predict(np.repeat(uid, n_items), np.arange(n_items))
        from_scalar = model.predict(uid, np.arange(n_items))
        assert np.allclose(baseline, from_scalar)

        threaded = model.predict(np.repeat(uid, n_items),
                                 np.arange(n_items),
                                 num_threads=num_threads)
        assert np.allclose(threaded, baseline)

        threaded_no_precompute = model.predict(
            np.repeat(uid, n_items),
            np.arange(n_items),
            num_threads=num_threads,
            precompute_representations=False)
        assert np.allclose(threaded, threaded_no_precompute)

        serial_no_precompute = model.predict(
            np.repeat(uid, n_items),
            np.arange(n_items),
            num_threads=1,
            precompute_representations=False)
        assert np.allclose(threaded, serial_no_precompute)
def test_feature_inference_fails():
    """Predicting with ids beyond the fitted feature count raises ValueError.

    Fitting uses explicit feature matrices with ``feature_count`` columns;
    ``predict`` is called without feature matrices and with an id equal to
    ``feature_count`` for both the user and the item.
    """
    user_count, item_count = 10, 100
    feature_count = 20

    interactions = sp.coo_matrix((user_count, item_count), dtype=np.int32)
    fitted_user_features = sp.csr_matrix((user_count, feature_count),
                                         dtype=np.int32)
    fitted_item_features = sp.csr_matrix((item_count, feature_count),
                                         dtype=np.int32)

    model = LightFM()
    model.fit_partial(interactions,
                      user_features=fitted_user_features,
                      item_features=fitted_item_features)

    # One past the last feature column supplied to fit_partial.
    out_of_range_user = np.array([feature_count], dtype=np.int32)
    out_of_range_item = np.array([feature_count], dtype=np.int32)

    with pytest.raises(ValueError):
        model.predict(out_of_range_user, out_of_range_item)
def test_input_dtypes():
    """Smoke-test fitting and predicting for each supported input dtype.

    A fresh model is built per dtype; nothing is asserted beyond the calls
    completing without an exception.
    """
    user_count, item_count = 10, 100
    feature_count = 20
    candidate_dtypes = (np.int32, np.int64, np.float32, np.float64)

    for dtype in candidate_dtypes:
        interactions = sp.coo_matrix((user_count, item_count), dtype=dtype)
        features_for_users = sp.coo_matrix((user_count, feature_count),
                                           dtype=dtype)
        features_for_items = sp.coo_matrix((item_count, feature_count),
                                           dtype=dtype)

        model = LightFM()
        model.fit_partial(interactions,
                          user_features=features_for_users,
                          item_features=features_for_items)

        # Keep the draw order: users first, items second.
        random_users = np.random.randint(0, user_count, 10).astype(np.int32)
        random_items = np.random.randint(0, item_count, 10).astype(np.int32)

        model.predict(random_users,
                      random_items,
                      user_features=features_for_users,
                      item_features=features_for_items)
def test_matrix_types():
    """Fit and predict for every combination of sparse format and dtype.

    Covers COO, LIL, CSR and CSC matrices with int32/int64/float32/float64
    data. The test passes when no call raises.
    """
    n_users, n_items = 10, 100
    n_features = 20

    sparse_formats = (sp.coo_matrix, sp.lil_matrix,
                      sp.csr_matrix, sp.csc_matrix)
    value_dtypes = (np.int32, np.int64, np.float32, np.float64)

    for make_matrix in sparse_formats:
        for dtype in value_dtypes:
            interactions = make_matrix((n_users, n_items), dtype=dtype)
            user_feats = make_matrix((n_users, n_features), dtype=dtype)
            item_feats = make_matrix((n_items, n_features), dtype=dtype)

            model = LightFM()
            model.fit_partial(interactions,
                              user_features=user_feats,
                              item_features=item_feats)

            # Users are sampled before items, as in the predict() call.
            sampled_users = np.random.randint(0, n_users, 10).astype(np.int32)
            sampled_items = np.random.randint(0, n_items, 10).astype(np.int32)

            model.predict(sampled_users,
                          sampled_items,
                          user_features=user_feats,
                          item_features=item_feats)
def test_matrix_types():
    """Smoke-test every sparse matrix class with every supported dtype.

    Each (matrix class, dtype) pair gets its own freshly constructed model;
    only the absence of exceptions is checked.
    """
    user_count, item_count = 10, 100
    feature_count = 20

    matrix_classes = (sp.coo_matrix, sp.lil_matrix,
                      sp.csr_matrix, sp.csc_matrix)
    candidate_dtypes = (np.int32, np.int64, np.float32, np.float64)

    for matrix_class in matrix_classes:
        for dtype in candidate_dtypes:
            interactions = matrix_class((user_count, item_count), dtype=dtype)
            features_for_users = matrix_class((user_count, feature_count),
                                              dtype=dtype)
            features_for_items = matrix_class((item_count, feature_count),
                                              dtype=dtype)

            model = LightFM()
            model.fit_partial(
                interactions,
                user_features=features_for_users,
                item_features=features_for_items,
            )

            # Draw order (users, then items) is kept unchanged.
            random_users = np.random.randint(
                0, user_count, 10).astype(np.int32)
            random_items = np.random.randint(
                0, item_count, 10).astype(np.int32)

            model.predict(
                random_users,
                random_items,
                user_features=features_for_users,
                item_features=features_for_items,
            )
def test_predict(num_threads=2):
    """Verify that all ``predict`` call variants return matching scores.

    Per user, compares: array user ids vs. a scalar user id, the default
    call vs. ``num_threads`` threads, and
    ``precompute_representations=False`` with ``num_threads`` threads and
    with a single thread.
    """
    user_count, item_count = 10, 100
    interactions = sp.coo_matrix((user_count, item_count), dtype=np.int32)

    model = LightFM()
    model.fit_partial(interactions)

    for uid in range(user_count):
        reference = model.predict(np.repeat(uid, item_count),
                                  np.arange(item_count))

        scalar_user = model.predict(uid, np.arange(item_count))
        assert np.allclose(reference, scalar_user)

        parallel = model.predict(np.repeat(uid, item_count),
                                 np.arange(item_count),
                                 num_threads=num_threads)
        assert np.allclose(parallel, reference)

        parallel_lazy = model.predict(np.repeat(uid, item_count),
                                      np.arange(item_count),
                                      num_threads=num_threads,
                                      precompute_representations=False)
        assert np.allclose(parallel, parallel_lazy)

        serial_lazy = model.predict(np.repeat(uid, item_count),
                                    np.arange(item_count),
                                    num_threads=1,
                                    precompute_representations=False)
        assert np.allclose(parallel, serial_lazy)
def test_movielens_accuracy_fit():
    """``fit`` for 10 epochs must exceed the train/test ROC AUC thresholds."""
    model = LightFM()
    model.fit(train, epochs=10)

    train_scores = model.predict(train.row, train.col)
    test_scores = model.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.76
def test_movielens_accuracy():
    """A seeded model trained with ``fit_partial`` must reach the AUC floor."""
    model = LightFM(random_state=SEED)
    model.fit_partial(train, epochs=10)

    train_scores = model.predict(train.row, train.col)
    test_scores = model.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.76
def test_hogwild_accuracy():
    """Training and predicting with 2 threads should keep accuracy comparable.

    Uses the same AUC thresholds as the other accuracy tests in this file.
    """
    thread_count = 2

    model = LightFM(random_state=SEED)
    model.fit_partial(train, epochs=10, num_threads=thread_count)

    train_scores = model.predict(train.row, train.col,
                                 num_threads=thread_count)
    test_scores = model.predict(test.row, test.col,
                                num_threads=thread_count)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.76
def test_movielens_excessive_regularization():
    """Very strong L2 penalties (alpha = 1.0) should give poor AUC (< 0.6)."""
    model = LightFM(no_components=10, item_alpha=1.0, user_alpha=1.0)
    model.fit_partial(train, epochs=10)

    train_scores = model.predict(train.row, train.col)
    test_scores = model.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc < 0.6

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc < 0.6
def test_regularization():
    """A 50-component model with light regularization must stay accurate.

    Trained for 30 epochs with ``item_alpha`` and ``user_alpha`` of 0.0001.
    """
    model = LightFM(no_components=50, item_alpha=0.0001, user_alpha=0.0001)
    model.fit_partial(train, epochs=30)

    train_scores = model.predict(train.row, train.col)
    test_scores = model.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.80

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.75
def test_movielens_accuracy_pickle():
    """A fitted model must keep its accuracy after a pickle round trip."""
    fitted = LightFM(random_state=SEED)
    fitted.fit(train, epochs=10)

    # Serialize and restore; all predictions below use the restored copy.
    serialized = pickle.dumps(fitted)
    model = pickle.loads(serialized)

    train_scores = model.predict(train.row, train.col)
    test_scores = model.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.76
def test_movielens_accuracy_resume():
    """Ten single-epoch ``fit_partial`` calls must reach the AUC thresholds."""
    model = LightFM()

    resumed_rounds = 10
    for _ in range(resumed_rounds):
        model.fit_partial(train, epochs=1)

    train_scores = model.predict(train.row, train.col)
    test_scores = model.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.76
def test_zeros_negative_accuracy():
    """Accuracy must be unchanged when negatives are encoded as 0, not -1.

    The relabelling is done on a copy of the module-level ``train`` matrix.
    Rewriting ``train.data`` in place would leak the 0-encoded labels into
    every test that runs afterwards and make results depend on test order.
    """
    zeroed_train = train.copy()
    zeroed_train.data[zeroed_train.data == -1] = 0

    model = LightFM(random_state=SEED)
    model.fit_partial(zeroed_train, epochs=10)

    train_predictions = model.predict(zeroed_train.row, zeroed_train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(zeroed_train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_predict():
    """Scalar and array user ids must give the same ``predict`` scores."""
    n_users, n_items = 10, 100
    interactions = sp.coo_matrix((n_users, n_items), dtype=np.int32)

    model = LightFM()
    model.fit_partial(interactions)

    for uid in range(n_users):
        from_array = model.predict(np.repeat(uid, n_items),
                                   np.arange(n_items))
        from_scalar = model.predict(uid, np.arange(n_items))

        assert np.allclose(from_array, from_scalar)
def test_movielens_accuracy_fit():
    """A seeded model trained with ``fit`` must exceed the AUC thresholds."""
    model = LightFM(random_state=SEED)
    model.fit(train, epochs=10)

    predicted_train = model.predict(train.row, train.col)
    predicted_test = model.predict(test.row, test.col)

    auc_on_train = roc_auc_score(train.data, predicted_train)
    assert auc_on_train > 0.84

    auc_on_test = roc_auc_score(test.data, predicted_test)
    assert auc_on_test > 0.76
def test_movielens_accuracy():
    """Ten epochs of ``fit_partial`` must exceed the AUC thresholds."""
    model = LightFM()
    model.fit_partial(train, epochs=10)

    predicted_train = model.predict(train.row, train.col)
    predicted_test = model.predict(test.row, test.col)

    auc_on_train = roc_auc_score(train.data, predicted_train)
    assert auc_on_train > 0.84

    auc_on_test = roc_auc_score(test.data, predicted_test)
    assert auc_on_test > 0.76
def test_overfitting():
    """An unregularized 50-component model should overfit heavily.

    After 30 epochs the train AUC must be above 0.99 while the test AUC
    stays below 0.75.
    """
    model = LightFM(no_components=50, random_state=SEED)
    model.fit_partial(train, epochs=30)

    train_scores = model.predict(train.row, train.col)
    test_scores = model.predict(test.row, test.col)

    # Both AUCs are computed before either assertion runs.
    train_auc = roc_auc_score(train.data, train_scores)
    test_auc = roc_auc_score(test.data, test_scores)

    assert train_auc > 0.99
    assert test_auc < 0.75
def test_predict():
    """``predict`` must accept a scalar user id and match the array form."""
    user_count, item_count = 10, 100
    interactions = sp.coo_matrix((user_count, item_count), dtype=np.int32)

    model = LightFM()
    model.fit_partial(interactions)

    for uid in range(user_count):
        array_form = model.predict(np.repeat(uid, item_count),
                                   np.arange(item_count))
        scalar_form = model.predict(uid, np.arange(item_count))

        assert np.allclose(array_form, scalar_form)
class LightFM_Recommender:
    """Top-N recommender backed by a LightFM model that uses item features.

    Attributes:
        train: user-item interaction matrix. ``filter_seen`` reads its
            ``indptr`` and ``indices`` arrays (presumably CSR - TODO
            confirm against callers).
        icm: item feature matrix, passed to LightFM as ``item_features``.
        model: the underlying ``LightFM`` instance.
        pid_array: int32 array ``0 .. train.shape[1] - 1`` holding every
            item id.
    """

    def __init__(self, train, icm, no_components=10, k=5, n=10,
                 item_alpha=0.0, user_alpha=0.0, loss='warp',
                 learning_rate=0.05, rho=0.95, epsilon=1e-6,
                 max_sampled=10, learning_schedule='adagrad'):
        """Store the data and build an unfitted LightFM model.

        All hyperparameters are forwarded unchanged to ``LightFM``.
        """
        self.train = train
        self.icm = icm
        self.model = LightFM(loss=loss,
                             k=k,
                             n=n,
                             item_alpha=item_alpha,
                             user_alpha=user_alpha,
                             no_components=no_components,
                             learning_rate=learning_rate,
                             rho=rho,
                             epsilon=epsilon,
                             max_sampled=max_sampled,
                             learning_schedule=learning_schedule)
        self.pid_array = np.arange(train.shape[1], dtype=np.int32)

    def fit(self, epochs):
        """Fit the model on ``self.train`` with ``self.icm`` as item features."""
        self.model.fit(epochs=epochs,
                       interactions=self.train,
                       item_features=self.icm,
                       verbose=True)

    def filter_seen(self, user_id, scores):
        """Push the scores of items the user already interacted with down.

        ``scores`` is modified in place and also returned.
        """
        start_pos = int(self.train.indptr[user_id])
        end_pos = int(self.train.indptr[user_id + 1])
        user_profile = self.train.indices[start_pos:end_pos]

        # A large negative constant (rather than -inf) ranks seen items last.
        scores[user_profile] = -1000000
        return scores

    def scores(self, user_id):
        """Return the predicted score of every item for ``user_id``."""
        return self.model.predict(user_id, self.pid_array,
                                  item_features=self.icm)

    def recommend(self, user_id, at=10):
        """Return the ids of the ``at`` best-scoring unseen items."""
        scores = self.filter_seen(user_id, self.scores(user_id))

        # Rank items by descending score.
        ranking = scores.argsort()[::-1]
        return ranking[:at]

    def recommendALL(self, userList, at=10):
        """Recommend for every entry of ``userList``.

        Each entry is a sequence whose first element is the user id. The
        matching output row is that entry followed by the recommended ids.

        Returns:
            An empty 1-D array when ``userList`` is empty, a single 1-D row
            when it has one entry, and a 2-D array (one row per entry)
            otherwise. These shapes are kept for backward compatibility.
        """
        # Collect rows first and stack once; stacking inside the loop
        # re-copies the whole result on every iteration.
        rows = [np.concatenate((entry, self.recommend(entry[0], at)))
                for entry in userList]

        if not rows:
            return np.array([])
        if len(rows) == 1:
            return rows[0]
        return np.vstack(rows)
def test_movielens_accuracy_resume():
    """Resuming training epoch by epoch must reach the usual accuracy.

    A seeded model is trained with ten separate one-epoch ``fit_partial``
    calls before its train and test ROC AUC are checked.
    """
    model = LightFM(random_state=SEED)

    epochs_done = 0
    while epochs_done < 10:
        model.fit_partial(train, epochs=1)
        epochs_done += 1

    predicted_train = model.predict(train.row, train.col)
    predicted_test = model.predict(test.row, test.col)

    auc_on_train = roc_auc_score(train.data, predicted_train)
    assert auc_on_train > 0.84

    auc_on_test = roc_auc_score(test.data, predicted_test)
    assert auc_on_test > 0.76
def test_predict_not_fitted():
    """Every prediction/representation accessor must reject an unfitted model.

    Each call is expected to raise ``ValueError``.
    """
    model = LightFM()

    # Wrapped in lambdas so each call runs inside its own raises() block,
    # in the same order as before.
    calls_on_unfitted_model = (
        lambda: model.predict(np.arange(10), np.arange(10)),
        lambda: model.predict_rank(1),
        lambda: model.get_user_representations(),
        lambda: model.get_item_representations(),
    )

    for call in calls_on_unfitted_model:
        with pytest.raises(ValueError):
            call()
def test_movielens_accuracy_pickle():
    """Pickling and unpickling a fitted model must not reduce accuracy."""
    original = LightFM()
    original.fit(train, epochs=10)

    # Round-trip through pickle; only the restored model is evaluated.
    payload = pickle.dumps(original)
    model = pickle.loads(payload)

    predicted_train = model.predict(train.row, train.col)
    predicted_test = model.predict(test.row, test.col)

    auc_on_train = roc_auc_score(train.data, predicted_train)
    assert auc_on_train > 0.84

    auc_on_test = roc_auc_score(test.data, predicted_test)
    assert auc_on_test > 0.76
def peuimportelenom():
    """Handle the artist-selection form and render ten suggested artists.

    The selected artist names are read from the ``dblst_artists`` form
    field and written into the module-level ``vecteur`` as a new user
    profile. That profile is appended as the last row of the ratings
    matrix, a WARP LightFM model is trained, and the ten best-scoring
    artists for the new user are passed to ``page.html``.

    NOTE(review): ``vecteur`` is module-level state mutated on every
    request, so selections presumably accumulate across requests - confirm
    whether it should be reset or copied per request.
    """
    noms = request.form.getlist("dblst_artists")

    for el in noms:
        artiste = ap[ap.name == el]
        artist_ids = list(artiste.artistID)
        if not artist_ids:
            # Unknown artist name in the form: nothing to record for it.
            continue
        lind = artist_ids[0] - 1  # artist ids are 1-based, columns 0-based
        vecteur[lind] = artiste.playCountScaled.median()

    # Build the matrix: existing ratings plus the new user as the last row.
    X = np.vstack((ratings, vecteur))

    # Same pipeline as the Jupyter notebook.
    n_users, n_items = X.shape
    Xcsr = csr_matrix(X)
    Xcoo = Xcsr.tocoo()

    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(
        zip(Xcoo.row, Xcoo.col, Xcoo.data))
    train, test = random_train_test_split(interactions)

    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)

    # Score every item for the user appended above. Passing `vecteur`
    # (rating values) as item ids, and user 0, scored the wrong pairs.
    new_user_id = n_users - 1
    scores = model.predict(new_user_id, np.arange(n_items))

    top_items = ap["name"].unique()[np.argsort(-scores)]
    sugg = top_items[:10]

    return render_template("page.html",
                           artist_names=artist_names,
                           noms=noms,
                           sugg=sugg)
def recommendSOAnswers(i_train, i_test, i_user_graph, i_item_graph, n_users, n_items, n_tags):
    """Train a logistic LightFM model and print its ranking precision.

    Interactions, user features, item features and the test pairs are
    loaded through the ``load*`` helpers. The model is fitted for 5 epochs
    on 10 threads, the test pairs are scored, and the label ranking
    average precision of those scores is printed. Nothing is returned.
    """
    train_interactions = loadInteractions(i_train, n_users, n_items)
    user_feature_matrix = loadUserFeatures(i_user_graph, n_users)
    item_feature_matrix = loadItemFeatures(i_item_graph, n_items, n_tags)
    test_users, test_items, labels = loadTest(i_test)

    model = LightFM(learning_rate=0.05, loss='logistic')
    model.fit(train_interactions,
              user_features=user_feature_matrix,
              item_features=item_feature_matrix,
              epochs=5,
              verbose=True,
              num_threads=10)

    predictions = model.predict(test_users,
                                test_items,
                                item_features=item_feature_matrix,
                                user_features=user_feature_matrix,
                                num_threads=10)

    # Wrap both vectors in an outer list so each becomes a single row.
    y_score = np.array([predictions])
    y_true = np.array([labels])

    print(predictions)
    print(len(y_score))
    print(len(y_true))

    precision = label_ranking_average_precision_score(y_true, y_score)
    print(precision)
class Warp(RecSys):
    """LightFM recommender using the WARP loss and the adagrad schedule."""

    def __init__(self, NUM_TRACKS, no_components=10, learning_rate=0.05, epochs=1):
        """Store the hyperparameters and build the unfitted LightFM model."""
        super().__init__()

        self.NUM_TRACKS = NUM_TRACKS
        self.no_components = no_components
        self.learning_rate = learning_rate
        self.epochs = epochs

        self.model = LightFM(no_components=self.no_components,
                             learning_schedule='adagrad',
                             loss='warp',
                             learning_rate=self.learning_rate)

    def get_scores(self, dataset, targets):
        """Fit on ``dataset`` and return sparse top-1000 scores per target.

        For each target the scores of all ``NUM_TRACKS`` tracks are
        predicted; every score outside the 1000 largest is set to 0.

        Returns:
            A float32 CSR matrix with one row per target and
            ``dataset.shape[1]`` columns.
        """
        self.model.fit(interactions=dataset,
                       epochs=self.epochs,
                       num_threads=mp.cpu_count(),
                       verbose=True)

        score_matrix = np.empty((len(targets), dataset.shape[1]),
                                dtype=np.float32)
        track_ids = list(range(self.NUM_TRACKS))

        for row_idx, target in enumerate(targets):
            predicted = self.model.predict(target, track_ids)

            # argpartition puts the 1000 largest scores in the last 1000
            # slots; every index before them is zeroed.
            below_top = np.argpartition(predicted, -1000)[:-1000]
            predicted[below_top] = 0

            score_matrix[row_idx] = predicted

        return sparse.csr_matrix(score_matrix, dtype=np.float32)
def get_recommendations(users_ids):
    """Print the top-3 artist recommendations per user for three losses.

    A fresh LightFM model is trained on ``full_data['coo_matrix']`` for
    each of the 'warp', 'bpr' and 'warp-kos' losses. For every user the
    three best-scoring item ids are mapped to artist names through
    ``full_data['artists']`` and printed. Nothing is returned.
    """
    results = dict()
    n_items = full_data['coo_matrix'].shape[1]

    for loss in ('warp', 'bpr', 'warp-kos'):
        model = LightFM(loss=loss)

        # The dataset is passed over `epochs` times. num_threads is the
        # degree of parallelism and should not exceed the number of
        # physical cores.
        model.fit(full_data['coo_matrix'], epochs=10, num_threads=2)

        print('********* With {} algorithm *********\n'.format(loss))

        for user in users_ids:
            scores = model.predict(user, np.arange(n_items))
            best_three = np.argsort(-scores)[:3]

            print('Recommendations for user {}:'.format(user))
            for item_id in best_three.tolist():
                # Print the name of every artist whose id matches.
                for artist_info in full_data['artists'].values():
                    if int(item_id) == artist_info['id']:
                        print(' - {}'.format(artist_info['name']))

            # Blank separator between users.
            print('\n')
# Build a ranked list of recommended product codes, leaving out up to five
# items the user is already known to have, and return it together with those
# known product codes.
#
# Inputs read by the code:
#   - dataset.mapping()[2]: its keys become the 'product_code' column and its
#     values become the index (item ids) of a lookup DataFrame.
#   - raw_data: rows are selected where the `cac` column equals
#     'cac_' + str(user_id); the first five 'product_code' values are the
#     user's known items.
#   - model.predict(...) scores are sorted in descending order to rank items.
#
# NOTE(review): model.predict is given the whole `user_ids` collection rather
#   than the loop variable `user_id` - this looks unintended; confirm.
# NOTE(review): only item ids 0 .. recommendations_num - 1 are scored
#   (np.arange(recommendations_num)), not the full catalogue - confirm that
#   this is the intent.
# NOTE(review): `dataset` is annotated as pd.DataFrame but `.mapping()` is
#   called on it; presumably it is a lightfm Dataset - confirm.
# NOTE(review): the formatting of this definition is collapsed, so whether
#   `return` sits inside the per-user loop cannot be determined from here.
def sample_recommendation( model: LightFM, dataset: pd.DataFrame, raw_data: pd.DataFrame, item_features, user_ids, recommendations_num: int = 10) -> Tuple[List[str], List[str]]: for user_id in user_ids: # Retrieve the item's IDs items_map = [item_id for item_id in dataset.mapping()[2].values()] # Retrieve the product_code for each item ID items_names = [item_id for item_id in dataset.mapping()[2].keys()] # Construct a dataframe with product_codes and item ID as index items = pd.DataFrame(items_names, index=items_map) items.columns = ['product_code'] # Retrieve the known items known_items = raw_data[raw_data.cac == 'cac_' + str(user_id)]['product_code'][:5].values known_item_ids = items[items['product_code'].isin( known_items)].index.tolist() # Predict items scores = model.predict(user_ids, np.arange(recommendations_num), item_features=item_features) i_idx = [x for x in np.argsort(-scores)] # Remove known items i_idx = [x for x in i_idx if x not in known_item_ids] top_items = items[~items['product_code'].isin(known_items)].loc[i_idx] return top_items['product_code'].values.tolist(), known_items.tolist()
def test_get_representations():
    """Scores rebuilt from the representations must match ``predict``.

    Checked twice: without feature matrices, and with identity-plus-random
    item and user feature matrices. In both cases the dot product of the
    latent vectors plus both biases must equal the ``predict`` output, and
    the latent matrices must be float32.
    """
    model = LightFM(random_state=SEED)
    model.fit_partial(train, epochs=10)

    num_users, num_items = train.shape

    # Item features are built before user features, preserving the order
    # of the random draws.
    random_item_features = (sp.identity(num_items)
                            + sp.random(num_items, num_items))
    random_user_features = (sp.identity(num_users)
                            + sp.random(num_users, num_users))
    feature_cases = (
        (None, None),
        (random_item_features, random_user_features),
    )

    for item_features, user_features in feature_cases:
        expected = model.predict(test.row,
                                 test.col,
                                 user_features=user_features,
                                 item_features=item_features)

        item_biases, item_latent = model.get_item_representations(
            item_features)
        user_biases, user_latent = model.get_user_representations(
            user_features)

        assert item_latent.dtype == np.float32
        assert user_latent.dtype == np.float32

        dot_products = (user_latent[test.row]
                        * item_latent[test.col]).sum(axis=1)
        rebuilt = (dot_products
                   + user_biases[test.row]
                   + item_biases[test.col])

        assert np.allclose(expected, rebuilt, atol=0.000001)
def func():
    """Train a BPR LightFM model on all stored ratings and pickle it.

    A dense user x film matrix is built with one row per ``User`` and one
    column per ``Film``. Each cell holds the integer rating, or 0 when the
    user has not rated the film. The fitted model is written to
    ``model.p`` in the current working directory.

    NOTE(review): one ``Rating`` query is issued per (user, film) pair,
    which will be slow on large tables - consider loading ratings once.
    """
    films = list(Film.objects.all())
    users = list(User.objects.all())

    interactions_matrix = []
    for user in users:
        rating_of_user = []
        for film in films:
            rating = Rating.objects.filter(user_id=user.id,
                                           movie_id=film.movie_id)
            if rating:
                rating_of_user.append(int(rating[0].rating))
            else:
                rating_of_user.append(0)

        print(rating_of_user[:20])
        interactions_matrix.append(rating_of_user)

    interactions_matrix = coo_matrix(interactions_matrix)

    model = LightFM(learning_rate=0.02, loss='bpr')
    model.fit(interactions_matrix, epochs=10)

    print(model.predict(np.int32([4, 5, 6]), np.int32([0, 1, 2])))

    # Context manager guarantees the file is flushed and closed.
    with open("model.p", "wb") as model_file:
        pickle.dump(model, model_file)
def test_movielens_excessive_regularization():
    """Heavy regularization should push both AUC values below 0.6.

    Uses 10 components with ``item_alpha`` and ``user_alpha`` of 1.0,
    trained for 10 epochs.
    """
    model = LightFM(no_components=10, item_alpha=1.0, user_alpha=1.0)
    model.fit_partial(train, epochs=10)

    predicted_train = model.predict(train.row, train.col)
    predicted_test = model.predict(test.row, test.col)

    auc_on_train = roc_auc_score(train.data, predicted_train)
    assert auc_on_train < 0.6

    auc_on_test = roc_auc_score(test.data, predicted_test)
    assert auc_on_test < 0.6
def test_zeros_negative_accuracy():
    """Accuracy must be unchanged when negatives are encoded as 0, not -1.

    The relabelling is done on a copy of the module-level ``train`` matrix.
    Rewriting ``train.data`` in place would leak the 0-encoded labels into
    every test that runs afterwards and make results depend on test order.
    """
    zeroed_train = train.copy()
    zeroed_train.data[zeroed_train.data == -1] = 0

    model = LightFM()
    model.fit_partial(zeroed_train, epochs=10)

    train_predictions = model.predict(zeroed_train.row, zeroed_train.col)
    test_predictions = model.predict(test.row, test.col)

    assert roc_auc_score(zeroed_train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_regularization():
    """Light regularization must keep train AUC > 0.80 and test AUC > 0.75.

    The model has 50 components and is trained for 30 epochs with both
    alphas set to 0.0001.
    """
    model = LightFM(no_components=50, item_alpha=0.0001, user_alpha=0.0001)
    model.fit_partial(train, epochs=30)

    predicted_train = model.predict(train.row, train.col)
    predicted_test = model.predict(test.row, test.col)

    auc_on_train = roc_auc_score(train.data, predicted_train)
    assert auc_on_train > 0.80

    auc_on_test = roc_auc_score(test.data, predicted_test)
    assert auc_on_test > 0.75
def test_overfitting():
    """Without regularization, 50 components and 30 epochs should overfit.

    Expects a train AUC above 0.99 together with a test AUC below 0.75.
    """
    model = LightFM(no_components=50)
    model.fit_partial(train, epochs=30)

    predicted_train = model.predict(train.row, train.col)
    predicted_test = model.predict(test.row, test.col)

    # Compute both metrics first, then assert on each.
    auc_on_train = roc_auc_score(train.data, predicted_train)
    auc_on_test = roc_auc_score(test.data, predicted_test)

    assert auc_on_train > 0.99
    assert auc_on_test < 0.75
def pred_i(df, user_id):
    """Rank all workouts for one user with a freshly trained WARP model.

    Takes the data dictionary and an external user id. A LightFM model is
    fitted on ``df['all_ui_matrix']`` and every workout is scored for the
    user.

    Returns:
        A pair ``(external_workout_ids, scores)``, both ordered by
        descending score.

    Note: this function is deployed to the web application.
    """
    model = LightFM(loss='warp')
    model.fit(df['all_ui_matrix'])

    # Internal item indices 0 .. (number of distinct workout ids) - 1.
    workout_count = df['user_item_interactions']['workout_id'].nunique()
    workout_ids = np.asarray(list(range(workout_count)))

    # LightFM works on internal indices, so translate the user id first.
    internal_user = get_internal_user_id(df['user_map'], user_id)
    scores = model.predict(internal_user, workout_ids)

    # Internal indices ordered by descending score, with matching scores.
    ranked_internal = np.argsort(-scores)
    scores_ranked = scores[ranked_internal]

    # Translate the ranked internal indices back to external workout ids.
    external_indices_ranked = []
    for internal_idx in ranked_internal:
        external_indices_ranked.append(
            get_external_workout_id(df['item_map'], internal_idx))

    return external_indices_ranked, scores_ranked
def test_zero_weights_accuracy():
    """All-zero sample weights should leave accuracy at chance level.

    For the logistic, BPR and WARP losses, both train and test ROC AUC
    must stay strictly between 0.45 and 0.55.
    """
    zero_weights = train.copy()
    zero_weights.data = np.zeros(train.getnnz(), dtype=np.float32)

    for loss in ('logistic', 'bpr', 'warp'):
        model = LightFM(loss=loss, random_state=SEED)
        model.fit_partial(train, sample_weight=zero_weights, epochs=10)

        train_scores = model.predict(train.row, train.col)
        test_scores = model.predict(test.row, test.col)

        train_auc = roc_auc_score(train.data, train_scores)
        assert 0.45 < train_auc < 0.55

        test_auc = roc_auc_score(test.data, test_scores)
        assert 0.45 < test_auc < 0.55
def fit_model_and_create_predictions():
    """Fit a WARP model on the user/category matrix and store predictions.

    For every user, the categories the user never bought are scored and
    the result of ``get_top_100`` on those scores is saved through
    ``save_user_prediction``.
    """
    model = LightFM(loss='warp')

    users = get_users()
    usr_cat_matrix = create_item_matrix(users)
    model.fit(usr_cat_matrix, epochs=30, num_threads=2)

    for user in users:
        unbought_categories = u_never_bought_cats(user, usr_cat_matrix)
        category_scores = model.predict(user, unbought_categories)
        top_predictions = get_top_100(category_scores)
        save_user_prediction(top_predictions)
def get_recommendations(user_id, artist_name, n_items, X):
    """Return the ten best-scoring artists for ``user_id``.

    A BPR LightFM model (seed 42) is trained on ``X`` for 10 epochs, items
    ``0 .. n_items - 1`` are scored, and ``artist_name`` is indexed by the
    item ids sorted by descending score.
    """
    model = LightFM(learning_rate=0.05, loss='bpr', random_state=42)
    model.fit(X, epochs=10, num_threads=2)

    item_scores = model.predict(user_id, np.arange(n_items))

    # Negating the scores makes argsort return a best-first order.
    best_first = np.argsort(-item_scores)
    ranked_artists = artist_name[best_first]

    return ranked_artists[:10]
def test_hogwild_accuracy():
    """Two-threaded training and prediction should reach the usual accuracy."""
    worker_threads = 2

    model = LightFM()
    model.fit_partial(train, epochs=10, num_threads=worker_threads)

    predicted_train = model.predict(train.row, train.col,
                                    num_threads=worker_threads)
    predicted_test = model.predict(test.row, test.col,
                                   num_threads=worker_threads)

    auc_on_train = roc_auc_score(train.data, predicted_train)
    assert auc_on_train > 0.84

    auc_on_test = roc_auc_score(test.data, predicted_test)
    assert auc_on_test > 0.76
class LightFMRecommender(Recommender):
    """Pure LightFM collaborative filtering recommender (WARP loss).

    Usage is two-step: ``fit`` only records the hyperparameters, and
    ``train`` builds and fits the model. ``self.URM_train`` and
    ``self.n_tracks`` are not set here; presumably the ``Recommender``
    base class provides them - TODO confirm.
    """

    N_CONFIG = 0

    def __init__(self, train, test, validation, targets, subfolder="../", log_filename='lightfmcf_config.txt'):
        """Forward all arguments to the base class and set the config label."""
        super(LightFMRecommender, self).__init__(
            train, test, validation, targets, subfolder, log_filename)
        self.configuration_txt = "PURE LIGHTFM COLLABORATIVE FILTERING"

    def fit(self, item_alpha=1e-5, user_alpha=1e-4, learning_schedule='adadelta', num_components=250, epochs=30, threads=2):
        """Record the hyperparameters; no training happens here."""
        self.epochs = epochs
        self.threads = threads
        self.num_components = num_components
        self.learning_schedule = learning_schedule
        self.item_alpha = item_alpha
        self.user_alpha = user_alpha

    def train(self, verbose=True):
        """Build a WARP LightFM model and fit it on ``self.URM_train``.

        Uses the hyperparameters stored by ``fit``. When ``verbose`` is
        true, a start message and the elapsed fitting time are printed.
        """
        began_at = time.time()
        if verbose:
            print("LightFM training started!")

        # WARP loss, configured from the stored hyperparameters.
        unfitted = LightFM(loss='warp',
                           item_alpha=self.item_alpha,
                           user_alpha=self.user_alpha,
                           learning_schedule=self.learning_schedule,
                           no_components=self.num_components)
        self.model = unfitted

        # Train for the configured number of epochs and keep the result.
        self.model = self.model.fit(self.URM_train,
                                    epochs=self.epochs,
                                    num_threads=self.threads)

        if verbose:
            elapsed = time.time() - began_at
            print("LightFM training model fitted in {:.2f} seconds".format(
                elapsed))

    def compute_predicted_ratings(self, playlist_id):
        """Return the predicted score of every track for ``playlist_id``."""
        all_tracks = np.arange(self.n_tracks)
        return self.model.predict(user_ids=playlist_id,
                                  item_ids=all_tracks,
                                  item_features=None,
                                  user_features=None,
                                  num_threads=self.threads)
def fit_lightfm_model():
    """Fit a WARP LightFM model and score every test coupon for every user.

    Reads the interaction and feature matrices and the user/coupon lists
    from ``../Data/Data_translated/``, trains the model, predicts a score
    for each (user, test coupon) pair, min-max scales the scores per user
    and pickles the result to ``d_pred_lightfm.pickle``.

    Returns:
        d_user_pred: dict, key = user id (str), value = scaled scores of
            the coupons in ``list_coupon`` order.
        list_user: array of user ids.
        list_coupon: array of test coupon ids.
    """
    # Load data
    Mui_train = spi.mmread("../Data/Data_translated/biclass_user_item_train_mtrx.mtx")
    uf = spi.mmread("../Data/Data_translated/user_feat_mtrx.mtx")
    itrf = spi.mmread("../Data/Data_translated/train_item_feat_mtrx.mtx")
    itef = spi.mmread("../Data/Data_translated/test_item_feat_mtrx.mtx")

    # Print shapes as a check. The call form with a single pre-formatted
    # string behaves the same under Python 2 and Python 3.
    print("user_features shape: %s,\nitem train features shape: %s,\nitem test features shape: %s" % (uf.shape, itrf.shape, itef.shape))

    # Load test coupon and user lists
    cplte = pd.read_csv("../Data/Data_translated/coupon_list_test_translated.csv")
    ulist = pd.read_csv("../Data/Data_translated/user_list_translated.csv")
    list_coupon = cplte["COUPON_ID_hash"].values
    list_user = ulist["USER_ID_hash"].values

    # Build model
    no_comp, lr, ep = 10, 0.01, 5
    model = LightFM(no_components=no_comp, learning_rate=lr, loss='warp')
    model.fit_partial(Mui_train,
                      user_features=uf,
                      item_features=itrf,
                      epochs=ep,
                      num_threads=4,
                      verbose=True)

    # Only the dimensions are needed, so no placeholder matrix is built.
    no_users, no_items = len(list_user), len(list_coupon)
    pid_array = np.arange(no_items, dtype=np.int32)

    # Create and initialise dict to store predictions
    d_user_pred = {}
    for user in list_user:
        d_user_pred[user] = []

    # Loop over users and compute predictions
    for user_id in range(no_users):
        sys.stdout.write("\rProcessing user " + str(user_id) + "/ " + str(len(list_user)))
        sys.stdout.flush()

        uid_array = np.empty(no_items, dtype=np.int32)
        uid_array.fill(user_id)
        predictions = model.predict(uid_array, pid_array,
                                    user_features=uf,
                                    item_features=itef,
                                    num_threads=4)
        user = str(list_user[user_id])

        # Apply MinMaxScaler for blending later on.
        # NOTE(review): newer scikit-learn versions presumably reject 1-D
        # input here - confirm against the pinned version.
        MMS = MinMaxScaler()
        pred = MMS.fit_transform(np.ravel(predictions))
        d_user_pred[user] = pred

    # Pickle the predictions for future use. Pickle output is bytes, so the
    # file must be opened in binary mode.
    d_pred = {"list_coupon": list_coupon.tolist(), "d_user_pred": d_user_pred}
    with open("../Data/Data_translated/d_pred_lightfm.pickle", "wb") as f:
        pickle.dump(d_pred, f, protocol=pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon
class RecSys(object):
    """Recommender built on a LightFM model with the BPR loss.

    External user and item identifiers are mapped to consecutive integer
    indices on first sight; the mappings persist for the lifetime of the
    instance.

    NOTE(review): ``predict`` assigns a new index to any user or item not
    seen during ``fit``. Such indices presumably fall outside what the
    fitted model knows - confirm how callers handle unseen ids.
    """

    def __init__(self,
                 user_column_name: str = "user",
                 item_column_name: str = "item",
                 rating_column_name: str = "rating"):
        """Store the column names and create the unfitted model."""
        self.__user = user_column_name
        self.__item = item_column_name
        self.__rating = rating_column_name
        self.__model = LightFM(learning_rate=0.05, loss='bpr')
        self.__users = {}
        self.__current_user_num = 0
        self.__items = {}
        self.__current_item_num = 0

    def get_user(self, user):
        """Return the integer index of ``user``, assigning one if new."""
        if user not in self.__users:
            self.__users[user] = self.__current_user_num
            self.__current_user_num += 1
        return self.__users[user]

    def get_item(self, item):
        """Return the integer index of ``item``, assigning one if new."""
        if item not in self.__items:
            self.__items[item] = self.__current_item_num
            self.__current_item_num += 1
        return self.__items[item]

    def __df_to_sparsematrix(
            self, df: pandas.DataFrame) -> scipy.sparse.coo_matrix:
        """Convert a ratings frame to an int32 COO matrix (users x items).

        The shape is the number of distinct users by the number of
        distinct items in ``df``; each cell holds the rating of that pair.
        """
        n_users = df[self.__user].value_counts().shape[0]
        n_items = df[self.__item].value_counts().shape[0]
        sparsematrix = scipy.sparse.dok_matrix((n_users, n_items),
                                               dtype=numpy.int32)

        # itertuples yields plain positional tuples, avoiding positional
        # integer indexing into a label-indexed row Series.
        columns = [self.__user, self.__item, self.__rating]
        for user, item, rating in df[columns].itertuples(index=False,
                                                         name=None):
            sparsematrix[self.get_user(user), self.get_item(item)] = rating

        return sparsematrix.tocoo(copy=True)

    def fit(self, df: pandas.DataFrame):
        """Fit the model for 20 epochs on the ratings in ``df``."""
        self.__model.fit(self.__df_to_sparsematrix(df), epochs=20)

    def predict(self,
                users: numpy.ndarray,
                items: numpy.ndarray,
                num_threads: int = 1) -> numpy.ndarray:
        """Score (user, item) pairs given by their external identifiers."""
        user_indices = numpy.array([self.get_user(x) for x in users])
        item_indices = numpy.array([self.get_item(x) for x in items])
        return self.__model.predict(user_indices,
                                    item_indices,
                                    num_threads=num_threads)
class LightFMRecommender(object):
    """Playlist/track recommender backed by a LightFM model."""

    def __init__(self, n_comp=30, loss='warp-kos', learning='adagrad', alpha=1e-3):
        """Build the unfitted LightFM model.

        Args:
            n_comp: number of latent components.
            loss: LightFM loss name.
            learning: LightFM learning schedule name.
            alpha: L2 penalty applied to both user and item features.

        The arguments are now forwarded to LightFM instead of being
        ignored. The defaults equal the values that were hard-coded
        before, so default construction is unchanged.
        """
        self.model = LightFM(no_components=n_comp,
                             loss=loss,
                             learning_schedule=learning,
                             user_alpha=alpha,
                             item_alpha=alpha)

    def fit(self, urm, epochs=100):
        """Train on ``urm`` for ``epochs`` epochs, one epoch per call.

        ``urm`` is kept for later filtering of seen tracks; its column
        count becomes the number of tracks.
        """
        self.urm = urm
        self.n_tracks = urm.shape[1]
        for _ in range(epochs):
            self.model.fit_partial(urm.getCSR(), epochs=1)

    def get_pred_row(self, user_id):
        """Return the predicted score of every track for ``user_id``."""
        return self.model.predict(user_id, np.arange(self.n_tracks))

    def s_recommend(self, user_id, nRec=10):
        """Return up to ``nRec`` unseen track ids, best score first."""
        scores = self.get_pred_row(user_id)
        top_items = np.argsort(-scores)
        recommended_items = self._filter_seen(user_id, top_items)
        return recommended_items[0:nRec]

    def _filter_seen(self, user_id, ranking):
        """Drop tracks already in the user's playlist, keeping the order."""
        seen = self.urm.extractTracksFromPlaylist(user_id)
        unseen_mask = np.in1d(ranking, seen, assume_unique=True, invert=True)
        return ranking[unseen_mask]

    def m_recommend(self, target_ids, nRec=10):
        """Return one ``s_recommend`` result per id in ``target_ids``."""
        return [self.s_recommend(tid, nRec) for tid in target_ids]
def test_user_supplied_features_accuracy():
    """Explicit user and item feature matrices must not hurt accuracy.

    Training and train-set prediction use the train feature matrices;
    test-set prediction uses the test feature matrices.
    """
    model = LightFM()
    model.fit_partial(train,
                      user_features=train_user_features,
                      item_features=train_item_features,
                      epochs=10)

    train_scores = model.predict(train.row,
                                 train.col,
                                 user_features=train_user_features,
                                 item_features=train_item_features)
    test_scores = model.predict(test.row,
                                test.col,
                                user_features=test_user_features,
                                item_features=test_item_features)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.76
def test_movielens_genre_accuracy():
    """Genre-only item features should still give reasonable accuracy.

    The metadata is loaded without per-item id features, so there must be
    fewer feature columns than items.
    """
    item_features = movielens_data.get_movielens_item_metadata(
        use_item_ids=False)

    item_count, feature_count = item_features.shape[0], item_features.shape[1]
    assert feature_count < item_count

    model = LightFM()
    model.fit_partial(train, item_features=item_features, epochs=10)

    train_scores = model.predict(train.row, train.col,
                                 item_features=item_features)
    test_scores = model.predict(test.row, test.col,
                                item_features=item_features)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.75

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.69
def test_movielens_genre_accuracy():
    """A seeded model on genre-only features must reach the lower AUC bar.

    Features are fetched with indicator features off and genre features
    on, so there must be fewer feature columns than items.
    """
    movielens = fetch_movielens(indicator_features=False,
                                genre_features=True)
    item_features = movielens['item_features']

    item_count, feature_count = item_features.shape[0], item_features.shape[1]
    assert feature_count < item_count

    model = LightFM(random_state=SEED)
    model.fit_partial(train, item_features=item_features, epochs=10)

    predicted_train = model.predict(train.row, train.col,
                                    item_features=item_features)
    predicted_test = model.predict(test.row, test.col,
                                   item_features=item_features)

    auc_on_train = roc_auc_score(train.data, predicted_train)
    assert auc_on_train > 0.75

    auc_on_test = roc_auc_score(test.data, predicted_test)
    assert auc_on_test > 0.69
def test_zero_weights_accuracy():
    """Training with all-zero sample weights must not beat random guessing.

    Runs once per loss ('logistic', 'bpr', 'warp'); train and test ROC AUC
    must both lie strictly between 0.45 and 0.55.
    """
    weights = train.copy()
    weights.data = np.zeros(train.getnnz(), dtype=np.float32)

    losses_under_test = ('logistic', 'bpr', 'warp')
    for loss in losses_under_test:
        model = LightFM(loss=loss, random_state=SEED)
        model.fit_partial(train, sample_weight=weights, epochs=10)

        predicted_train = model.predict(train.row, train.col)
        predicted_test = model.predict(test.row, test.col)

        auc_on_train = roc_auc_score(train.data, predicted_train)
        assert 0.45 < auc_on_train < 0.55

        auc_on_test = roc_auc_score(test.data, predicted_test)
        assert 0.45 < auc_on_test < 0.55
def test_movielens_both_accuracy():
    """Genre metadata plus item-specific features must not hurt accuracy.

    Accuracy with both kinds of features should be no worse than with
    item-specific features alone, though more training may be necessary;
    hence 15 epochs here.
    """
    item_features = movielens_data.get_movielens_item_metadata(
        use_item_ids=True)

    model = LightFM()
    model.fit_partial(train, item_features=item_features, epochs=15)

    train_scores = model.predict(train.row, train.col,
                                 item_features=item_features)
    test_scores = model.predict(test.row, test.col,
                                item_features=item_features)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.75
def test_movielens_both_accuracy():
    """
    Accuracy with both genre metadata and item-specific
    features should be no worse than with just item-specific
    features (though more training may be necessary).

    The item features are fetched with ``indicator_features=True`` and
    ``genre_features=True``; after 15 epochs the seeded model must score a
    ROC AUC above 0.84 on the train split and above 0.75 on the test split.
    """

    dataset = fetch_movielens(indicator_features=True, genre_features=True)
    combined_features = dataset['item_features']

    recommender = LightFM(random_state=SEED)
    recommender.fit_partial(train, item_features=combined_features, epochs=15)

    train_scores = recommender.predict(train.row,
                                       train.col,
                                       item_features=combined_features)
    test_scores = recommender.predict(test.row,
                                      test.col,
                                      item_features=combined_features)

    # (labels, predicted scores, minimum acceptable ROC AUC)
    checks = ((train.data, train_scores, 0.84),
              (test.data, test_scores, 0.75))

    for labels, scores, floor in checks:
        assert roc_auc_score(labels, scores) > floor
def test_predict_scores(num_threads=2):
    """
    Check that ``predict_score`` and ``predict_rank`` agree with
    ``predict`` and with each other.

    Covered invariants:

    * the score matrix matches row-by-row what ``predict`` returns;
    * results do not depend on ``num_threads`` or on
      ``precompute_representations``;
    * ranks match ranks recomputed from the scores;
    * passing the (dense) prediction input as ``train_interactions``
      zeroes every score;
    * a model whose embeddings and biases are all zero scores zero
      everywhere (the all-ties case);
    * an input of the wrong shape raises ``ValueError``.

    Parameters
    ----------
    num_threads : int
        Thread count used for the parallel prediction calls.
    """

    no_users, no_items = (10, 100)

    train = sp.rand(no_users, no_items, format='csr')

    model = LightFM()
    model.fit_partial(train)

    # Compute scores and check if results equal to model.predict
    predict_input = sp.csr_matrix(np.ones((no_users, no_items)))
    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()

    # Loop invariants: a plain ndarray view of the scores and the item ids.
    scores_array = np.array(scores)
    all_items = np.arange(no_items)

    for uid in range(no_users):
        scores_arr = model.predict(np.repeat(uid, no_items), all_items)
        score_slice = scores_array[uid, :]
        assert np.array_equal(score_slice, scores_arr)

    # check if precompute and parallelization work correctly
    scores_serial = model.predict_score(predict_input,
                                        num_threads=1).todense()
    scores_no_prec = model.predict_score(predict_input,
                                         num_threads=num_threads,
                                         precompute_representations=False
                                         ).todense()
    scores_ser_no_prec = model.predict_score(predict_input,
                                             num_threads=1,
                                             precompute_representations=False
                                             ).todense()
    assert np.array_equal(scores, scores_serial)
    assert np.array_equal(scores, scores_no_prec)
    assert np.array_equal(scores, scores_ser_no_prec)

    # Compute ranks and compares with ranks computed from scores
    ranks = model.predict_rank(predict_input,
                               num_threads=num_threads).todense()

    def rank_scores(s):
        # ranks from scores as in http://stackoverflow.com/a/14672797/5251962
        _, inverse = np.unique(s, return_inverse=True)
        return len(s) - 1 - (np.cumsum(np.bincount(inverse)) - 1)[inverse]

    check_ranks = np.apply_along_axis(rank_scores, 1, scores)
    assert np.array_equal(ranks, check_ranks)

    # Train set exclusions. All scores should be zero
    # if train interactions is dense.
    scores = model.predict_score(predict_input,
                                 train_interactions=predict_input).todense()
    assert np.all(scores == 0)

    # Make sure invariants hold when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()

    assert np.all(scores.min(axis=1) == 0)
    assert np.all(scores.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_score(sp.csr_matrix((5, 5)), num_threads=num_threads)
def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file."""
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file. Only point this at files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle *obj* into *path*, flushing and closing the file."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def do_fiber_training(visualization=False):
    """
    Train (or reload) a LightFM WARP model on the yarn data matrix and print,
    for every row of that matrix, the items whose normalised score is > 0.9.

    If either ``rc.RECOMMENDER_TRAINING`` or ``rc.RECOMMENDER_MODEL`` is
    missing, the matrix is loaded from ``rc.YARN_DATA_MATRIX``, split in
    half by rows into a train and a test part, a model is fitted on the
    train part, train/test AUC are printed, and both the matrix and the
    model are pickled to those two paths. Otherwise both are loaded from
    the existing pickles.

    Parameters
    ----------
    visualization : bool
        When true (and training actually happens), also print the row
        counts of the train part, the test part and the full matrix.

    Returns
    -------
    None
        All results are printed or written to disk.
    """
    have_cache = (os.path.isfile(rc.RECOMMENDER_TRAINING)
                  and os.path.isfile(rc.RECOMMENDER_MODEL))

    if not have_cache:
        yarn_data_matrix = _load_pickle(rc.YARN_DATA_MATRIX)

        # First half of the rows is used for training, second half for
        # testing; "> 0" turns the stored values into boolean interactions.
        split = int(len(yarn_data_matrix) * 0.5)
        yarn_data_train = sps.coo_matrix(yarn_data_matrix[:split]) > 0
        yarn_data_test = sps.coo_matrix(yarn_data_matrix[split:]) > 0

        if visualization:
            print('%s %s %s' % (yarn_data_train.shape[0],
                                yarn_data_test.shape[0],
                                len(yarn_data_matrix)))

        # Taken from: https://github.com/lyst/lightfm/blob/master/examples/stackexchange/hybrid_crossvalidated.ipynb
        # Set the number of threads; you can increase this
        # if you have more physical cores available.
        NUM_THREADS = 2
        NUM_COMPONENTS = 30
        NUM_EPOCHS = 3
        ITEM_ALPHA = 1e-6

        # Let's fit a WARP model: these generally have the best performance.
        model = LightFM(loss='warp',
                        item_alpha=ITEM_ALPHA,
                        no_components=NUM_COMPONENTS)

        # Run NUM_EPOCHS epochs of training.
        model = model.fit(yarn_data_train,
                          epochs=NUM_EPOCHS,
                          num_threads=NUM_THREADS)

        # Compute and print the AUC score
        train_auc = auc_score(model,
                              yarn_data_train,
                              num_threads=NUM_THREADS).mean()
        print('Collaborative filtering train AUC: %s' % train_auc)

        # We pass in the train interactions to exclude them from predictions.
        # This is to simulate a recommender system where we do not
        # re-recommend things the user has already interacted with in the
        # train set.
        test_auc = auc_score(model,
                             yarn_data_test,
                             train_interactions=yarn_data_train,
                             num_threads=NUM_THREADS).mean()
        print('Collaborative filtering test AUC: %s' % test_auc)

        _dump_pickle(yarn_data_matrix, rc.RECOMMENDER_TRAINING)
        _dump_pickle(model, rc.RECOMMENDER_MODEL)
    else:
        yarn_data_matrix = _load_pickle(rc.RECOMMENDER_TRAINING)
        model = _load_pickle(rc.RECOMMENDER_MODEL)

    translation_dict = _load_pickle(rc.YARN_TRANSLATION_DATA)

    print(len(yarn_data_matrix))
    for matrix_id in range(len(yarn_data_matrix)):
        print(matrix_id)

        # NOTE(review): the matrix row itself is passed as the item-id
        # argument of predict(); confirm this is intended rather than an
        # index array such as np.arange(number_of_items).
        predictions = model.predict(matrix_id, yarn_data_matrix[matrix_id])

        predictions += abs(np.min(predictions))  # make non-negative
        peak = np.max(predictions)               # find max for normalization
        if peak > 0:
            # Skip the division when every score is 0: it would only turn
            # the scores into NaN, which can never exceed the threshold.
            predictions /= peak

        matches = [[translation_dict[index], index, score]
                   for index, score in enumerate(predictions)
                   if score > 0.9]

        print('%s %s' % (translation_dict[matrix_id], matches))
def fit_model(week_ID, no_comp, lr, ep):
    """
    Fit a LightFM WARP model on the data of one validation week, score
    every (user, test coupon) pair and pickle the scores.

    All inputs are read from ``../Data/Validation/<week_ID>/`` and the
    predictions are written to ``d_pred_lightfm_<week_ID>.pickle`` in the
    same directory.

    args : week_ID validation test week
           no_comp, lr, ep = (int, float, int)
           number of components, learning rate, number of epochs for the
           lightFM model. These values are passed to the model as given.

    returns: d_user_pred, list_user, list_coupon
             list_coupon = list of test coupons
             list_user = list of user ID
             d_user_pred : key = user, value = array returned by
             ``model.predict`` for that user, one entry per coupon in
             list_coupon (same order)
    """

    print("Fit lightfm model for %s" % week_ID)

    week_dir = "../Data/Validation/%s" % week_ID

    # Load data
    Mui_train = spi.mmread("%s/biclass_user_item_train_mtrx_%s.mtx" % (week_dir, week_ID))
    uf = spi.mmread("%s/user_feat_mtrx_%s.mtx" % (week_dir, week_ID))
    itrf = spi.mmread("%s/train_item_feat_mtrx_%s.mtx" % (week_dir, week_ID))
    itef = spi.mmread("%s/test_item_feat_mtrx_%s.mtx" % (week_dir, week_ID))

    # Print shapes as a check
    print("user_features shape: %s,\nitem train features shape: %s,\nitem test features shape: %s" % (uf.shape, itrf.shape, itef.shape))

    # Load test coupon and user lists
    cplte = pd.read_csv("%s/coupon_list_test_validation_%s.csv" % (week_dir, week_ID))
    ulist = pd.read_csv("%s/user_list_validation_%s.csv" % (week_dir, week_ID))
    list_coupon = cplte["COUPON_ID_hash"].values
    list_user = ulist["USER_ID_hash"].values

    # Build model with the hyper-parameters supplied by the caller.
    num_threads = 4
    model = LightFM(no_components=no_comp, learning_rate=lr, loss='warp')
    model.fit_partial(Mui_train,
                      user_features=uf,
                      item_features=itrf,
                      epochs=ep,
                      num_threads=num_threads,
                      verbose=True)

    no_users, no_items = len(list_user), len(list_coupon)
    pid_array = np.arange(no_items, dtype=np.int32)

    # Create and initialise dict to store predictions
    d_user_pred = {user: [] for user in list_user}

    # Loop over users and compute predictions
    for user_id in range(no_users):
        sys.stdout.write("\rProcessing user " + str(user_id) + "/ " + str(no_users))
        sys.stdout.flush()

        # predict() takes one user id per item id, so repeat the user index.
        uid_array = np.full(no_items, user_id, dtype=np.int32)
        predictions = model.predict(uid_array,
                                    pid_array,
                                    user_features=uf,
                                    item_features=itef,
                                    num_threads=num_threads)
        user = str(list_user[user_id])
        d_user_pred[user] = predictions

    # Pickle the predictions for future use
    d_pred = {"list_coupon": list_coupon.tolist(), "d_user_pred": d_user_pred}
    # Pickle output is bytes, so the file has to be opened in binary mode.
    with open("%s/d_pred_lightfm_%s.pickle" % (week_dir, week_ID), "wb") as f:
        pickle.dump(d_pred, f, protocol=pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon