def evaluation():
    print("\nStarting evaluation of our model...")

    model = LightFM(loss='warp')

    data = fetch_movielens()
    train = data['train']
    test = data['test']

    model.fit_partial(train, epochs=30, num_threads=2)

    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10).mean()

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()

    print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
    print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
def test_movielens_genre_accuracy():
    item_features = fetch_movielens(indicator_features=False,
                                    genre_features=True)['item_features']

    assert item_features.shape[1] < item_features.shape[0]

    model = LightFM(random_state=SEED)
    model.fit_partial(train, item_features=item_features, epochs=10)

    train_predictions = model.predict(train.row, train.col,
                                      item_features=item_features)
    test_predictions = model.predict(test.row, test.col,
                                     item_features=item_features)

    assert roc_auc_score(train.data, train_predictions) > 0.75
    assert roc_auc_score(test.data, test_predictions) > 0.69
def test_movielens_genre_accuracy():
    item_features = movielens_data.get_movielens_item_metadata(
        use_item_ids=False)

    assert item_features.shape[1] < item_features.shape[0]

    model = LightFM()
    model.fit_partial(train, item_features=item_features, epochs=10)

    train_predictions = model.predict(train.row, train.col,
                                      item_features=item_features)
    test_predictions = model.predict(test.row, test.col,
                                     item_features=item_features)

    assert roc_auc_score(train.data, train_predictions) > 0.75
    assert roc_auc_score(test.data, test_predictions) > 0.69
def main():
    current_stage = 6
    model = LightFM(no_components=30)
    dataset = Dataset()

    for c in range(0, current_stage + 1):
        click_train = pd.read_csv(
            train_path + "/underexpose_train_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        click_test = pd.read_csv(
            test_path + "/underexpose_test_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        # Grow the ID mappings with both train and test clicks so that
        # test users/items are known to the dataset as well.
        dataset.fit_partial(click_train["user_id"], click_train["item_id"])
        dataset.fit_partial(click_test["user_id"], click_test["item_id"])

    num_users, num_items = dataset.interactions_shape()
    log('Num users: {}, num_items {}.'.format(num_users, num_items))
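# Sketch (assumption): one way the Dataset fitted in main() above could be turned
# into an interaction matrix and used to train the model. `dataset`, `model` and
# `click_train` are reused from main(); the helper name and training settings here
# are illustrative, not taken from the original code.
def build_and_train(dataset, model, click_train, epochs=10):
    # Map raw (user_id, item_id) pairs to internal indices and build a COO matrix.
    interactions, _ = dataset.build_interactions(
        zip(click_train["user_id"], click_train["item_id"])
    )
    # Train incrementally on this stage's interactions.
    model.fit_partial(interactions, epochs=epochs, num_threads=2)
    return model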
def graph_accuracies_cutoff(data):
    print("\nTraining models with different sampling cutoffs and recording their accuracies...")

    # Arrays used to store the values at each cutoff
    precisions = []
    aucs = []

    # Iterate over the range of cutoffs and measure the accuracies
    for c in range(1, MAX_CUTOFF):
        test_model = LightFM(loss="warp", max_sampled=c)
        current_trained = test_model.fit(data["train"], epochs=5)
        precisions.append(precision_at_k(current_trained, data["test"], k=PRECISION_K).mean())
        aucs.append(auc_score(current_trained, data["test"]).mean())

    print("Done!")
    x_axis = range(1, MAX_CUTOFF)

    # Plot the graph
    plot_accuracies(x_axis, precisions, aucs,
                    "cutoff", "magnitude of the accuracy metric",
                    ["precisions@10", "AUROC"], 3, "accuracies_cutoff.png")
def graph_accuracies_epochs(data):
    print("\nTraining models with varying epochs from 0 to %d and recording their accuracies..." % MAX_EPOCHS)

    # Arrays used to store the values at each epoch count
    precisions = []
    aucs = []

    # Set up the model
    test_model = LightFM(loss="warp")

    # Iterate over the range of epochs and measure the accuracies
    for e in range(MAX_EPOCHS):
        current_trained = test_model.fit(data["train"], epochs=e)
        precisions.append(precision_at_k(current_trained, data["test"], k=PRECISION_K).mean())
        aucs.append(auc_score(current_trained, data["test"]).mean())

    print("Done!")
    x_axis = np.arange(MAX_EPOCHS)

    # Plot the graph
    plot_accuracies(x_axis, precisions, aucs,
                    "number of epochs", "magnitude of the accuracy metric",
                    ["precisions@10", "AUROC"], 2, "accuracies_epochs.png")
def test_warp_precision_max_sampled():
    model = LightFM(learning_rate=0.05,
                    max_sampled=1,
                    loss='warp',
                    random_state=SEED)

    # Setting max_sampled to zero is equivalent to a no-op pass
    # over the training data: no negative samples are drawn,
    # so no updates are made.
    model.max_sampled = 0

    model.fit_partial(train, epochs=1)

    (train_precision, test_precision,
     full_train_auc, full_test_auc) = _get_metrics(model, train, test)

    # The AUC should be no better than random
    assert full_train_auc < 0.55
    assert full_test_auc < 0.55
def graph_accuracies_step_size(data):
    print("\nTraining models for 5 epochs at different step sizes and recording their accuracies...")

    # Arrays used to store the values at each step size
    precisions = []
    aucs = []

    # Iterate over the range of step sizes and measure the accuracies
    for s in np.arange(0.1, MAX_STEP + STEP_INCREMENT, STEP_INCREMENT):
        test_model = LightFM(loss="warp", learning_rate=s)
        current_trained = test_model.fit(data["train"], epochs=5)
        precisions.append(precision_at_k(current_trained, data["test"], k=PRECISION_K).mean())
        aucs.append(auc_score(current_trained, data["test"]).mean())

    print("Done!")
    x_axis = np.arange(0.1, MAX_STEP + STEP_INCREMENT, STEP_INCREMENT)

    # Plot the graph
    plot_accuracies(x_axis, precisions, aucs,
                    "initial step size", "magnitude of the accuracy metric",
                    ["precisions@10", "AUROC"], 2, "accuracies_step_size.png")
def test_movielens_accuracy_sample_weights():
    # Scaling weights down and learning rate up
    # by the same amount should result in
    # roughly the same accuracy
    scale = 0.5
    weights = train.copy()
    weights.data = np.ones(train.getnnz(), dtype=np.float32) * scale

    for loss, exp_score in (('logistic', 0.74),
                            ('bpr', 0.84),
                            ('warp', 0.89)):
        model = LightFM(loss=loss, random_state=SEED)
        model.learning_rate *= 1.0 / scale

        model.fit_partial(train, sample_weight=weights, epochs=10)

        (train_precision, test_precision,
         full_train_auc, full_test_auc) = _get_metrics(model, train, test)

        assert full_train_auc > exp_score
def test_user_supplied_features_accuracy():
    model = LightFM(random_state=SEED)
    model.fit_partial(train,
                      user_features=train_user_features,
                      item_features=train_item_features,
                      epochs=10)

    train_predictions = model.predict(train.row, train.col,
                                      user_features=train_user_features,
                                      item_features=train_item_features)
    test_predictions = model.predict(test.row, test.col,
                                     user_features=test_user_features,
                                     item_features=test_item_features)

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76
def test_full_batch_predict_wo_features():
    no_components = 2
    top_k = 5
    ds = RandomDataset(density=1.0)

    model = LightFM(no_components=no_components)
    model.fit_partial(ds.train)

    user_ids = [0, 1, 2]

    # Single process
    model.batch_setup({0: ds.item_ids})
    recoms = model.batch_predict(
        user_ids=user_ids,
        chunk_id=0,
        top_k=top_k,
    )

    for user_id in user_ids:
        assert user_id in recoms
        assert len(recoms[user_id][0]) == top_k
def test_movielens_excessive_regularization():
    for loss in ('logistic', 'warp', 'bpr', 'warp-kos'):
        # Should perform poorly with high regularization.
        # Check that regularization does not accumulate
        # until it reaches infinity.
        model = LightFM(no_components=10,
                        item_alpha=1.0,
                        user_alpha=1.0,
                        loss=loss,
                        random_state=SEED)
        model.fit_partial(train, epochs=10, num_threads=4)

        train_predictions = model.predict(train.row, train.col)
        test_predictions = model.predict(test.row, test.col)

        assert roc_auc_score(train.data, train_predictions) < 0.65
        assert roc_auc_score(test.data, test_predictions) < 0.65
def test_full_batch_predict():
    no_components = 2
    top_k = 5
    ds = RandomDataset()

    model = LightFM(no_components=no_components)
    model.fit_partial(ds.train,
                      user_features=ds.user_features,
                      item_features=ds.item_features)

    user_ids = [0, 1, 2]
    chunks = {0: ds.item_ids}

    # Single process
    model.batch_setup(item_chunks=chunks,
                      user_features=ds.user_features,
                      item_features=ds.item_features,
                      n_process=1)
    recoms = model.batch_predict(
        user_ids=user_ids,
        chunk_id=0,
        top_k=top_k,
    )
    for user_id in user_ids:
        assert user_id in recoms
        assert len(recoms[user_id][0]) == top_k

    initial_recoms = recoms
    model.batch_cleanup()

    # Multiple processes
    model.batch_setup(item_chunks=chunks,
                      user_features=ds.user_features,
                      item_features=ds.item_features,
                      n_process=2)
    recoms = model.batch_predict(
        user_ids=user_ids,
        chunk_id=0,
        top_k=top_k,
    )
    for user_id in user_ids:
        assert user_id in recoms
        assert_array_almost_equal(recoms[user_id], initial_recoms[user_id])
def test_warp_precision_adadelta_multithreaded():
    model = LightFM(learning_schedule='adadelta',
                    rho=0.95,
                    epsilon=1e-6,
                    loss='warp')

    model.fit_partial(train, epochs=10, num_threads=4)

    train_precision = precision_at_k(model, train, 10)
    test_precision = precision_at_k(model, test, 10)

    full_train_auc = full_auc(model, train)
    full_test_auc = full_auc(model, test)

    assert train_precision > 0.45
    assert test_precision > 0.07
    assert full_train_auc > 0.94
    assert full_test_auc > 0.9
def test_input_dtypes():
    no_users, no_items = 10, 100
    no_features = 20

    for dtype in dtypes:
        train = sp.coo_matrix((no_users, no_items), dtype=dtype)
        user_features = sp.coo_matrix((no_users, no_features), dtype=dtype)
        item_features = sp.coo_matrix((no_items, no_features), dtype=dtype)

        model = LightFM()
        model.fit_partial(train,
                          user_features=user_features,
                          item_features=item_features)

        model.predict(
            np.random.randint(0, no_users, 10).astype(np.int32),
            np.random.randint(0, no_items, 10).astype(np.int32),
            user_features=user_features,
            item_features=item_features,
        )
def test_feature_inference_fails():
    # On predict, if we try to use feature inference and supply
    # higher IDs than the number of features supplied to fit,
    # we should complain.
    no_users, no_items = 10, 100
    no_features = 20

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)
    user_features = sp.csr_matrix((no_users, no_features), dtype=np.int32)
    item_features = sp.csr_matrix((no_items, no_features), dtype=np.int32)

    model = LightFM()
    model.fit_partial(train,
                      user_features=user_features,
                      item_features=item_features)

    with pytest.raises(ValueError):
        model.predict(np.array([no_features], dtype=np.int32),
                      np.array([no_features], dtype=np.int32))
def objective(params):
    # Unpack the hyperparameters
    epochs, learning_rate, no_components = params

    model = LightFM(loss=loss,
                    random_state=random_state,
                    learning_rate=learning_rate,
                    no_components=no_components)
    model.fit(train, epochs=epochs, num_threads=4, verbose=True)

    aucs = auc_score(model, test, num_threads=4)
    mean_auc = np.mean(aucs)

    # Make negative because we want to _minimize_ the objective
    out = -mean_auc

    # Guard against degenerate numerical results
    if np.abs(out + 1) < 0.01 or out < -1.0:
        return 0.0
    else:
        return out
def objective(params):
    # Unpack the hyperparameters
    epochs, learning_rate, no_components, item_alpha, scale = params  # 'k_os'
    user_alpha = item_alpha * scale

    model = LightFM(loss=loss,
                    random_state=2019,
                    learning_rate=learning_rate,
                    no_components=no_components,
                    user_alpha=user_alpha,
                    item_alpha=item_alpha)
    model.fit(train,
              item_features=item_features,
              epochs=epochs,
              num_threads=threads,
              verbose=True)

    patks = function_to_optimize(model, test,
                                 item_features=item_features,
                                 num_threads=threads)
    mapatk = np.mean(patks)

    # Make negative because we want to minimize the objective
    out = -mapatk

    # Guard against degenerate numerical results
    if np.abs(out + 1) < 0.01 or out < -1.0:
        return 0.0
    else:
        return out
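# Sketch (assumption): how an objective like the one above might be handed to a
# hyperparameter search. The use of scikit-optimize's forest_minimize and the
# search-space bounds below are illustrative choices, not taken from the original
# code; objective() still relies on the globals (train, test, loss, threads, ...)
# defined alongside it.
from skopt import forest_minimize
from skopt.space import Integer, Real

space = [
    Integer(5, 50),                        # epochs
    Real(1e-3, 0.5, prior='log-uniform'),  # learning_rate
    Integer(10, 100),                      # no_components
    Real(1e-7, 1e-3, prior='log-uniform'), # item_alpha
    Real(0.01, 10.0, prior='log-uniform'), # scale
]

# Each call to objective() trains a model and returns the negated validation score,
# so the minimizer searches for the best-scoring hyperparameters.
result = forest_minimize(objective, space, n_calls=50, random_state=2019)
print('Best score: {:.4f}'.format(-result.fun))
print('Best parameters: {}'.format(result.x))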
def trainTheModel():
    movielens = fetch_movielens()
    train = movielens['train']
    test = movielens['test']
    user_features = None
    item_features = movielens['item_features']

    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit_partial(train, item_features=item_features, epochs=10)

    train_precision = precision_at_k(
        model, train, item_features=item_features, k=10).mean()
    test_precision = precision_at_k(
        model, test, item_features=item_features, k=10).mean()
    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()

    # Report how well the trained model fits the data
    print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
    print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

    return model, user_features, item_features, movielens['item_labels'], movielens['item_feature_labels']
def test_not_enough_features_fails():
    no_users, no_items = 10, 100
    no_features = 20

    train = sp.coo_matrix((no_users, no_items), dtype=np.int32)

    user_features = sp.csr_matrix((no_users - 1, no_features), dtype=np.int32)
    item_features = sp.csr_matrix((no_items - 1, no_features), dtype=np.int32)

    model = LightFM()
    with pytest.raises(Exception):
        model.fit_partial(train,
                          user_features=user_features,
                          item_features=item_features)
def lightfm_train(train, num_components, num_epochs):
    '''Train a LightFM collaborative filtering model from a training set.

    Returns: LightFM recommendation system model.'''
    # Set parameters for the model
    NUM_THREADS = 1
    NUM_COMPONENTS = num_components
    NUM_EPOCHS = num_epochs
    ITEM_ALPHA = 1e-6  # Recommended by LightFM

    # Let's fit a WARP model: these generally have the best performance.
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    # Fit the model
    model = model.fit(train,
                      epochs=NUM_EPOCHS,
                      num_threads=NUM_THREADS)
    return model
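# Sketch (assumption): minimal usage of lightfm_train() above with the MovieLens
# data shipped with LightFM. fetch_movielens() and auc_score() are the standard
# LightFM helpers; the hyperparameter values here are illustrative.
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import auc_score

data = fetch_movielens()
model = lightfm_train(data['train'], num_components=30, num_epochs=10)
print('Test AUC: {:.3f}'.format(auc_score(model, data['test']).mean()))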
def build(self):
    print('start', datetime.datetime.now())
    df = pd.read_csv(self.source_file)

    number_of_users = df['user_id'].max()
    number_of_items = df['item_id'].max()

    # Build the user-item interaction matrix from the ratings, sized so that
    # the largest user and item IDs fit.
    train = sp.coo_matrix((df['rating'], (df['user_id'], df['item_id'])),
                          shape=(number_of_users + 1, number_of_items + 1))

    # Load the MovieLens 100k dataset. Only five
    # star ratings are treated as positive.
    # data = fetch_movielens(min_rating=5.0)

    # Instantiate and train the model
    model = LightFM(loss='warp')
    model.fit(train, epochs=30, num_threads=2)

    prediction = model.predict(np.array([3]), np.array([2]))
    print(prediction)

    pickle.dump(model, open('lightfm.p', 'wb'))

    # Evaluate the trained model
    # test_precision = precision_at_k(model, data['test'], k=5).mean()
    # print(test_precision)

    return model
def _train(self, verbose=True):
    start_time = time.time()
    if verbose:
        print("LightFM training started!")

    # Let's fit a WARP model: these generally have the best performance.
    self.model = LightFM(loss=self.loss,
                         item_alpha=self.item_alpha,
                         user_alpha=self.user_alpha,
                         learning_schedule=self.learning_schedule,
                         no_components=self.num_components)

    # Fit the model for the configured number of epochs and time it.
    self.model = self.model.fit(self.URM,
                                epochs=self.epochs,
                                num_threads=self.threads)

    if verbose:
        print("LightFM model fitted in {:.2f} seconds".format(
            time.time() - start_time))
def test_matrix_types():
    mattypes = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix)
    dtypes = (np.int32, np.int64, np.float32, np.float64)

    no_users, no_items = 10, 100
    no_features = 20

    for mattype in mattypes:
        for dtype in dtypes:
            train = mattype((no_users, no_items), dtype=dtype)
            user_features = mattype((no_users, no_features), dtype=dtype)
            item_features = mattype((no_items, no_features), dtype=dtype)

            model = LightFM()
            model.fit_partial(train,
                              user_features=user_features,
                              item_features=item_features)

            model.predict(np.random.randint(0, no_users, 10).astype(np.int32),
                          np.random.randint(0, no_items, 10).astype(np.int32),
                          user_features=user_features,
                          item_features=item_features)

            model.predict_rank(train,
                               user_features=user_features,
                               item_features=item_features)
def init_model(self, no_components=10, k=5, n=10, learning_schedule='adagrad',
               loss='logistic', learning_rate=0.05, rho=0.95, epsilon=1e-06,
               item_alpha=0.0, user_alpha=0.0, max_sampled=10, random_state=None):
    """
    Initialize the model to be evaluated.

    :param no_components: (int, optional) the dimensionality of the feature latent embeddings.
    :param k: (int, optional) for k-OS training, the k-th positive example will be selected
        from the n positive examples sampled for every user.
    :param n: (int, optional) for k-OS training, maximum number of positives sampled for each update.
    :param learning_schedule: (string, optional) one of ('adagrad', 'adadelta').
    :param loss: (string, optional) one of ('logistic', 'bpr', 'warp', 'warp-kos'): the loss function.
    :param learning_rate: (float, optional) initial learning rate for the adagrad learning schedule.
    :param rho: (float, optional) moving average coefficient for the adadelta learning schedule.
    :param epsilon: (float, optional) conditioning parameter for the adadelta learning schedule.
    :param item_alpha: (float, optional) L2 penalty on item features.
    :param user_alpha: (float, optional) L2 penalty on user features.
    :param max_sampled: (int, optional) maximum number of negative samples used during WARP fitting.
    :param random_state: (int seed, RandomState instance, or None)
    """
    self.model = LightFM(no_components=no_components, k=k, n=n,
                         learning_schedule=learning_schedule, loss=loss,
                         learning_rate=learning_rate, rho=rho, epsilon=epsilon,
                         item_alpha=item_alpha, user_alpha=user_alpha,
                         max_sampled=max_sampled, random_state=random_state)
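# Sketch (assumption): how init_model() above might be called on the wrapping
# evaluator object (its class is not shown here), configuring a WARP model with
# illustrative hyperparameter values.
evaluator.init_model(no_components=50,
                     loss='warp',
                     learning_rate=0.05,
                     item_alpha=1e-6,
                     user_alpha=1e-6,
                     max_sampled=20,
                     random_state=42)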
def __init__(
    self,
    URM_train,
    ICM_train,
    no_components=1024,
    k=5,
    n=10,
    learning_schedule="adagrad",
    loss="logistic",
    learning_rate=0.05,
    rho=0.95,
    epsilon=1e-06,
    item_alpha=0.0,
    user_alpha=0.0,
    max_sampled=10,
    random_state=None,
):
    super(LightFMRecommender, self).__init__(URM_train)

    self.URM_train = check_matrix(URM_train.copy(), "csr")
    self.ICM_train = check_matrix(ICM_train.copy(), "csr")

    # ICM_train_dense = pd.DataFrame(self.ICM_train.todense())
    # ICM_train_dense.index = ICM_train_dense.index.map(lambda x: item_mapper[str(x)])
    # self.ICM_train = sps.csr_matrix(ICM_train_dense.values)

    self.model = LightFM(
        no_components=no_components,
        k=k,
        n=n,
        learning_schedule=learning_schedule,
        loss=loss,
        learning_rate=learning_rate,
        rho=rho,
        epsilon=epsilon,
        item_alpha=item_alpha,
        user_alpha=user_alpha,
        max_sampled=max_sampled,
        random_state=random_state,
    )
def test_logistic_precision():
    model = LightFM()
    model.fit_partial(train, epochs=10)

    train_precision = precision_at_k(model, train, 10)
    test_precision = precision_at_k(model, test, 10)

    full_train_auc = full_auc(model, train)
    full_test_auc = full_auc(model, test)

    assert train_precision > 0.3
    assert test_precision > 0.03
    assert full_train_auc > 0.79
    assert full_test_auc > 0.74
def test_warp_precision():
    model = LightFM(learning_rate=0.05,
                    loss='warp',
                    random_state=SEED)
    model.fit_partial(train, epochs=10)

    (train_precision, test_precision,
     full_train_auc, full_test_auc) = _get_metrics(model, train, test)

    assert train_precision > 0.45
    assert test_precision > 0.07
    assert full_train_auc > 0.94
    assert full_test_auc > 0.9
def train(self, X, y, lemmapos_list):
    # MODEL
    self.clf = LightFM(no_components=self.num_components,
                       learning_schedule='adagrad',
                       loss='warp',
                       learning_rate=0.05,
                       epsilon=1e-06,
                       item_alpha=0.0,
                       user_alpha=1e-6,
                       max_sampled=self.max_sampled,
                       random_state=None)

    # DATA
    # Training data:
    # X: list of vectors; each vector is the initial representation of a sentence
    #    (more precisely, of a predicate with its context)
    #    --> these are the user features in the training set.
    # y: list of frame IDs; the frame IDs are the labels for the representations
    #    --> these are used to create the interaction matrix for the training set
    #        so that LightFM can work with it.
    # y_interactionLabels: interaction matrix of size
    #    (num sentences in y) x (num frames), with 1 indicating the frame label
    #    for a predicate in its context sentence.
    y_interactionLabels = self.createInteractionMatrix(y)

    # FIT
    self.clf = self.clf.fit(interactions=y_interactionLabels,
                            user_features=X,
                            item_features=None,
                            sample_weight=None,
                            epochs=self.num_epochs,
                            num_threads=2,
                            verbose=True)
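# Sketch (assumption): one way the createInteractionMatrix() method used above
# could be implemented, following the comment in train(): a
# (num sentences) x (num frames) matrix with a 1 marking the gold frame for each
# predicate-in-context. The attribute self.num_frames (total number of frame IDs)
# is assumed; the original implementation is not shown.
import numpy as np
from scipy import sparse

def createInteractionMatrix(self, y):
    n_sentences = len(y)
    rows = np.arange(n_sentences)           # one row per predicate/sentence
    cols = np.asarray(y, dtype=np.int32)    # gold frame ID for each row
    data = np.ones(n_sentences, dtype=np.float32)
    return sparse.coo_matrix((data, (rows, cols)),
                             shape=(n_sentences, self.num_frames))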
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30, n_jobs=4):
    '''
    Function to run the matrix-factorization algorithm.

    Required input:
    - interactions = dataset created by create_interaction_matrix
    - n_components = number of embeddings used to define items and users
    - loss = loss function; other options are 'logistic' and 'bpr'
    - k = for k-OS training, the k-th positive example selected for each user
    - epoch = number of epochs to run
    - n_jobs = number of cores used for execution

    Expected output:
    - Model = trained model
    '''
    x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components=n_components, loss=loss, k=k)
    model.fit(x, epochs=epoch, num_threads=n_jobs)
    return model
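# Sketch (assumption): minimal usage of runMF() above. The interactions DataFrame
# is assumed to be a users x items pivot table (as produced by a helper like the
# create_interaction_matrix mentioned in the docstring); the values are toy data.
import pandas as pd

interactions = pd.DataFrame(
    [[1, 0, 3], [0, 5, 0], [4, 0, 0]],
    index=['user_a', 'user_b', 'user_c'],
    columns=['item_x', 'item_y', 'item_z'],
)
model = runMF(interactions, n_components=10, loss='warp', epoch=5, n_jobs=2)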