def test_bloom(compression_ratio, expected_rmse):
    """An explicit model built on Bloom embeddings should hit the expected RMSE."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions, random_state=RANDOM_STATE)

    # Hashed (compressed) embedding layers for users and items.
    user_embeddings = BloomEmbedding(interactions.num_users, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    item_embeddings = BloomEmbedding(interactions.num_items, 32,
                                     compression_ratio=compression_ratio,
                                     num_hash_functions=2)
    network = BilinearNet(interactions.num_users,
                          interactions.num_items,
                          user_embedding_layer=user_embeddings,
                          item_embedding_layer=item_embeddings)

    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-2,
                                       l2=1e-5,
                                       representation=network,
                                       use_cuda=CUDA)
    model.fit(train)
    print(model)

    rmse = rmse_score(model, test)
    print(rmse)

    # EPSILON tolerance absorbs run-to-run training noise.
    assert rmse - EPSILON < expected_rmse
def train_spotlight_models(train, test, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, is_save=False):
    """Grid-train ExplicitFactorizationModel over all parameter combinations.

    For every (embedding_dim, n_iter, batch_size, l2, learning_rate)
    combination, fits a model on ``train``, records RMSE on ``train`` and
    ``test``, collects predictions on all three datasets, and (if
    ``is_save``) saves the model under ``models/``.

    Returns:
        (preds_train_trains, preds_train_tests, preds_tests,
         train_rmses, test_rmses)
    """
    # RMSE accumulators for train and test sets.
    train_rmses = np.array([])
    test_rmses = np.array([])
    # Per-model predictions on train, test and dataset_testing.
    preds_train_trains = []
    preds_train_tests = []
    preds_tests = []
    for embedding_dim in embedding_dims:
        for n_iter in n_iters:
            for batch_size in batch_sizes:
                for l2 in l2s:
                    for learning_rate in learning_rates:
                        # Use GPU when available, CPU otherwise.
                        model = ExplicitFactorizationModel(
                            loss='regression',
                            embedding_dim=embedding_dim,
                            n_iter=n_iter,
                            batch_size=batch_size,
                            l2=l2,
                            learning_rate=learning_rate,
                            use_cuda=torch.cuda.is_available())
                        # Announce which combination is being trained.
                        print("embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
                        model.fit(train, verbose=True)
                        train_rmse = rmse_score(model, train)
                        test_rmse = rmse_score(model, test)
                        train_rmses = np.append(train_rmses, train_rmse)
                        test_rmses = np.append(test_rmses, test_rmse)
                        print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))
                        if is_save:
                            torch.save(model, "models/embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
                        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
                        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
                        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))
    return preds_train_trains, preds_train_tests, preds_tests, train_rmses, test_rmses
def best_params_spotlight(losses, n_iters, batch_sizes, l2s, learning_rates, embedding_dims, train_data, t=None):
    """Grid-search ExplicitFactorizationModel hyper-parameters.

    Trains one model per combination of the supplied grids on a fixed
    random split of ``train_data`` and records the validation RMSE of each.

    Returns:
        dict mapping validation RMSE -> the parameter dict (including the
        fitted model) that produced it.
    """
    # BUG FIX: a default of ``t=Timer()`` is evaluated once at definition
    # time and silently shared by every call; create one per call instead.
    if t is None:
        t = Timer()
    rmses = dict()
    t.start()
    for loss in losses:
        for n_iter in n_iters:
            for batch_size in batch_sizes:
                for l2 in l2s:
                    for learning_rate in learning_rates:
                        for embedding_dim in embedding_dims:
                            model = ExplicitFactorizationModel(
                                # BUG FIX: was hard-coded to 'regression',
                                # which ignored the ``losses`` grid entirely.
                                loss=loss,
                                embedding_dim=embedding_dim,  # latent dimensionality
                                n_iter=n_iter,  # number of epochs of training
                                batch_size=batch_size,  # minibatch size
                                l2=l2,  # strength of L2 regularization
                                learning_rate=learning_rate,
                                use_cuda=torch.cuda.is_available())
                            # BUG FIX: build a fresh dict per combination; the
                            # original mutated and re-stored a single dict, so
                            # every rmses entry aliased the last parameter set.
                            params = {'loss': loss,
                                      'n_iter': n_iter,
                                      'batch_size': batch_size,
                                      'l2': l2,
                                      'learning_rate': learning_rate,
                                      'embedding_dim': embedding_dim,
                                      'model': model}
                            train_tr_data, test_tr_data = random_train_test_split(
                                train_data, random_state=np.random.RandomState(42))
                            model.fit(train_tr_data, verbose=True)
                            rmse = rmse_score(model, test_tr_data)
                            rmses[rmse] = params
                            print(
                                "-----------Time: {}, Loss: {}, n_iter: {}, l2: {}, batch_size: {}, learning_rate: {}, embedding_dim: {}, rmse: {}-------------\n\n"
                                .format(t.stop(), loss, n_iter, l2, batch_size,
                                        learning_rate, embedding_dim, rmse))
                            # restart timer for the next combination
                            t.start()
    # BUG FIX: the original returned nothing, discarding all results.
    return rmses
def train(user_ids, item_ids, ratings, num_dimensions, verbose):
    """Fit a logistic explicit-factorization model; return its user embeddings.

    Returns the learned user embedding matrix as a NumPy array.
    """
    dataset = Interactions(
        np.array(user_ids, dtype=np.int32),
        np.array(item_ids, dtype=np.int32),
        ratings=np.array(ratings, dtype=np.float32))
    # GPU only when the module-level ``device`` is not the CPU.
    use_gpu = device.type != 'cpu'
    model = ExplicitFactorizationModel(loss='logistic',
                                       use_cuda=use_gpu,
                                       embedding_dim=num_dimensions)
    model.fit(dataset, verbose=verbose)
    return model._net.user_embeddings.weight.detach().cpu().numpy()
def test_explicit_serialization(data):
    """A serialized-then-reloaded model must score identically to the original."""
    train, test = data
    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=3,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-5,
                                       use_cuda=CUDA)
    model.fit(train)
    # Round-tripping through _reload must not change predictions.
    assert rmse_score(model, test) == rmse_score(_reload(model), test)
def build_model(data, loss, embedding_dim, n_iter, batch_size, l2, learning_rate, **kwargs):
    """Fit an ExplicitFactorizationModel on a fixed split of ``data``.

    Extra keyword arguments are accepted (and ignored) for interface
    compatibility.

    Returns:
        RMSE on the held-out test split.
    """
    model = ExplicitFactorizationModel(
        loss=loss,
        embedding_dim=embedding_dim,  # latent dimensionality
        n_iter=n_iter,                # number of training epochs
        batch_size=batch_size,        # minibatch size
        l2=l2,                        # strength of L2 regularization
        learning_rate=learning_rate,
        use_cuda=torch.cuda.is_available())
    # Fixed seed keeps the split reproducible across calls.
    train, test = random_train_test_split(
        data, random_state=np.random.RandomState(42))
    model.fit(train, verbose=True)
    return rmse_score(model, test)
def test_poisson():
    """Poisson-loss explicit model should reach RMSE below 1.0 on ML-100K."""
    dataset = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(dataset, random_state=RANDOM_STATE)
    model = ExplicitFactorizationModel(loss='poisson',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)
    assert rmse_score(model, test) < 1.0
def test_poisson():
    # Poisson-loss explicit model should reach RMSE below 1.0 on ML-100K.
    # NOTE(review): this appears byte-for-byte identical to another
    # test_poisson in this file — likely an accidental duplicate; confirm.
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions,
                                          random_state=RANDOM_STATE)
    model = ExplicitFactorizationModel(loss='poisson',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)
    rmse = rmse_score(model, test)
    assert rmse < 1.0
def test_check_input():
    """fit() must reject data whose ids fall outside the trained model's range."""
    interactions = movielens.get_movielens_dataset('100K')
    train, test = random_train_test_split(interactions, random_state=RANDOM_STATE)
    # A single epoch is enough to fix the model's id ranges.
    model = ExplicitFactorizationModel(loss='regression',
                                       n_iter=1,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6)
    model.fit(train)
    # Make the data incompatible with the already-fitted model by
    # introducing an unseen user id.
    train.user_ids[0] = train.user_ids.max() + 1
    with pytest.raises(ValueError):
        model.fit(train)
def train_spotlight_models_using_all_data(train, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, verbose=True):
    """Train one model per hyper-parameter combination on the full train set.

    Every fitted model is saved to disk under ``models_all_data/`` and its
    predictions on ``dataset_testing`` are collected.

    Returns:
        list of prediction arrays, one per parameter combination.
    """
    preds_tests = []
    for embedding_dim in embedding_dims:
        for n_iter in n_iters:
            for batch_size in batch_sizes:
                for l2 in l2s:
                    for learning_rate in learning_rates:
                        # Use GPU when available, CPU otherwise.
                        model = ExplicitFactorizationModel(
                            loss='regression',
                            embedding_dim=embedding_dim,
                            n_iter=n_iter,
                            batch_size=batch_size,
                            l2=l2,
                            learning_rate=learning_rate,
                            use_cuda=torch.cuda.is_available())
                        if verbose:
                            print("embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
                        model.fit(train, verbose=verbose)
                        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))
                        # Persist each fitted model for later reuse.
                        torch.save(model, "models_all_data/embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate))
    return preds_tests
def test_logistic():
    """Logistic loss on binarized (-1/1) ML-100K ratings stays under 1.05 RMSE."""
    interactions = movielens.get_movielens_dataset('100K')
    # Binarize: ratings above 3 become 1.0, the rest 0.0 ...
    interactions.ratings = (interactions.ratings > 3).astype(np.float32)
    # ... then map (0, 1) onto (-1, 1) for the logistic loss.
    interactions.ratings = interactions.ratings * 2 - 1
    train, test = random_train_test_split(interactions, random_state=RANDOM_STATE)
    model = ExplicitFactorizationModel(loss='logistic',
                                       n_iter=10,
                                       batch_size=1024,
                                       learning_rate=1e-3,
                                       l2=1e-6,
                                       use_cuda=CUDA)
    model.fit(train)
    # EPSILON tolerance absorbs run-to-run training noise.
    assert rmse_score(model, test) - EPSILON < 1.05
def obtener_modelos(self):
    """Obtain, train, and save the chosen model.

    Only used by the text interface. Trains the model selected by
    ``self.opcion_modelo`` on the module-level ``train`` interactions with
    default parameters, stores it in the module-level ``modelo``, and
    persists it via ``guardar_modelos_dl``.
    """
    global train, modelo
    if self.opcion_modelo == 1:
        # Explicit factorization with logistic loss.
        modelo = ExplicitFactorizationModel(loss='logistic',
                                            use_cuda=torch.cuda.is_available())
        modelo.fit(train, verbose=True)
        guardar_modelos_dl(modelo, 'el modelo de factorización explícito')
    elif self.opcion_modelo == 2:
        # Implicit factorization with BPR loss.
        modelo = ImplicitFactorizationModel(loss='bpr',
                                            use_cuda=torch.cuda.is_available())
        modelo.fit(train, verbose=True)
        guardar_modelos_dl(modelo, 'el modelo de factorización implícito')
    else:
        # Implicit sequence model with pooling representation.
        modelo = ImplicitSequenceModel(loss='bpr',
                                       representation='pooling',
                                       use_cuda=torch.cuda.is_available())
        modelo.fit(train, verbose=True)
        # BUG FIX: the model saved here is an *implicit* sequence model,
        # but the message previously said 'explícito'.
        guardar_modelos_dl(modelo, 'el modelo de secuencia implícito')
from spotlight.interactions import Interactions

# Build a Spotlight Interactions dataset from the 'lite' table's columns.
user_ids = np.array(lite['user']).astype(np.int32)
item_ids = np.array(lite['item']).astype(np.int32)
ratings = np.array(lite['rating']).astype(np.float32)
times = np.array(lite['time']).astype(np.int32)
dataset = Interactions(user_ids, item_ids, ratings, times)

# Prepare train test (split by user; the random alternative is kept below).
train, test = user_based_train_test_split(dataset)
# train, test = random_train_test_split(dataset)

# Test baseline: plain explicit matrix factorization.
model = ExplicitFactorizationModel(n_iter=20)
model.fit(train, verbose=True)
print('RMSE', rmse_score(model, test))

from scipy.sparse import coo_matrix

# Sparse user x item ratings matrix in CSR form.
ratings = coo_matrix((dataset.ratings,
                      (dataset.user_ids, dataset.item_ids)),
                     shape=(dataset.num_users, dataset.num_items)).tocsr()

# Sequence datasets of fixed length SEQ_LEN (defined elsewhere in the file).
train_seq = train.to_sequence(SEQ_LEN)
test_seq = test.to_sequence(SEQ_LEN)

# LSTM sequence model; batch_size=1 processes one sequence at a time.
# NOTE(review): 'ExplicitSequenceModel' is presumably a project-local class;
# confirm it exists (Spotlight itself ships ImplicitSequenceModel).
model = ExplicitSequenceModel(n_iter=30, representation='lstm', batch_size=1)
model.fit(train_seq, ratings, verbose=True)

# Pick the first training sequence's user ids for inspection.
SEQ_ID = 0
user_batch = train_seq.user_ids[SEQ_ID]
# Mean reciprocal rank of the fraction-of-5-star-ratings baseline.
MRR_fra_rating_5 = RR_fra_rating_5.mean()
print("MRR of fraction of 5* ratings: ", MRR_fra_rating_5)

"""#### So, the best model of Question 1 is Fraction of 5* ratings(fra_rating_5)

## Question 2
"""

from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.factorization.implicit import ImplicitFactorizationModel

#ExplicitFactorizationModel
# Fit on the explicit training interactions, then report the mean
# reciprocal rank on the validation set.
emodel = ExplicitFactorizationModel(n_iter=10, embedding_dim=32, use_cuda=False)
emodel.fit(exp_train, verbose=True)
score_emodel = scoreAll(emodel)
print(calc_reciprank(exp_validation, score_emodel, train=exp_train).mean())

#ImplicitFactorizationModel
# Same budget but BPR loss, for comparison against the explicit model.
imodel = ImplicitFactorizationModel(n_iter=10, loss='bpr', embedding_dim=32, use_cuda=False)
imodel.fit(exp_train, verbose=True)
score_imodel_32_on_exp = scoreAll(imodel)
print(calc_reciprank(exp_validation, score_imodel_32_on_exp, train=exp_train).mean())

#ImplicitFactorizationModel is more effective
#tune the number of latent factors
# Build id/rating arrays from the beer-ratings dataframe.
item_ids = np.array(df['beer'])
ratings = np.array(df['rating']).astype('float32')

#Explicit Factorization Model
explicit_interactions = Interactions(user_ids, item_ids, ratings)
explicit_interactions.tocoo().todense().shape

explicit_model = ExplicitFactorizationModel(loss='regression',
                                            embedding_dim=32,
                                            n_iter=10,
                                            batch_size=250,
                                            learning_rate=0.01)
explicit_model.fit(explicit_interactions)

# Dense user x item interaction matrix, one row per user.
user_df = pd.DataFrame(explicit_interactions.tocoo().todense())

#Spotlight Model
#This function uses the Spotlight model to make recommendations for a given user
def spotlight_predictions(user_id):
    # Single user's row of the dense interaction matrix.
    app_user = pd.DataFrame(user_df.iloc[user_id]).T
    app_user_preds = pd.DataFrame({
        'beer': beer_encoder.classes_,
        'value': explicit_model.predict(np.array(app_user)), #needs to be passed as an array
    }).sort_values('value').tail(20)
    # NOTE(review): snippet appears truncated here — no return statement is
    # visible, so app_user_preds is currently discarded; confirm upstream.
embedding_dim=5, # latent dimensionality n_iter=10, # number of epochs of training batch_size=256, # minibatch size l2=1e-9, # strength of L2 regularization learning_rate=2e-2, use_cuda=torch.cuda.is_available()) # model = ImplicitFactorizationModel(loss='bpr', # embedding_dim=128, # latent dimensionality # n_iter=10, # number of epochs of training # batch_size=256, # minibatch size # l2=1e-9, # strength of L2 regularization # learning_rate=1e-2, # use_cuda=torch.cuda.is_available()) from spotlight.cross_validation import random_train_test_split train, test = random_train_test_split(dataset, random_state=np.random.RandomState(42)) print('Split into \n {} and \n {}.'.format(train, test)) model.fit(train, verbose=True) torch.save(model, 'spotlight.model') from spotlight.evaluation import rmse_score train_rmse = rmse_score(model, train) test_rmse = rmse_score(model, test) print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse)) predictions = model.predict(test.user_ids, test.item_ids) print(((predictions > 0.5) == (test.ratings > 0)).sum() / len(predictions))
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import rmse_score
from spotlight.factorization.explicit import ExplicitFactorizationModel

# Minimal end-to-end example: load MovieLens 100K, fit a one-epoch
# explicit factorization model, and report the held-out RMSE.
dataset = get_movielens_dataset(variant='100K')
train, test = random_train_test_split(dataset)

model = ExplicitFactorizationModel(n_iter=1)
model.fit(train)

print(rmse_score(model, test))
## Build interactions object (building torch tensors underneath) log.info("Building interactions object") interactions = Interactions( item_ids=ratings_df.movie_int.astype(np.int32).values, user_ids=ratings_df.user_int.astype(np.int32).values, num_items=len(ratings_df.movie_int.unique()), num_users=len(ratings_df.user_int.unique()), ratings=ratings_df.vote.astype(np.float32).values) ## Build Explicit Matrix Factorization Model # We use logistic loss since the interaction rating is binary (-1, 1) log.info( "Training the recommendation engine model using Explicit Matrix Factorization" ) model = ExplicitFactorizationModel(loss='logistic', n_iter=10) model.fit(interactions) # Prepare to get predictions out for each user full_movies = movie_ind.movie_int.unique() recommendations = [] # Convert datetime to string to ensure serialization success timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] batch_count = 0 for device, user_row in user_ind.iterrows(): # Get list of all movies this user voted on log.info("Generating recommendations for user {}".format(device)) user = user_row.user_int user_votes = ratings_df[ratings_df.user_int == user].movie_int.unique() # Calculate difference in the two lists - rate those movies only m = np.setdiff1d(full_movies, user_votes)
def trainModelUntilOverfit(dataset, modelSteps, modelIterations, numberDataSplits, embedding_dim, learning_rate):
    """Incrementally train an explicit model, stopping when it overfits.

    The training set is optionally partitioned into ``numberDataSplits``
    chunks; each loop iteration fits one chunk (or the whole set) and
    records train/test RMSE. Training stops early when ``stopTraining``
    signals overfitting.

    Returns:
        (model, rmseResults) — the fitted model and an (steps, 2) array of
        [train RMSE, test RMSE] per completed step.
    """
    numUsers = dataset.num_users
    numMovies = dataset.num_items
    train, test = random_train_test_split(dataset, 0.2)
    print('Split into \n {} and \n {}.'.format(train, test))
    # Fixed seed so repeated runs are comparable.
    seed = np.random.RandomState(seed=55555)
    model = ExplicitFactorizationModel(n_iter=modelIterations,
                                       embedding_dim=embedding_dim,
                                       learning_rate=learning_rate,
                                       random_state=seed)
    rmseResults = np.empty((modelSteps * numberDataSplits, 2))
    indexPreviousClosest = ["0"]
    if (numberDataSplits > 1):
        arraySplits = dataSplit(train, numberDataSplits)
        print("Data set split into", len(arraySplits), "*", (arraySplits[1]))
    # Each model step fits the entire dataset.
    arrayOfSteps = []
    splitCounter = 0
    fullStepCounter = 0  # increases each time the entire data set has been visited
    currentStep = 0      # increases at every split of the data set (does not reset)
    for i in range(modelSteps * numberDataSplits):
        print("\nStarting step", fullStepCounter)
        print("Data split", splitCounter)
        if (numberDataSplits == 1):
            model.fit(train, verbose=True)
        elif (numberDataSplits > 1):
            print(arraySplits[splitCounter])
            model.fit(arraySplits[splitCounter], verbose=True)
        else:
            print("Invalid number of data splits")
            break
        # Predictions for any user are made for all items; shape (numUsers, numMovies).
        # NOTE(review): modelPredict is never read again inside this function
        # (the t-SNE comment suggests external use) — confirm before removing.
        modelPredict = np.empty((numUsers, numMovies))
        for userIndex in range(numUsers):
            modelPredict[userIndex, :] = model.predict(userIndex)
        # Transpose for t-SNE formatting (should be more rows than columns).
        modelPredict = modelPredict.T
        # Measure the model's effectiveness (how good predictions are).
        # BUG FIX: the original evaluated rmse_score(model, test) twice —
        # once into an unused 'rmse' variable — doubling the test-set
        # evaluation cost each step; compute it once per set instead.
        rmseTrain = rmse_score(model, train)
        rmseTest = rmse_score(model, test)
        print("RMSE TEST:", rmseTest, "\n")
        rmseResults[i, :] = [rmseTrain, rmseTest]
        arrayOfSteps += [i]
        # Stop early once the overfitting criterion triggers; trim results.
        if (stopTraining(rmseResults, arrayOfSteps)):
            rmseResults = rmseResults[:len(arrayOfSteps)]
            break
        if (numberDataSplits > 1):
            splitCounter += 1
            # Wrap around to the first split; one full pass completed.
            if (splitCounter >= len(arraySplits)):
                splitCounter = 0
                fullStepCounter += 1
        currentStep += 1
    return (model, rmseResults)