def train_spotlight_models(train, test, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, is_save = False):
    """Grid-search ExplicitFactorizationModel over every hyper-parameter combination.

    train / test / dataset_testing are spotlight Interactions objects.
    For each combination of (embedding_dim, n_iter, batch_size, l2,
    learning_rate) a model is fitted on `train`, scored (RMSE) on `train`
    and `test`, and used to predict all three datasets.  When `is_save`
    is True each fitted model is written under the models/ directory.

    Returns: (preds_train_trains, preds_train_tests, preds_tests,
    train_rmses, test_rmses) — three lists of prediction arrays, one
    entry per combination, plus the two RMSE arrays.
    """
    # RMSE accumulators, one entry per trained model.
    train_rmses = np.array([])
    test_rmses = np.array([])
    # Prediction accumulators for the train / test / dataset_testing sets.
    preds_train_trains, preds_train_tests, preds_tests = [], [], []

    for embedding_dim in embedding_dims:
        for n_iter in n_iters:
            for batch_size in batch_sizes:
                for l2 in l2s:
                    for learning_rate in learning_rates:
                        # Human-readable tag for this combination; reused
                        # for logging and as the on-disk model name.
                        tag = "embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate)
                        # Use the GPU when available, otherwise the CPU.
                        model = ExplicitFactorizationModel(loss='regression',
                                                           embedding_dim=embedding_dim,  # latent dimensionality
                                                           n_iter=n_iter,                # number of training epochs
                                                           batch_size=batch_size,        # minibatch size
                                                           l2=l2,                        # strength of L2 regularization
                                                           learning_rate=learning_rate,
                                                           use_cuda=torch.cuda.is_available())
                        print(tag)
                        model.fit(train, verbose=True)

                        # Evaluate on both splits and record the scores.
                        train_rmse = rmse_score(model, train)
                        test_rmse = rmse_score(model, test)
                        train_rmses = np.append(train_rmses, train_rmse)
                        test_rmses = np.append(test_rmses, test_rmse)
                        print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))

                        # Optionally persist this model to disk.
                        if is_save:
                            torch.save(model, "models/" + tag)

                        # Predict all three datasets with the fresh model.
                        preds_train_trains.append(model.predict(train.user_ids, train.item_ids))
                        preds_train_tests.append(model.predict(test.user_ids, test.item_ids))
                        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))

    return preds_train_trains, preds_train_tests, preds_tests, train_rmses, test_rmses
def train_spotlight_models_using_all_data(train, dataset_testing, embedding_dims, n_iters, batch_sizes, l2s, learning_rates, verbose=True, is_save=True):
    """Train ExplicitFactorizationModel on the full training data for every
    hyper-parameter combination and predict `dataset_testing` with each.

    train / dataset_testing are spotlight Interactions objects.  Each
    fitted model is saved under models_all_data/ unless is_save=False —
    the keyword is new, with a default of True that preserves the
    original always-save behavior and makes this function consistent
    with train_spotlight_models's is_save flag.

    Returns the list of prediction arrays, one per combination.
    """
    # Predictions on dataset_testing, one entry per trained model.
    preds_tests = []

    for embedding_dim in embedding_dims:
        for n_iter in n_iters:
            for batch_size in batch_sizes:
                for l2 in l2s:
                    for learning_rate in learning_rates:
                        # Human-readable tag for this combination; reused
                        # for logging and as the on-disk model name.
                        tag = "embedding_dim={}, n_iter={}, batch_size={}, l2={}, learning_rate={}".format(embedding_dim, n_iter, batch_size, l2, learning_rate)
                        # Use the GPU when available, otherwise the CPU.
                        model = ExplicitFactorizationModel(loss='regression',
                                                           embedding_dim=embedding_dim,  # latent dimensionality
                                                           n_iter=n_iter,                # number of training epochs
                                                           batch_size=batch_size,        # minibatch size
                                                           l2=l2,                        # strength of L2 regularization
                                                           learning_rate=learning_rate,
                                                           use_cuda=torch.cuda.is_available())
                        if verbose:
                            print(tag)

                        # Fit on the whole training set (no hold-out here).
                        model.fit(train, verbose=verbose)

                        preds_tests.append(model.predict(dataset_testing.user_ids, dataset_testing.item_ids))

                        # Persist the model to disk unless disabled.
                        if is_save:
                            torch.save(model, "models_all_data/" + tag)

    return preds_tests
# NOTE(review): this chunk begins mid-expression in the original file — the
# opening `model = ExplicitFactorizationModel(` line lives in an earlier,
# unseen chunk and is reconstructed here; confirm against the full source.
model = ExplicitFactorizationModel(
    embedding_dim=5,      # latent dimensionality
    n_iter=10,            # number of epochs of training
    batch_size=256,       # minibatch size
    l2=1e-9,              # strength of L2 regularization
    learning_rate=2e-2,
    use_cuda=torch.cuda.is_available())

# Implicit-feedback alternative tried previously:
# model = ImplicitFactorizationModel(loss='bpr',
#                                    embedding_dim=128,  # latent dimensionality
#                                    n_iter=10,          # number of epochs of training
#                                    batch_size=256,     # minibatch size
#                                    l2=1e-9,            # strength of L2 regularization
#                                    learning_rate=1e-2,
#                                    use_cuda=torch.cuda.is_available())

from spotlight.cross_validation import random_train_test_split

# Fixed RandomState so the train/test split is reproducible.
train, test = random_train_test_split(dataset, random_state=np.random.RandomState(42))
print('Split into \n {} and \n {}.'.format(train, test))

model.fit(train, verbose=True)
torch.save(model, 'spotlight.model')

from spotlight.evaluation import rmse_score

train_rmse = rmse_score(model, train)
test_rmse = rmse_score(model, test)
print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))

# Fraction of test interactions where the thresholded prediction agrees
# with a positive rating.
predictions = model.predict(test.user_ids, test.item_ids)
print(((predictions > 0.5) == (test.ratings > 0)).sum() / len(predictions))
# NOTE(review): `train` is expected to be an npz archive loaded in an
# earlier, unseen chunk (companion of data/loocv_test.npz) — confirm.
test = np.load('data/loocv_test.npz')

train_feat = train['train_feat'].astype('int64')
train_scor = train['train_scor'][:, None].astype('float32')
test_feat = test['test_feat'].astype('int64')
test_scor = test['test_scor'][:, None].astype('float32')

model = ExplicitFactorizationModel(
    loss='regression',
    embedding_dim=64,        # latent dimensionality
    n_iter=20,               # number of epochs of training
    batch_size=1024 * 4,     # minibatch size
    l2=1e-9,                 # strength of L2 regularization
    learning_rate=1e-3,
    use_cuda=torch.cuda.is_available())


def features(feat, scor):
    """Split a (user, item) feature matrix and a score column into the
    three flat arrays spotlight's Interactions expects."""
    return (feat[:, 0].astype('int64'),    # user ids
            feat[:, 1].astype('int64'),    # item ids
            scor[:, 0].astype('float32'))  # ratings


# Rebind `train` from the raw npz to a spotlight Interactions object.
train = Interactions(*features(train_feat, train_scor))
test_user, test_item, test_y = features(test_feat, test_scor)

model.fit(train, verbose=True)

# Leave-one-out RMSE on the held-out interactions.
pred_y = model.predict(test_user, test_item)
rmse = np.sqrt(((pred_y - test_y) ** 2.0).mean())
print(rmse)
full_movies = movie_ind.movie_int.unique()
recommendations = []
# Convert datetime to a string (millisecond precision) so the rows
# serialize cleanly.
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
batch_count = 0

for device, user_row in user_ind.iterrows():
    log.info("Generating recommendations for user {}".format(device))
    user = user_row.user_int
    # Movies this user has already voted on...
    user_votes = ratings_df[ratings_df.user_int == user].movie_int.unique()
    # ...so only score the ones they have not seen yet.
    m = np.setdiff1d(full_movies, user_votes)

    user_rank = 0
    # One recommendation row per (movie, predicted rating) pair.
    for movie, pred in zip(m, model.predict(user_ids=user, item_ids=m)):
        batch_count += 1
        user_rank += 1
        log.debug('...movie {}'.format(user_rank))
        recommendations.append({
            'user_id': device,
            'rank': user_rank,
            'movie_id': movie_ind[movie_ind.movie_int == movie].index[0],
            'pred_rating': float(pred),
            'pred_time': timestamp
        })
        # NOTE(review): the original chunk is truncated right after
        # 'pred_time' — the closing braces above are reconstructed; confirm
        # no further keys or statements followed in the full source.
def trainModelUntilOverfit(dataset, modelSteps, modelIterations, numberDataSplits, embedding_dim, learning_rate):
    """Repeatedly fit an ExplicitFactorizationModel, stopping early on overfit.

    dataset is a spotlight Interactions object; it is split 80/20 into
    train/test.  When numberDataSplits > 1 the training set is further
    partitioned (via dataSplit) and one partition is fitted per step.
    After every step the [train_rmse, test_rmse] pair is recorded;
    training stops once stopTraining() signals overfitting or all
    modelSteps * numberDataSplits steps have run.

    Returns (model, rmseResults) where rmseResults is a (steps, 2) array
    of [train_rmse, test_rmse] rows, truncated at the early-stop point.
    """
    train, test = random_train_test_split(dataset, 0.2)
    print('Split into \n {} and \n {}.'.format(train, test))

    # Fixed seed so the factorization is reproducible.
    seed = np.random.RandomState(seed=55555)
    model = ExplicitFactorizationModel(n_iter=modelIterations,
                                       embedding_dim=embedding_dim,
                                       learning_rate=learning_rate,
                                       random_state=seed)

    rmseResults = np.empty((modelSteps * numberDataSplits, 2))

    if (numberDataSplits > 1):
        arraySplits = dataSplit(train, numberDataSplits)
        print("Data set split into", len(arraySplits), "*", (arraySplits[1]))

    arrayOfSteps = []
    splitCounter = 0     # index of the split being fitted this step
    fullStepCounter = 0  # increases each time the entire data set has been visited
    for i in range(modelSteps * numberDataSplits):
        print("\nStarting step", fullStepCounter)
        print("Data split", splitCounter)
        if (numberDataSplits == 1):
            model.fit(train, verbose=True)
        elif (numberDataSplits > 1):
            print(arraySplits[splitCounter])
            model.fit(arraySplits[splitCounter], verbose=True)
        else:
            print("Invalid number of data splits")
            break

        # NOTE: the original also rebuilt a full (num_users, num_items)
        # prediction matrix here ("for tsne formatting") plus an unused
        # duplicate rmse, indexPreviousClosest and currentStep — all dead
        # code with no effect on the returned values, removed.

        # Measure how good the predictions are on both splits.
        rmseTrain = rmse_score(model, train)
        rmseTest = rmse_score(model, test)
        print("RMSE TEST:", rmseTest, "\n")
        rmseResults[i, :] = [rmseTrain, rmseTest]
        arrayOfSteps += [i]

        # Early stop once the RMSE trend indicates overfitting; drop the
        # unfilled tail of the results array.
        if (stopTraining(rmseResults, arrayOfSteps)):
            rmseResults = rmseResults[:len(arrayOfSteps)]
            break

        # Advance to the next split; wrap around once all have been seen.
        if (numberDataSplits > 1):
            splitCounter += 1
            if (splitCounter >= len(arraySplits)):
                splitCounter = 0
                fullStepCounter += 1

    return (model, rmseResults)
# Convert the interaction sets to fixed-length sequences.
train_seq = train.to_sequence(SEQ_LEN)
test_seq = test.to_sequence(SEQ_LEN)

model = ExplicitSequenceModel(n_iter=30,
                              representation='lstm',
                              batch_size=1)
model.fit(train_seq, ratings, verbose=True)

SEQ_ID = 0


def _report_sequence(seq_data):
    """Print predictions vs ground-truth ratings for sequence SEQ_ID."""
    user_batch = seq_data.user_ids[SEQ_ID]
    item_batch = seq_data.sequences[SEQ_ID]
    print('seq', item_batch)
    # Drop the zero padding before looking up ratings.
    item_batch = np.trim_zeros(item_batch)
    truth = np.array([
        ratings[u, i] for u, i in np.broadcast(user_batch, item_batch)
    ]).reshape(1, -1)
    pred = model.predict(item_batch, truth)
    print(pred)
    print(truth)


# Same report for one training sequence and one test sequence.
_report_sequence(train_seq)
_report_sequence(test_seq)