def train_and_save(data, save_to='db'):
    """Train an MF model on ``data`` and persist its reconstructed matrix.

    The model is built with fixed hyper-parameters (K=20, alpha=0.001,
    beta=0.01, iterations=800), trained, and its ``full_matrix()`` result
    is stored according to ``save_to``.

    Args:
        data: Rating matrix handed to ``MF``; only its ``shape`` attribute
            is read directly here (for the progress message).
        save_to: ``'pickle'`` writes the matrix with ``np.save`` under the
            name ``'NMF'`` (NumPy appends ``.npy``; despite the option name
            this is not a pickle file).  ``'db'`` passes the matrix to
            ``savetodb``.  Any other value stores nothing.

    Returns:
        Always ``0``.
    """
    started_at = time.time()
    print("> Training the NMF model over", data.shape, "items")

    model = MF(data, K=20, alpha=0.001, beta=0.01, iterations=800)
    model.train()
    saved_model = model.full_matrix()

    finished_at = time.time()
    print("> Elapsed Time to Train = ", finished_at - started_at)

    # The two targets are mutually exclusive string values.
    if save_to == 'pickle':
        np.save('NMF', saved_model)
    elif save_to == 'db':
        savetodb(saved_model)
    return 0
def worker(fold, n_users, n_items, dataset_dir):
    """Train and evaluate an MF model on one fold's train/test files.

    Reads ``ratings__<fold + 1>_tra.txt`` and ``ratings__<fold + 1>_tst.txt``
    from ``dataset_dir`` through ``loadSparseR``, turns each sparse matrix
    into an array of ``(user, item, rating)`` rows, and trains ``MF`` with a
    ``Sampler`` built on the training matrix.

    Relies on the module-level settings ``batch_size``, ``eval_metrics``,
    ``range_of_ratings``, ``reg`` and ``n_factors``.

    Args:
        fold: Fold index; ``fold + 1`` is used in the file names and passed
            to ``mf.train``, while the summary line prints ``fold`` itself.
        n_users: Forwarded to ``loadSparseR`` and ``MF``.
        n_items: Forwarded to ``loadSparseR`` and ``MF``.
        dataset_dir: Directory prefix.  File names are appended by plain
            string concatenation, so it must already end with a separator;
            its second-to-last ``'/'``-separated component is printed as the
            dataset label.

    Returns:
        Whatever ``mf.train`` returns (one score per entry of
        ``eval_metrics``, judging by how the summary line zips them).
    """

    def _as_triples(sparse):
        # One (user, item, rating) row per non-zero entry of the matrix.
        coords = np.asarray(sparse.nonzero()).T
        return np.array([(user, item, sparse[user, item])
                         for user, item in coords])

    fold_tag = str(fold + 1)

    # Training split.
    traFilePath = dataset_dir + 'ratings__' + fold_tag + '_tra.txt'
    trasR = loadSparseR(n_users, n_items, traFilePath)
    dataset_label = dataset_dir.split('/')[-2] + ':'
    nnz_per_row = '%.2f' % (trasR.nnz / float(trasR.shape[0]))
    print(dataset_label, trasR.shape, trasR.nnz, nnz_per_row)
    tra_tuple = _as_triples(trasR)

    # Test split.
    tstFilePath = dataset_dir + 'ratings__' + fold_tag + '_tst.txt'
    tstsR = loadSparseR(n_users, n_items, tstFilePath)
    tst_tuple = _as_triples(tstsR)

    sampler = Sampler(trasR=trasR, negRatio=.0, batch_size=batch_size)
    mf = MF(n_users, n_items, eval_metrics, range_of_ratings, reg,
            n_factors, batch_size)
    scores = mf.train(fold + 1, tra_tuple, tst_tuple, sampler)

    metric_names = ','.join('%s' % eval_metric for eval_metric in eval_metrics)
    metric_values = ','.join('%.6f' % score for score in scores)
    print('fold=%d:' % fold, metric_names, '=', metric_values)
    return scores
def matrix_factorization():
    """Train an MF model on the training CSVs and write a submission file.

    Steps:
      1. Load ``Data/data_train.csv`` (user/movie pairs) and
         ``Data/output_train.csv`` (labels) and stack them into triplets.
      2. Build the rating matrix with ``base.build_rating_matrix`` and train
         ``MF`` (K=30, alpha=1e-5, beta=0.02, iterations=2000).
      3. Dump the reconstructed matrix to ``predicted_matrix.txt`` with five
         decimals, then read it back.
      4. Look up a prediction for every row of ``Data/data_test.csv`` and
         pass the list to ``base.make_submission``.

    Predictions above 5.00 are capped at 5.00.
    NOTE(review): there is no matching lower bound -- confirm that values
    below the rating scale are acceptable in the submission.
    """
    prefix = 'Data/'

    # ------------------------------ Learning ------------------------------ #
    pairs_path = os.path.join(prefix, 'data_train.csv')
    training_user_movie_pairs = base.load_from_csv(pairs_path)
    labels_path = os.path.join(prefix, 'output_train.csv')
    training_labels = base.load_from_csv(labels_path)

    # Append the labels as a third column next to the (user, movie) pairs.
    label_column = training_labels.reshape((-1, 1))
    user_movie_rating_triplets = np.hstack(
        (training_user_movie_pairs, label_column))

    rating_matrix = base.build_rating_matrix(user_movie_rating_triplets)

    model = MF(rating_matrix, K=30, alpha=1e-5, beta=0.02, iterations=2000)
    with base.measure_time('Training'):
        print('Training matrix factorization...')
        model.train()

    # Persist the reconstructed matrix, one row per savetxt call.
    predicted_matrix = np.matrix(model.full_matrix())
    with open('predicted_matrix.txt', 'wb') as out_file:
        for matrix_row in predicted_matrix:
            np.savetxt(out_file, matrix_row, fmt='%.5f')

    # ------------- Submission: run the model on the test set ------------- #
    df = pd.read_csv("Data/data_test.csv")
    R = pd.read_csv('predicted_matrix.txt', sep=" ", header=None).values

    users = df['user_id'].values
    movies = df['movie_id'].values

    # Ids are shifted by one to index into the matrix.
    predicted = (R[u - 1][m - 1] for u, m in zip(users, movies))
    ratings = [5.00 if value > 5.00 else value for value in predicted]

    fname = base.make_submission(ratings, df.values.squeeze(),
                                 'MatrixFactorization')
    print('Submission file "{}" successfully written'.format(fname))
def doTrain(K, alpha, beta, gamma, iterations, maxError):
    """Train an MF model with the given hyper-parameters and pickle it.

    Reads ``ml-latest-small/ratings.csv`` to determine the maximum user and
    movie ids, builds the rating matrix from
    ``ml-latest-small/trainRatings.csv``, trains ``MF`` on it, prints the
    resulting MSE and serializes the trained model object to
    ``trainedModel.pkl``.

    Args:
        K, alpha, beta, gamma, iterations, maxError: Passed positionally, in
            this order, to ``MF`` after the rating matrix.
    """
    hyper_params = (
        ('K', K),
        ('alpha', alpha),
        ('beta', beta),
        ('gamma', gamma),
        ('iterations', iterations),
        ('maxError', maxError),
    )
    print('>>> ' + ', '.join(name + '=' + str(value)
                             for name, value in hyper_params))

    all_ratings_csv = 'ml-latest-small/ratings.csv'
    train_ratings_csv = 'ml-latest-small/trainRatings.csv'
    model_path = 'trainedModel.pkl'

    # The full rating file only supplies the matrix dimensions; the matrix
    # itself is filled from the training subset.
    all_ratings = readCsv(all_ratings_csv)
    train_subset = readCsv(train_ratings_csv)
    max_user_id, max_movie_id = getMaxIds(all_ratings)
    R = getRatingsMatrix(train_subset, max_user_id, max_movie_id)

    mf = MF(R, K, alpha, beta, gamma, iterations, maxError)
    print('Training...')
    mf.train()
    print('Done. Mse = ' + str(mf.get_mse()))

    print('Serializing model to ' + model_path)
    with open(model_path, 'wb') as model_file:
        pickle.dump(mf, model_file, pickle.HIGHEST_PROTOCOL)
    print('Done serializing model to ' + model_path)
import numpy as np
from mf import MF

# Toy rating matrix: one row per user (5 users), one column per item
# (4 items).  A zero marks a rating that is unknown.
R = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

# Factorize with 2 latent factors for 20 iterations.
mf = MF(R, K=2, alpha=0.1, beta=0.01, iterations=20)
training_process = mf.train()

# Show the learned user and item matrices, then the reconstructed matrix.
print(mf.P)
print(mf.Q)
print(mf.full_matrix())
index2userid = {y: x for x, y in userid2index.items()} index2itemid = {y: x for x, y in itemid2index.items()} nonzero_row, nonzero_col = rating_matrix.nonzero() inds = zip(nonzero_row.tolist(), nonzero_col.tolist()) import sys sys.path.append('../tpmrec/') from mf import MF mf = MF(rating_matrix, inds, 10, 0.0001, 0.01) mf.train(10) for userindex in range(1000): userid = index2userid[userindex] if len(userid2itemindexes[userid]) > 20: continue pr = mf.predict() user_predict = pr[userindex, :] top_item_indexes = np.argsort(user_predict)[::-1][:10] print "userid = ", userid for itemid in userid2itemindexes[userid]: print itemid, itemid2name[itemid] print "recommend item" for itemindex in top_item_indexes: itemid = index2itemid[itemindex] print itemid, itemid2name[itemid]
# NOTE(review): this chunk starts inside a method whose header is not
# visible (presumably of the MF class) and ends inside get_restaurants.
# Both truncated pieces are reproduced unchanged; only layout and comments
# were added.
        # Tail of the truncated method: append new_lf as extra row(s) of P
        # and print the new row count.
        self.P = np.vstack([self.P, new_lf])
        print('in MF')
        print(len(self.P))

    def remove_new_user(self):
        # Deletes the row at index 168 (axis 0) from P.
        # NOTE(review): 168 is hard-coded -- presumably the position of the
        # row appended by the method above; confirm against the data size.
        self.P = np.delete(self.P, 168, 0)

    def predict(self):
        # Matrix product of P with the transpose of Q.
        prediction_mat = np.matmul(self.P, self.Q.T)
        return prediction_mat


# Module-level setup: load the rating data and train the model at import.
rating_data = np.load('rating_data.npy')
# The same array is passed for both MF arguments.
mf = MF(rating_data, rating_data)
# train() returns two values here; both are discarded.
_, _ = mf.train(epoch=20, verbose=False)
num_total_rest = 80
# NOTE(review): the file object passed to pickle.load is never closed
# explicitly.
restaurants = pickle.load(open('restaurants.dict', 'rb'))


@app.route('/get_restaurants')
def get_restaurants():
    # NOTE(review): definition is cut off at the end of this chunk.
    # num_sample_rest is assigned but the loop below compares against the
    # literal 5.
    num_sample_rest = 5
    rand_ints = []
    # Collect 5 distinct random integers below num_total_rest.
    while len(rand_ints) < 5:
        rand_int = randrange(num_total_rest)
        if not rand_int in rand_ints:
            rand_ints.append(rand_int)
import pickle

from mf import MF
from recommender.utils.dataprocess import ratings, meanRatings

# Model dimensions are derived from the rating table.
# NOTE(review): movieCount adds 1 to the maximum index while userCount uses
# the maximum id as-is -- presumably userId is 1-based and movieIndex is
# 0-based; confirm against MF's expectations.
userCount = ratings['userId'].max()
movieCount = ratings['movieIndex'].max() + 1

mf = MF(movieCount, userCount, meanRatings,
        alpha=0.01, reg=0.01, iterations=20, K=20)
mf.train(ratings)

# Serialize the trained model.  The context manager closes (and therefore
# flushes) the file deterministically; the previous bare open() call left
# that to garbage collection.
with open('pkl/mfModel.pkl', 'wb') as model_file:
    pickle.dump(mf, model_file)
import pandas as pd
import numpy as np
from mf import MF

# Work on the first 10000 rows of the training file only.
df_train = pd.read_csv('all/train.csv')[:10000]

# User x Track table of ratings; pairs without a rating become 0.
ratings_wide = df_train.pivot(index='User', columns='Track', values='Rating')
R = np.array(ratings_wide.fillna(0))

# Factorize with 20 latent factors for 100 iterations.
d_mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=100)
training_process = d_mf.train()

# Print the reconstructed matrix, framed by blank lines.
print()
print("P x Q:")
print(d_mf.full_matrix())
print()