def __init__(self):
    """Load the trained RSVD model and its companion data, then build sub-models.

    Everything is read from fixed relative paths (``objects/`` and
    ``data_movilens1m/``), so those directories must be reachable from the
    current working directory.

    In addition to the loaded artefacts, two reduced views are prepared
    (the original comments say they feed a PCA):

    * ``_subModel`` / ``_subMovieMean``: rows of ``model.u`` and of
      ``_movieMean`` whose index is in ``_mostRated`` (the most rated
      movies, requested with 2000 as second argument).
    * ``_subModel2`` / ``_subMovieMean2``: rows of ``model.v`` and of
      ``_userMean`` whose index is in ``_mostRating`` (the most active
      users, as returned by ``functions.getNMostRatingUsers``).
    """
    self._model = RSVD.load("objects/")
    self._original_movieIDs = np.load("objects/original_movieIDs")
    self._ratings = np.load("objects/ratings")
    self._moviesInformations = functions.loadMoviesInformations('data_movilens1m/movies.dat')
    self._usersInformations = functions.loadUsersInformations('data_movilens1m/users.dat')
    self._dissim = functions.getSimilaritiesFromModel(self._model.u)
    self._movieMean = np.load("objects/movieMean")
    self._userMean = np.load("objects/userMean")

    def _pick_rows(factors, means, wanted):
        """Copy the rows of ``factors``/``means`` whose index is in ``wanted``."""
        # Build the lookup once so each membership test is O(1) instead of
        # scanning ``wanted`` for every row of ``factors``.
        # Assumes ``wanted`` is a flat collection of hashable row indices
        # -- TODO confirm against functions.getNMost*.
        wanted_set = set(wanted)
        # Output arrays are sized from len(wanted). Rows that are never
        # filled (indices absent from ``factors``) keep np.empty's arbitrary
        # contents, exactly as before.
        # NOTE (translated from the original): the sub-model must have the
        # same type as the model.
        picked_factors = np.empty((len(wanted), factors.shape[1]))
        # Two columns per row -- presumably each mean entry is a pair;
        # TODO confirm against the files in objects/.
        picked_means = np.empty((len(wanted), 2))
        filled = 0
        for idx, row in enumerate(factors):
            if idx in wanted_set:
                picked_means[filled] = means[idx]
                picked_factors[filled] = row
                filled += 1
        return picked_factors, picked_means

    # PCA input for the most rated movies.
    self._mostRated = functions.getNMostRatedMovies(self._ratings, 2000)
    self._subModel, self._subMovieMean = _pick_rows(
        self._model.u, self._movieMean, self._mostRated)

    # PCA input for the users who rated the most (the original comment here
    # was a copy of the movie one).
    self._mostRating = functions.getNMostRatingUsers(self._ratings)
    self._subModel2, self._subMovieMean2 = _pick_rows(
        self._model.v, self._userMean, self._mostRating)
def makeModel(itemIdMap, userIdMap, ratings, factors=20, learnRate=0.001, regularization=0.011): """ Makes a RSVD model from ratings """ ratings = np.array(ratings, rating_t) np.random.shuffle(ratings) n = int(ratings.shape[0] * 0.8) # train = ratings[:n] test = ratings[n:] # v = int(train.shape[0] * 0.9) # val = train[v:] # train = train[:v] # Increasing training data v = int(ratings.shape[0] * 0.9) val = ratings[v:] train = ratings[:v] dims = (len(itemIdMap), len(userIdMap)) model = RSVD.train(factors, train, dims, probeArray=val, learnRate=learnRate, regularization=regularization, maxEpochs=1000) sqerr=0.0 for itemID,userID,rating in test: err = rating - model(itemID,userID) sqerr += err * err sqerr /= test.shape[0] print "Test RMSE: ", np.sqrt(sqerr) return model
def findFactorsAndErrors(ratingsDataset, factorValues=None):
    """Sweep the number of latent factors and record the test RMSE of each.

    The ratings are split in the order returned by
    ``ratingsDataset.ratings()`` (no shuffling is done here): the first 80%
    form the training part, the last 20% the test set, and the last 10% of
    the training part is carved out as the validation (probe) set.

    Parameters:
        ratingsDataset: object exposing ``ratings()``, ``movieIDs()`` and
            ``userIDs()``, each returning an array.
        factorValues: iterable of factor counts to try. Defaults to
            ``range(1, 100)``, the sweep that used to be hard-coded.

    Returns:
        dict with two parallel lists: ``'factors'`` (the factor counts
        tried) and ``'errors'`` (the matching test RMSE values). The best
        factor is not computed here; the caller can take the entry of
        ``'factors'`` at the position of the smallest value in ``'errors'``.

    Raises:
        ZeroDivisionError: if the test set is empty and at least one factor
            value is evaluated.
    """
    if factorValues is None:
        factorValues = range(1, 100)

    ratings = ratingsDataset.ratings()

    # Create train, validation and test sets.
    n = int(ratings.shape[0] * 0.8)
    train = ratings[:n]
    test = ratings[n:]
    v = int(train.shape[0] * 0.9)
    val = train[v:]
    train = train[:v]

    dims = (ratingsDataset.movieIDs().shape[0], ratingsDataset.userIDs().shape[0])

    factors = []
    errors = []
    # Only maxEpochs is raised from the RSVD.train default listed by the
    # original author (100); regularization=0.011 matches that default.
    # Original note (translated): lambda_f must not exceed 1.
    for factor in factorValues:
        model = RSVD.train(factor, train, dims, probeArray=val,
                           maxEpochs=1000, regularization=0.011)
        sqerr = 0.0
        for movieID, userID, rating in test:
            err = rating - model(movieID, userID)
            sqerr += err * err
        sqerr /= test.shape[0]
        factors.append(factor)
        errors.append(np.sqrt(sqerr))

    return {'factors': factors, 'errors': errors}
dataset = MovieLensDataset.loadDat("data/movielense/ratings.dat") ratings = dataset.ratings() # make sure that the ratings a properly shuffled np.random.shuffle(ratings) # create train, validation and test sets. n = int(ratings.shape[0] * 0.8) train = ratings[:n] test = ratings[n:] v = int(train.shape[0] * 0.9) val = train[v:] train = train[:v] from rsvd import RSVD dims = (dataset.movieIDs().shape[0], dataset.userIDs().shape[0]) model = RSVD.train(20, train, dims, probeArray=val, maxEpochs=100, learnRate=0.0005, regularization=0.005) sqerr = 0.0 for movieID, userID, rating in test: err = rating - model(movieID, userID) sqerr += err * err sqerr /= test.shape[0] print "Test RMSE: ", np.sqrt(sqerr) import IPython IPython.embed()
np.random.shuffle(ratings) # create train, validation and test sets. n = int(ratings.shape[0] * 0.8) train = ratings[:n] test = ratings[n:] v = int(train.shape[0] * 0.9) val = train[v:] train = train[:v] from rsvd import RSVD dims = (dataset.movieIDs().shape[0], dataset.userIDs().shape[0]) model = RSVD.train(20, train, dims, probeArray=val, maxEpochs=100, learnRate=0.0005, regularization=0.005) sqerr = 0.0 for movieID, userID, rating in test: err = rating - model(movieID, userID) sqerr += err * err sqerr /= test.shape[0] print "Test RMSE: ", np.sqrt(sqerr) import IPython IPython.embed()
""" Test ---- This is a test """ #log# Automatic Logger file. *** THIS MUST BE THE FIRST LINE *** #log# DO NOT CHANGE THIS LINE OR THE TWO BELOW #log# opts = Struct({'__allownew': True, 'logfile': 'ipython_log.py'}) #log# args = [] #log# It is safe to make manual edits below here. #log#----------------------------------------------------------------------- import numpy as np from rsvd import RSVD print 'load ratings' ratings = np.load('data/ratings_float.arr') probeRatings = np.load('data/probe_ratings_float.arr') model = RSVD.train(20, ratings, (17770, 480189), probeRatings, 100, randomize=False) print "model trained..." model.save("models/t_20_001_011_100")
train = ratings[:n] test = ratings[n:] v = int(train.shape[0]*0.9) val = train[v:] train = train[:v] dims = (ratingsDataset.movieIDs().shape[0], ratingsDataset.userIDs().shape[0]) factor = 40 lambdas = [] errors = [] # lambda_f ne doit pas depasser 1 # maxEpochs = 1000 for lambda_f in np.arange(0.0, 0.05, 0.0005): model = RSVD.train(factor, train, dims, probeArray=val, maxEpochs = 1000, regularization=lambda_f) sqerr=0.0 for movieID,userID,rating in test: err = rating - model(movieID,userID) sqerr += err * err sqerr /= test.shape[0] print "-------------------------------------------------" print "Pour lambda = ",lambda_f, " Test RMSE: ", np.sqrt(sqerr) print "-------------------------------------------------" lambdas.append(lambda_f) errors.append(np.sqrt(sqerr)) # print the lamdas and errors vectors
print "ok" with open(ARTISTS_LUT_PATH, 'w') as outfile: json.dump(ARTISTS_LUT, outfile) with open(INV_ARTISTS_LUT_PATH, 'w') as outfile: json.dump(INV_ARTISTS_LUT, outfile) else: with open(ARTISTS_LUT_PATH,'r') as jsonStream: ARTISTS_LUT = json.load(jsonStream) with open(INV_ARTISTS_LUT_PATH,'r') as jsonStream: INV_ARTISTS_LUT = json.load(jsonStream) f_corpus = formatData(CORPUS,LEXICON,PLAYLISTS_LUT,ARTISTS_LUT) CORPUS.clear() #no need to keep corpus in memory print "ok" print "\nNumber of playlists:",len(PLAYLISTS_LUT),"\nNumber of artists:",len(ARTISTS_LUT) ### Split the dataset print "\nGenerating the training/validation and test set ...", sys.stdout.flush() trainSet,validSet,testSet = splitDataset(f_corpus) print "ok" ### train our MF model """prototype: train(factors,ratingsArray,dims,probeArray=None,\ maxEpochs=100,minImprovement=0.000001,\ learnRate=0.001,regularization=0.011,\ randomize=False, randomNoise=0.005)""" model = RSVD.train(25,trainSet,(len(ARTISTS_LUT),len(PLAYLISTS_LUT)),validSet,learnRate=0.001,regularization=0.00000,minImprovement=0.0000001,maxEpochs=700) model.save("./")
f_testing = open(TESTING_FILENAME, 'r') f_out = open(OUTPUT_FILENAME + fn, 'w') print "Making %d predictions..." % NUM_TESTING start_time = time.time() i = 0 j = 0 for line in f_testing: user, movie, date = line.strip().split() f_out.write(str(model(int(user), int(movie))) + '\n') i += 1 if i % (NUM_TESTING / INCR) == 0: j += 100.0 / INCR sys.stdout.write("\r%.1f%% done (elapsed time: %f s)." % (j, time.time() - start_time)) sys.stdout.flush() f_testing.close() print "Predictions complete (elapsed time: %f s)." % (time.time() - start_time) f_out.close() if __name__ == '__main__': print "Importing ratings..." ratings = np.fromfile(TRAINING_FILENAME, dtype=rating_t) print "Importing probes..." probeRatings = np.fromfile(PROBE_FILENAME, dtype=rating_t) print "running svd..." model = RSVD.train(10, ratings, (17770, 458292), probeRatings) print "predicting..." predict(model)
def predict(model, fn=""):
    """Write one prediction per line of the testing file.

    Each line of ``TESTING_FILENAME`` must hold three whitespace-separated
    fields (user, movie, date). ``model(int(user), int(movie))`` is written,
    one value per line, to ``OUTPUT_FILENAME + fn``. Progress is reported on
    stdout roughly ``INCR`` times over ``NUM_TESTING`` lines.

    Parameters:
        model: callable taking ``(user, movie)`` as integers and returning
            the predicted rating.
        fn: suffix appended to ``OUTPUT_FILENAME``.

    Raises:
        ValueError: if a line does not contain exactly three fields or the
            user/movie fields are not integers.
    """
    # Both files are closed even if a malformed line or the model raises.
    with open(TESTING_FILENAME, "r") as f_testing:
        with open(OUTPUT_FILENAME + fn, "w") as f_out:
            print("Making %d predictions..." % NUM_TESTING)
            start_time = time.time()
            # Number of lines between two progress reports. Clamped to 1 so
            # that NUM_TESTING < INCR no longer causes a modulo by zero.
            # Assumes NUM_TESTING and INCR are integers -- TODO confirm.
            step = max(1, NUM_TESTING // INCR)
            i = 0
            j = 0
            for line in f_testing:
                # The date field is parsed but not used by the model.
                user, movie, _date = line.strip().split()
                f_out.write(str(model(int(user), int(movie))) + "\n")
                i += 1
                if i % step == 0:
                    j += 100.0 / INCR
                    sys.stdout.write("\r%.1f%% done (elapsed time: %f s)." % (j, time.time() - start_time))
                    sys.stdout.flush()
    print("Predictions complete (elapsed time: %f s)." % (time.time() - start_time))


if __name__ == "__main__":
    # Training and probe ratings are raw binary dumps of ``rating_t`` records.
    print("Importing ratings...")
    ratings = np.fromfile(TRAINING_FILENAME, dtype=rating_t)
    print("Importing probes...")
    probeRatings = np.fromfile(PROBE_FILENAME, dtype=rating_t)
    print("running svd...")
    # 10 is the first argument to RSVD.train (presumably the factor count
    # -- TODO confirm); the dimensions are hard-coded for this data set.
    model = RSVD.train(10, ratings, (17770, 458292), probeRatings)
    print("predicting...")
    predict(model)