def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
    """Export tastings, factorize them with SVD and save the model.

    :param tastings: data handed to
        ``save_tastings_to_movielens_format_file``.
    :param k: forwarded to ``SVD.compute`` as ``k``.
    :param min_values: forwarded to ``SVD.compute`` as ``min_values``.
    :param verbose: when truthy, turns ``recsys.algorithm.VERBOSE`` on
        (this method never turns it back off).
    :return: the ``SVD`` instance after ``compute`` has run.
    """
    # The SVD loader reads from a file, so write the tastings out in
    # Movielens format first.
    self.save_tastings_to_movielens_format_file(tastings)

    if verbose:
        # Useful for logging/testing runs.
        recsys.algorithm.VERBOSE = True

    factorizer = SVD()

    movielens_path = self.file_location(self.tastings_movielens_format)
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    factorizer.load_data(filename=movielens_path, sep='::',
                         format=column_layout)

    # compute() is also given the output location through ``savefile``.
    model_path = self.file_location(self.tastings_recsys_svd)
    factorizer.compute(
        k=k,
        min_values=min_values,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
        savefile=model_path,
    )
    return factorizer
def SVDloadData(dat_file='/home/commons/RecSys/MOVIEDATA/MOVIEDATA/ml-1m/ratings.dat'):
    """Load a '::'-separated ratings file into a fresh SVD object.

    :param dat_file: path of the ratings file. Defaults to the location that
        used to be hard-coded, so existing no-argument callers are unaffected.
    :return: the ``SVD`` instance with data loaded (``compute`` is not called).
    """
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    # File columns: 0 -> matrix column ids, 1 -> matrix row ids, 2 -> value.
    svd.load_data(filename=dat_file, sep='::',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    print(svd.get_matrix())
    return svd
def recommend(dimension=100):
    """Compute an SVD over ``rating.dat`` and display the games recommended.

    Recommendations are requested for column id ``1`` (``is_row=False``).
    Every game whose ``success`` flag equals 1 is shown, highest ``rec``
    first: its image is downloaded and displayed, then its name is printed.

    :param dimension: value passed to ``SVD.compute`` as ``k``.
    """
    svd = SVD()
    svd.load_data(filename='rating.dat', sep='\t',
                  format={'col': 2, 'row': 1, 'value': 0, 'ids': int})
    svd.compute(k=dimension, min_values=1, pre_normalize=None,
                mean_center=True, post_normalize=True)

    # Collect games in a list rather than a dict keyed by ``rec``: with the
    # dict, two games sharing the same score silently overwrote each other.
    games = []
    for item in svd.recommend(1, is_row=False):
        game = Game(item[0])
        if game.success == 1:
            # Snapshot the fields now, as the original dict values did.
            games.append((game.rec, game.appid, game.genre, game.name, game.img))

    # Stable sort: equally scored games keep the recommender's order.
    games.sort(key=lambda entry: entry[0], reverse=True)

    print("Games Recommended:")
    for _rec, _appid, _genre, name, img in games:
        # image (the same local file name is reused for every download)
        urllib.urlretrieve(img, "local-filename.jpg")
        image = plt.imread("local-filename.jpg")
        plt.imshow(image)
        plt.show()
        # name
        print(name)
def setup():
    """Populate the module globals ``items``, ``users`` and ``svd``.

    ``items`` comes from ``_read_items`` on ``movies.dat``, ``users`` starts
    as an empty list and ``svd`` is an ``SVD`` with ``ratings.dat`` loaded.
    Both files are looked up under ``MOVIELENS_DATA_PATH``.
    """
    global users, items, svd

    print('Reading items...')
    movies_path = os.path.join(MOVIELENS_DATA_PATH, 'movies.dat')
    items = _read_items(movies_path)
    users = []

    svd = SVD()
    ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(
        filename=os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat'),
        sep='::',
        format=ratings_layout,
    )
def getSVD():
    """Return the movielens SVD model, reusing the saved one when present.

    If ``./model/movielens.zip`` exists the model is loaded from it;
    otherwise the ratings are loaded, the SVD is computed with ``k=100`` and
    the result is saved under the same ``./model/movielens`` prefix.

    The existence check used to test an absolute path inside one user's home
    directory while loading/saving used the relative prefix, so on any other
    machine the cached model was never found. Both now share one prefix.

    :return: an ``SVD`` instance (loaded from disk or freshly computed).
    """
    model_prefix = './model/movielens'
    # The original check looked for 'movielens.zip', i.e. the saved model
    # lives at the prefix plus a '.zip' suffix.
    if os.path.exists(model_prefix + '.zip'):
        return SVD(model_prefix)

    svd = SVD()
    svd.load_data(filename='./data/movielens/ratings.dat', sep='::',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    k = 100
    svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True,
                post_normalize=True, savefile=model_prefix)
    return svd
def calculate_SVD_features():
    """Factorize ``feature_matrix.csv`` and return the computed SVD.

    Turns ``recsys.algorithm.VERBOSE`` on as a side effect.
    """
    print("Thanks for input, calculating...")

    model = SVD()
    recsys.algorithm.VERBOSE = True

    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    model.load_data(filename='feature_matrix.csv', sep=',', format=layout)

    compute_options = {
        'k': 100,
        'min_values': 0,
        'pre_normalize': None,
        'mean_center': False,
        'post_normalize': True,
    }
    model.compute(**compute_options)
    return model
def calculate_SVD_users():
    """Factorize ``user_data_working.csv`` and return the computed SVD.

    After computing, ``user_data_original.csv`` is copied over the working
    file. ``recsys.algorithm.VERBOSE`` is turned on as a side effect.
    """
    print("Thanks for input, calculating...")

    model = SVD()
    recsys.algorithm.VERBOSE = True

    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    model.load_data(filename='user_data_working.csv', sep=',', format=layout)

    compute_options = {
        'k': 100,
        'min_values': 2,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': True,
    }
    model.compute(**compute_options)

    # Overwrite the working copy with the original data.
    shutil.copy('user_data_original.csv', 'user_data_working.csv')
    return model
def get_model(model_name, datasource_name, start, end, model_params):
    """Register ``model_name`` and make sure an SVD for it is in ``models``.

    The model's settings are recorded in ``model_data`` the first time the
    name is seen. If no file exists at ``model_dir + model_name`` a new SVD
    is created from ``data_dir + datasource_name + '.csv'`` (replacing any
    cached entry); otherwise the saved model is loaded once and cached.
    Nothing is returned.
    """
    if model_name not in model_data:
        model_data[model_name] = (datasource_name, start, end, model_params)

    model_path = model_dir + model_name
    if os.path.exists(model_path):
        # A saved model exists: load it only if it is not cached yet.
        if model_name not in models:
            models[model_name] = SVD(filename=model_path)
        return

    # No saved model: initialize one with new data.
    fresh = SVD()
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    fresh.load_data(filename=data_dir + datasource_name + '.csv', sep=',',
                    format=layout)
    models[model_name] = fresh
def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
    """Export tastings, factorize them with SVD and save the model.

    :param tastings: data handed to
        ``save_tastings_to_movielens_format_file``.
    :param k: forwarded to ``SVD.compute`` as ``k``.
    :param min_values: forwarded to ``SVD.compute`` as ``min_values``.
    :param verbose: when truthy, turns ``recsys.algorithm.VERBOSE`` on
        (this method never turns it back off).
    :return: the ``SVD`` instance after ``compute`` has run.
    """
    # The SVD loader reads from a file, so write the tastings out in
    # Movielens format first.
    self.save_tastings_to_movielens_format_file(tastings)

    if verbose:
        # Useful for logging/testing runs.
        recsys.algorithm.VERBOSE = True

    factorizer = SVD()

    movielens_path = self.file_location(self.tastings_movielens_format)
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    factorizer.load_data(filename=movielens_path, sep='::',
                         format=column_layout)

    # compute() is also given the output location through ``savefile``.
    model_path = self.file_location(self.tastings_recsys_svd)
    factorizer.compute(
        k=k,
        min_values=min_values,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
        savefile=model_path,
    )
    return factorizer
def SVDloadData(dat_file='/home/commons/RecSys/MOVIEDATA/MOVIEDATA/ml-1m/ratings.dat'):
    """Load a '::'-separated ratings file into a fresh SVD object.

    :param dat_file: path of the ratings file. Defaults to the location that
        used to be hard-coded, so existing no-argument callers are unaffected.
    :return: the ``SVD`` instance with data loaded (``compute`` is not called).
    """
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    # File columns: 0 -> matrix column ids, 1 -> matrix row ids, 2 -> value.
    svd.load_data(filename=dat_file, sep='::',
                  format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    print(svd.get_matrix())
    return svd
def setup():
    """Populate the module globals ``items``, ``users`` and ``svd``.

    ``items`` comes from ``_read_items`` on ``movies.dat``, ``users`` starts
    as an empty list and ``svd`` is an ``SVD`` with ``ratings.dat`` loaded.
    Both files are looked up under ``MOVIELENS_DATA_PATH``.
    """
    global users, items, svd

    print('Reading items...')
    movies_path = os.path.join(MOVIELENS_DATA_PATH, 'movies.dat')
    items = _read_items(movies_path)
    users = []

    svd = SVD()
    ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(
        filename=os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat'),
        sep='::',
        format=ratings_layout,
    )
def Compute():
    """Factorize ``./ml-1m/ratings.dat`` and save the model to ``./mvsvd``.

    Nothing is returned; the only result is the file written via ``savefile``.
    """
    ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    factorizer = SVD()
    factorizer.load_data(filename='./ml-1m/ratings.dat', sep='::',
                         format=ratings_layout)
    factorizer.compute(
        k=100,
        min_values=10,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
        savefile='./mvsvd',
    )
def quickstart():
    """Walk through loading, factorizing and querying the ml-1m ratings.

    Loads ``DATA_DIR + 'ml-1m-ratings.dat'``, computes an SVD with ``k=100``,
    drops into the debugger, then prints similar items, a predicted versus
    stored rating, and the recommendation list for item 1.
    """
    model = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    ratings_path = DATA_DIR + 'ml-1m-ratings.dat'
    model.load_data(
        filename=ratings_path,
        sep='::',
        format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
    )

    # compute svd
    model.compute(k=100, min_values=10, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    # Debugger breakpoint kept from the original walkthrough.
    pdb.set_trace()

    # movie id's
    ITEMID1 = 1  # toy story
    ITEMID2 = 1221  # godfather II

    # get movies similar to toy story
    print(model.similar(ITEMID1))

    # get predicted rating for given user & movie
    MIN_RATING, MAX_RATING = 0.0, 5.0
    USERID, ITEMID = 1, 1

    # predicted rating for user1 and item1, mapped onto min/max
    predicted = model.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
    stored = model.get_matrix().value(ITEMID, USERID)
    print('predicted rating = {0}'.format(predicted))
    print('actual rating = {0}'.format(stored))

    print('which users should see Toy Story?:')
    print(model.recommend(ITEMID))
def get_model(model_name, datasource_name, start, end, model_params):
    """Register ``model_name`` and make sure an SVD for it is in ``models``.

    The model's settings are recorded in ``model_data`` the first time the
    name is seen. If no file exists at ``model_dir + model_name`` a new SVD
    is created from ``data_dir + datasource_name + '.csv'`` (replacing any
    cached entry); otherwise the saved model is loaded once and cached.
    Nothing is returned.
    """
    if model_name not in model_data:
        model_data[model_name] = (datasource_name, start, end, model_params)

    model_path = model_dir + model_name
    if os.path.exists(model_path):
        # A saved model exists: load it only if it is not cached yet.
        if model_name not in models:
            models[model_name] = SVD(filename=model_path)
        return

    # No saved model: initialize one with new data.
    fresh = SVD()
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    fresh.load_data(filename=data_dir + datasource_name + '.csv', sep=',',
                    format=layout)
    models[model_name] = fresh
def calculate_SVD_features():
    """Factorize ``feature_matrix.csv`` and return the computed SVD.

    Turns ``recsys.algorithm.VERBOSE`` on as a side effect.
    """
    print("Thanks for input, calculating...")

    model = SVD()
    recsys.algorithm.VERBOSE = True

    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    model.load_data(filename='feature_matrix.csv', sep=',', format=layout)

    compute_options = {
        'k': 100,
        'min_values': 0,
        'pre_normalize': None,
        'mean_center': False,
        'post_normalize': True,
    }
    model.compute(**compute_options)
    return model
def compute(aws_region, s3_bucket, filename, sep, col_index, row_index,
            value_index, ids_type):
    """Download a data file, factorize it and save the recommender model.

    The file is fetched with ``download_from_s3`` and read from
    ``./data/<filename>``; ``k`` is chosen by ``derive_latent_dimensions``
    with ``energy_level=0.6`` and the model is saved to
    ``./models/recommender``. The function ends by calling ``sys.exit()``,
    so it does not return to the caller.

    :param aws_region: forwarded to ``download_from_s3``.
    :param s3_bucket: forwarded to ``download_from_s3``.
    :param filename: name of the data file (also forwarded to the download).
    :param sep: field separator of the data file.
    :param col_index: file column holding matrix column ids (cast to int).
    :param row_index: file column holding matrix row ids (cast to int).
    :param value_index: file column holding the values (cast to int).
    :param ids_type: passed as the ``'ids'`` entry of the load format.
    """
    download_from_s3(aws_region, s3_bucket, filename)

    factorizer = SVD()
    print('Loading data to SVD module')
    layout = {
        'col': int(col_index),
        'row': int(row_index),
        'value': int(value_index),
        'ids': ids_type,
    }
    factorizer.load_data(filename='./data/' + filename, sep=sep, format=layout)

    latent_dims = derive_latent_dimensions(factorizer, energy_level=0.6)

    started_at = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    print(' '.join(['Stating to compute SVD at ', started_at]))
    factorizer.compute(k=latent_dims, min_values=10, pre_normalize=None,
                       mean_center=True, post_normalize=True,
                       savefile='./models/recommender')
    finished_at = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    print(' '.join(["SVD model saved at ", finished_at]))

    # to make sure that process finishes at the end
    sys.exit()
def calculate_SVD_users():
    """Factorize ``user_data_working.csv`` and return the computed SVD.

    After computing, ``user_data_original.csv`` is copied over the working
    file. ``recsys.algorithm.VERBOSE`` is turned on as a side effect.
    """
    print("Thanks for input, calculating...")

    model = SVD()
    recsys.algorithm.VERBOSE = True

    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    model.load_data(filename='user_data_working.csv', sep=',', format=layout)

    compute_options = {
        'k': 100,
        'min_values': 2,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': True,
    }
    model.compute(**compute_options)

    # Overwrite the working copy with the original data.
    shutil.copy('user_data_original.csv', 'user_data_working.csv')
    return model
def loadSVD():
    """Factorize ``favRate.dat`` and print the offerings similar to one id.

    Loads the '::'-separated ratings, writes them back out to ``svd.dat``,
    computes an SVD with ``k=20`` and prints the sparse matrix. It then looks
    up the 10 entries most similar to id ``897346`` and prints each id, its
    title from ``swoffering.yaml`` and its similarity score.
    """
    filename = 'favRate.dat'
    svd = SVD()
    svd.load_data(filename=filename, sep='::',
                  format={'col': 0, 'row': 1, 'value': 2})
    svd.save_data("svd.dat", False)

    K = 20
    svd.compute(k=K, min_values=1, pre_normalize="rows", mean_center=False,
                post_normalize=True, savefile='.')
    #svd.recommend(USERID, n=10, only_unknowns=True, is_row=False)

    sparse_matrix = svd.get_matrix()
    # The result is not used below; the call is kept because it may have
    # side effects inside the library -- TODO confirm and drop if not.
    sim_matrix = svd.get_matrix_similarity()
    print(sparse_matrix)
    #print sim_matrix

    #1173893,1396943
    sim = svd.similar(897346, 10)

    filename = 'swoffering.yaml'
    # The stream used to be opened with ``file()`` and never closed; the
    # context manager releases the handle even if parsing fails.
    with open(filename, 'r') as titleStream:
        # SECURITY NOTE(review): yaml.load can construct arbitrary objects;
        # switch to yaml.safe_load if this file may come from outside.
        titleList = yaml.load(titleStream)

    #print sim
    for offid, similar in sim:
        print('%s %s %s' % (offid, titleList[str(offid)], similar))
def quickstart():
    """Walk through loading, factorizing and querying the ml-1m ratings.

    Loads ``ml-1m/ratings.dat``, computes an SVD with ``k=100``, drops into
    the debugger, then queries similar items, prints a predicted versus
    stored rating, and requests the recommendation list for item 1. The
    results of ``similar`` and ``recommend`` are discarded.
    """
    model = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    ratings_path = 'ml-1m/ratings.dat'
    model.load_data(
        filename=ratings_path,
        sep='::',
        format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
    )

    # compute svd
    model.compute(k=100, min_values=10, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    # Debugger breakpoint kept from the original walkthrough.
    pdb.set_trace()

    # movie id's
    ITEMID1 = 1  # toy story
    ITEMID2 = 1221  # godfather II

    # get movies similar to toy story
    model.similar(ITEMID1)

    # get predicted rating for given user & movie
    MIN_RATING, MAX_RATING = 0.0, 5.0
    USERID, ITEMID = 1, 1

    # get predicted rating
    predicted = model.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
    stored = model.get_matrix().value(ITEMID, USERID)
    print('predicted rating = {0}'.format(predicted))
    print('actual rating = {0}'.format(stored))

    # which users should see Toy Story?
    model.recommend(ITEMID)
def test_load_pickle():
    """Loading the pickled ratings matrix must leave a ``Data`` object in the SVD."""
    pickle_path = os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.pickle')
    factorizer = SVD()
    factorizer.load_data(pickle_path, pickle=True)
    loaded = factorizer.get_data()
    assert_true(isinstance(loaded, Data))
class Recommender:
    """Film rating predictor backed by an SVD of a rating-matrix file.

    ``self.matrix`` is the object produced by ``rm.MatrixCreator``;
    ``self.svd`` is re-loaded from ``self.datafile_path`` and re-computed
    every time ``__compute_matrix`` runs.
    """

    def __init__(self, datafile_path=None):
        """Create the SVD and immediately load/compute from ``datafile_path``."""
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self, filename, film_names_with_rate_list, K, min_values,
                      MAX_COUNT_USER_FILMS=None, MAX_COUNT_FILM_USERS=None):
        """Build the matrix from film titles, save it to ``filename``, recompute."""
        self.matrix = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS).\
            create_matrix_by_film_titles(film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        """Restore the matrix from ``filename`` and recompute the SVD."""
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self, min_rate=1, max_rate=10, top=None,
                                      K=None, min_values=0):
        """Fill and return ``self.predict_matrix`` (users x films).

        :param min_rate: lower clamp passed to ``svd.predict``.
        :param max_rate: upper clamp passed to ``svd.predict``.
        :param top: unused; kept for interface compatibility.
        :param K: when truthy, the SVD is recomputed with this ``k`` first.
        :param min_values: used for the recomputation (was ignored before).
        :return: numpy array indexed as ``[user_index][film_index]``.
        """
        if K:
            self.__compute_matrix(K, min_values)
        users_map = self.matrix.users_indexes_map
        films_map = self.matrix.films_indexes_map
        self.predict_matrix = np.zeros((len(users_map), len(films_map)))
        for user in users_map.keys():
            for film in films_map.keys():
                user_index = users_map[user]
                film_index = films_map[film]
                self.predict_matrix[user_index][film_index] = self.svd.predict(
                    user_index, film_index,
                    MIN_VALUE=min_rate, MAX_VALUE=max_rate)
        return self.predict_matrix

    def predict_for_user(self, user_index, min_rate=1, max_rate=10, top=None,
                         repeat=False, K=None, min_values=None):
        """Predict the ratings of every film for one user.

        :param user_index: row index of the user in the rating matrix.
        :param min_rate: lower clamp passed to ``svd.predict``.
        :param max_rate: upper clamp passed to ``svd.predict``.
        :param top: when truthy, only the ``top`` best predictions are
            returned, best first.
        :param repeat: when false, films found in the first fake user's
            preferences are left out of the result.
        :param K: to change the number of properties (recomputes the SVD).
        :param min_values: used for the recomputation; ``None`` means 0.
        :return: ``{Film: rate, ...}`` when ``repeat`` is true and ``top`` is
            not set, otherwise ``[(Film, rate), ...]``.
        """
        if K:
            self.__compute_matrix(K, 0 if min_values is None else min_values)

        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in range(np_matrix.shape[1]):
            rate = self.svd.predict(user_index, index,
                                    MIN_VALUE=min_rate, MAX_VALUE=max_rate)
            film = self.matrix.indexes_films_map[index]
            prediction[film] = rate

        if not repeat:
            # NOTE(review): the films come from the *first* fake user, not
            # from ``user_index`` -- confirm this is intended.
            fake_user_index = list(self.matrix.indexes_with_fake_user_ids.keys())[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            films = user.get_preferences().keys()
            prediction = [(x, prediction[x]) for x in prediction
                          if x not in films]

        if top:
            # ``prediction`` is a list after the filter above and a dict
            # otherwise; calling ``.items()`` unconditionally raised
            # AttributeError for the default ``repeat=False``.
            if isinstance(prediction, dict):
                pairs = prediction.items()
            else:
                pairs = prediction
            ranked = sorted(pairs, key=operator.itemgetter(1))
            prediction = list(reversed(ranked[-top:]))
        return prediction

    def predict_for_all_fake_users(self, min_rate=1, max_rate=10, top=None,
                                   K=None, min_values=0):
        """Run ``predict_for_user`` for every fake user.

        :param K: to change the number of properties (recomputes the SVD).
        :param min_values: used for the recomputation (was ignored before).
        :return: list with one ``predict_for_user`` result per fake user.
        """
        if K:
            self.__compute_matrix(K, min_values)
        predictions = []
        for user_index in self.matrix.indexes_with_fake_user_ids.keys():
            prediction = self.predict_for_user(user_index, min_rate, max_rate, top)
            predictions.append(prediction)
        return predictions

    def predicted_rating_submatrix(self, user_indexes):
        """Return predicted ratings (clamped to 1..10), one row per user index.

        The SVD is always recomputed with ``K=100`` first.
        """
        self.__compute_matrix(100)
        film_count = self.matrix.rating_matrix.shape[1]
        # Seed with one placeholder row so np.append has a 2-D target; the
        # placeholder is sliced off before returning.
        predicted = np.empty((1, film_count), int)
        for index in user_indexes:
            row = []
            for film_index in range(film_count):
                row.append(
                    self.svd.predict(index, film_index, MIN_VALUE=1, MAX_VALUE=10))
            predicted = np.append(predicted, [row], axis=0)
        return predicted[1:]

    def predicted_rating_submatrix_for_fake(self):
        """Return ``predicted_rating_submatrix`` for all fake users."""
        return self.predicted_rating_submatrix(
            self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self, K, min_values=0, pre_normalize=None,
                         mean_center=True, post_normalize=True):
        """Reload ``self.datafile_path`` (space separated) and recompute the SVD."""
        self.svd.load_data(self.datafile_path, sep=' ',
                           format={'col': 1, 'row': 0, 'value': 2, 'ids': int})
        self.svd.compute(K, min_values, pre_normalize, mean_center,
                         post_normalize, savefile=None)

    def filter_films_data(self, min_user_votes):
        """Write copies of the data files without rarely rated films.

        Films with fewer than ``min_user_votes`` rating rows are dropped and
        the remaining films are re-indexed from 0. Three files are written
        next to ``self.datafile_path`` with the suffix
        ``'_' + str(min_user_votes)``: the rating file, its ``_film_map`` and
        a plain copy of its ``_user_map``.
        """
        counter = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            r = csv.reader(my_file)
            for row in r:
                user_index, film_index, rate = row[0].split(' ')
                counter[int(film_index)] += 1

        # A set keeps the membership tests below O(1) per row.
        film_indexes = set(k for k, v in counter.items()
                           if v < min_user_votes)

        suffix = '_' + str(min_user_votes)
        copyfile(self.datafile_path + '_user_map',
                 self.datafile_path + suffix + '_user_map')

        new_indexes = {}
        with open(self.datafile_path + '_film_map', 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path + suffix + '_film_map', 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in r:
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    # Keys stay strings because the rating file is read as text.
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path + suffix, 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in r:
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
# This code can be run in real time but the model has to be pre-computed import recsys.algorithm from recsys.algorithm.factorize import SVD ''' SVD recommendation only for unknown movies ''' # Lets make things Verbose recsys.algorithm.VERBOSE = True # Loading the computed model svd = SVD(filename='movielens_small') svd.load_data(filename='ratings_small.csv', sep=',', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': int }) svd.create_matrix() # Loading the movielens file of movies which has a mapping of movies to movie-id loop = True while (loop): ratings_file = open('ratings_small.csv', 'r+') movie_lens = open('movies.csv', 'r+') user_found = False movie_found = False USERID = int(input("Enter user id: ")) # Check if the user_id exists. Since currently we are using the small database, we need to check each and every field. # If using the complete database, just check if the number lies in the range.
class NewsRec():
    """SVD news recommender with a simple hit-rate evaluation.

    ``test_set`` holds ``(user, item)`` pairs; ``ret`` (created by ``test``)
    holds every ``(user, recommended_item)`` pair produced during evaluation.
    """

    def __init__(self):
        self.svd = SVD()
        self.test_set = []

    def load_data(self, filename='train_set_for_svd'):
        """Load the tab-separated training file into the SVD."""
        layout = {'value': 0, 'row': 2, 'col': 1, 'ids': int}
        self.svd.load_data(filename, sep='\t', format=layout)

    def load_test(self, filename='test_set_for_svd'):
        """Append ``(int(field 1), int(field 2))`` of every line to ``test_set``."""
        with open(filename, 'r') as handle:
            for record in handle:
                fields = record.split('\t')
                pair = (int(fields[1]), int(fields[2]))
                self.test_set.append(pair)

    def recom(self, user_id, recom_num=3, only_unknown=True):
        """Return the SVD recommendations for ``user_id``, or -1 on IndexError."""
        try:
            # The id is passed straight through as the column id.
            return self.svd.recommend(user_id, recom_num,
                                      only_unknowns=only_unknown, is_row=False)
        except IndexError:
            return -1

    def compute(self, k=100):
        """Compute the SVD with ``k`` dimensions."""
        self.svd.compute(k=k, min_values=None, pre_normalize=None,
                         mean_center=False, post_normalize=True)

    def test(self, recom_num=3):
        """Evaluate on ``test_set`` and print hit count, precision, recall and F.

        Returns without printing when there are no hits.
        """
        hits = 0
        self.ret = []
        for user, item in self.test_set:
            recs = self.recom(user, recom_num)
            # recom() signals failure with -1; only real lists are scored.
            if type(recs) is not list:
                continue
            for rec_index, rec_rate in recs:
                self.ret.append((user, rec_index))
                if item == rec_index:
                    hits += 1

        if hits == 0:
            return

        user_sum = len(self.test_set)
        recom_sum = recom_num * user_sum
        precise = float(hits) / recom_sum
        recall = float(hits) / user_sum
        f = 2.0 / ((1.0 / precise) + (1.0 / recall))
        print(' '.join(['hit:', str(hits)]))
        print(' '.join(['precise:', str(precise)]))
        print(' '.join(['recall:', str(recall)]))
        print(' '.join(['F:', str(f)]))

    def print_ret(self, filename):
        """Write ``self.ret`` to ``filename`` as ``userid,newsid`` CSV lines."""
        pieces = ["userid,newsid\n"]
        for user, item in self.ret:
            pieces.extend((str(user), ',', str(item), '\n'))
        with open(filename, 'w') as out:
            out.write("".join(pieces))
class NewsRec():
    """SVD news recommender with a simple hit-rate evaluation.

    ``test_set`` holds ``(user, item)`` pairs; ``ret`` (created by ``test``)
    holds every ``(user, recommended_item)`` pair produced during evaluation.
    """

    def __init__(self):
        self.svd = SVD()
        self.test_set = []

    def load_data(self, filename='train_set_for_svd'):
        """Load the tab-separated training file into the SVD."""
        layout = {'value': 0, 'row': 2, 'col': 1, 'ids': int}
        self.svd.load_data(filename, sep='\t', format=layout)

    def load_test(self, filename='test_set_for_svd'):
        """Append ``(int(field 1), int(field 2))`` of every line to ``test_set``."""
        with open(filename, 'r') as handle:
            for record in handle:
                fields = record.split('\t')
                pair = (int(fields[1]), int(fields[2]))
                self.test_set.append(pair)

    def recom(self, user_id, recom_num=3, only_unknown=True):
        """Return the SVD recommendations for ``user_id``, or -1 on IndexError."""
        try:
            # The id is passed straight through as the column id.
            return self.svd.recommend(user_id, recom_num,
                                      only_unknowns=only_unknown, is_row=False)
        except IndexError:
            return -1

    def compute(self, k=100):
        """Compute the SVD with ``k`` dimensions."""
        self.svd.compute(k=k, min_values=None, pre_normalize=None,
                         mean_center=False, post_normalize=True)

    def test(self, recom_num=3):
        """Evaluate on ``test_set`` and print hit count, precision, recall and F.

        Returns without printing when there are no hits.
        """
        hits = 0
        self.ret = []
        for user, item in self.test_set:
            recs = self.recom(user, recom_num)
            # recom() signals failure with -1; only real lists are scored.
            if type(recs) is not list:
                continue
            for rec_index, rec_rate in recs:
                self.ret.append((user, rec_index))
                if item == rec_index:
                    hits += 1

        if hits == 0:
            return

        user_sum = len(self.test_set)
        recom_sum = recom_num * user_sum
        precise = float(hits) / recom_sum
        recall = float(hits) / user_sum
        f = 2.0 / ((1.0 / precise) + (1.0 / recall))
        print(' '.join(['hit:', str(hits)]))
        print(' '.join(['precise:', str(precise)]))
        print(' '.join(['recall:', str(recall)]))
        print(' '.join(['F:', str(f)]))

    def print_ret(self, filename):
        """Write ``self.ret`` to ``filename`` as ``userid,newsid`` CSV lines."""
        pieces = ["userid,newsid\n"]
        for user, item in self.ret:
            pieces.extend((str(user), ',', str(item), '\n'))
        with open(filename, 'w') as out:
            out.write("".join(pieces))
#coding=utf-8 import recsys.algorithm recsys.algorithm.VERBOSE = True from recsys.algorithm.factorize import SVD svd = SVD() svd.load_data(filename='../data/movielens/ratings.csv', sep=',', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': int }) #train,test=data.split_train_test(percent=70) #svd=SVD() #svd.set_data(train) #假设奇异值的个数为100 k = 100 svd.compute(k=k, min_values=1, pre_normalize=None, mean_center=False, post_normalize=True) #svd.compute(k=k,min_values=10,pre_normalize=None,mean_center=True,post_normalize=True,savefile='/tmp/movielens') #你可以计算两个电影的相似度
from evaluation.root_mean_square_error import RootMeanSquareError
__author__ = 'fpena'
from recsys.algorithm.factorize import SVD
# Script: compute an SVD over a '|'-separated reviews file whose ids are strings.
svd = SVD()
# file_name = '/Users/fpena/UCC/Thesis/datasets/ml-1m/ratings.dat'
# svd.load_data(filename=file_name, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
file_name = '/Users/fpena/tmp/reviews.csv'
# Defined but not used in the part of the script visible here.
file_name_header = '/Users/fpena/tmp/reviews-header.csv'
# file_name = '/Users/fpena/tmp/small-reviews-matrix.csv'
# file_name_header = '/Users/fpena/tmp/small-reviews-header.csv'
# File columns: 0 -> matrix column ids, 1 -> matrix row ids, 2 -> value.
svd.load_data(filename=file_name, sep='|', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': str })
k = 100
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True)
# predicted_rating = svd.predict(int(5), 'A1', 1, 10)
# predicted_rating2 = svd.predict(int(1), 'A1', 1, 10)
# print('Predicted rating', predicted_rating)
# print('Predicted rating', predicted_rating2)
# In[2]: # enable verbose output recsys.algorithm.VERBOSE = True # In[3]: # Formatting the data svd = SVD() recsys.algorithm.VERBOSE = True # load movielens data dat_file = './ml-1m/ratings.dat' svd.load_data(filename=dat_file, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int}) # About format parameter: # 'row': 1 -> Rows in matrix come from column 1 in ratings.dat file # 'col': 0 -> Cols in matrix come from column 0 in ratings.dat file # 'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat # file # 'ids': int -> Ids (row and col ids) are integers (not strings) # In[4]: # compute svd k = 100 svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True)
from recsys.algorithm.factorize import SVD # path = "datasets/ml-1m/ratings.dat" path = "datasets/ml-latest-small/ratings_train_1.csv" svd = SVD() svd.load_data(filename=path, sep=',', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': float }) k = 30 svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='/tmp/movielens') # ITEMID1 = 1 # Toy Story (1995) # ITEMID2 = 2355 # A bug's life (1998) # print svd.similarity(ITEMID1, ITEMID2) MIN_RATING = 1.0 MAX_RATING = 5.0
def test_load_pickle():
    """Loading the pickled ratings matrix must leave a ``Data`` object in the SVD."""
    pickle_path = os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.pickle')
    factorizer = SVD()
    factorizer.load_data(pickle_path, pickle=True)
    loaded = factorizer.get_data()
    assert_true(isinstance(loaded, Data))
# Train an RSVD model with 20 factors; ``val`` is passed as the probe array.
model = RSVD.train(20, train, dims, probeArray=val, learnRate=0.0005, regularization=0.005)
# Accumulate the squared prediction error over the test triples.
sqerr=0.0
for movieID,userID,rating in test:
    err = rating - model(movieID,userID)
    sqerr += err * err
# Mean squared error; its square root (RMSE) is printed below.
sqerr /= test.shape[0]
print "Test RMSE: ", np.sqrt(sqerr)
##########
# Same data through python-recsys: load, compute an SVD and query a similarity.
from recsys.algorithm.factorize import SVD
svd = SVD()
svd.load_data(filename='./data/behavior-ml.csv', sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
k = 100
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='/tmp/movielens')
ITEMID1 = 1 # Toy Story (1995)
ITEMID2 = 2355 # A bug's life (1998)
# The result is discarded; the trailing number is the author's recorded value.
svd.similarity(ITEMID1, ITEMID2) # 0.67706936677315799
def rectest():
    """Create an SVD object and load the '::'-separated file at ``filename``.

    ``filename`` is read from the enclosing module; nothing is returned.
    """
    layout = {"col": 0, "row": 1, "value": 2, "ids": int}
    factorizer = SVD()
    factorizer.load_data(filename, sep="::", format=layout)
return 'Hello World!' @app.route("/rec") def rectest(): svd = SVD() svd.load_data(filename, sep="::", format={"col":0, "row":1, "value":2, "ids": int}) if __name__ == '__main__': #app.run() #import os #print os.getcwd() import time start_time = time.time() svd = SVD() data = svd.load_data(filename, sep="::", format={"col":0, "row":1, "value":2, "ids": int}) K = 100 svd.compute(k=K, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile=None) #print data #r = svd.predict(200, 1, MIN_VALUE=0, MAX_VALUE=5.0) r = svd.recommend(1, n=10, only_unknowns=True, is_row=False ) print r time_consumed = time.time() - start_time print time_consumed
import recsys.algorithm
# Turn on the library's verbose output.
recsys.algorithm.VERBOSE = True
from recsys.algorithm.factorize import SVD
svd = SVD()
# Candidate input files; only the last assignment takes effect.
filename = './data4'
filename = './data3.csv'
#filename = './data2.csv'
filename = './data.csv'
filename = './data_l2.csv'
filename = './2016.6.29.for_svd.csv'
svd.load_data(filename=filename, sep=',', format={'col':0, 'row':1, 'value':2, 'ids': str}) # col -> user, row -> item, value -> label, ids -> timestamp
k = 100
# The model is also saved under /tmp/movielens via ``savefile``.
r = svd.compute(k=k, min_values=2, pre_normalize=None, mean_center=False, post_normalize=True, savefile='/tmp/movielens')
#ITEMID1 = 109 # Toy Story (1995)
#ITEMID2 = 106 # A bug's life (1998)
#print(svd.similarity(ITEMID1, ITEMID2)) # 0.67706936677315799
class Recommender:
    """Film rating predictor backed by an SVD of a rating-matrix file.

    ``self.matrix`` is the object produced by ``rm.MatrixCreator``;
    ``self.svd`` is re-loaded from ``self.datafile_path`` and re-computed
    every time ``__compute_matrix`` runs.
    """

    def __init__(self, datafile_path=None):
        """Create the SVD and immediately load/compute from ``datafile_path``."""
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self, filename, film_names_with_rate_list, K, min_values,
                      MAX_COUNT_USER_FILMS=None, MAX_COUNT_FILM_USERS=None):
        """Build the matrix from film titles, save it to ``filename``, recompute."""
        self.matrix = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS).\
            create_matrix_by_film_titles(film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        """Restore the matrix from ``filename`` and recompute the SVD."""
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self, min_rate=1, max_rate=10, top=None,
                                      K=None, min_values=0):
        """Fill and return ``self.predict_matrix`` (users x films).

        :param min_rate: lower clamp passed to ``svd.predict``.
        :param max_rate: upper clamp passed to ``svd.predict``.
        :param top: unused; kept for interface compatibility.
        :param K: when truthy, the SVD is recomputed with this ``k`` first.
        :param min_values: used for the recomputation (was ignored before).
        :return: numpy array indexed as ``[user_index][film_index]``.
        """
        if K:
            self.__compute_matrix(K, min_values)
        users_map = self.matrix.users_indexes_map
        films_map = self.matrix.films_indexes_map
        self.predict_matrix = np.zeros((len(users_map), len(films_map)))
        for user in users_map.keys():
            for film in films_map.keys():
                user_index = users_map[user]
                film_index = films_map[film]
                self.predict_matrix[user_index][film_index] = self.svd.predict(
                    user_index, film_index,
                    MIN_VALUE=min_rate, MAX_VALUE=max_rate)
        return self.predict_matrix

    def predict_for_user(self, user_index, min_rate=1, max_rate=10, top=None,
                         repeat=False, K=None, min_values=None):
        """Predict the ratings of every film for one user.

        :param user_index: row index of the user in the rating matrix.
        :param min_rate: lower clamp passed to ``svd.predict``.
        :param max_rate: upper clamp passed to ``svd.predict``.
        :param top: when truthy, only the ``top`` best predictions are
            returned, best first.
        :param repeat: when false, films found in the first fake user's
            preferences are left out of the result.
        :param K: to change the number of properties (recomputes the SVD).
        :param min_values: used for the recomputation; ``None`` means 0.
        :return: ``{Film: rate, ...}`` when ``repeat`` is true and ``top`` is
            not set, otherwise ``[(Film, rate), ...]``.
        """
        if K:
            self.__compute_matrix(K, 0 if min_values is None else min_values)

        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in range(np_matrix.shape[1]):
            rate = self.svd.predict(user_index, index,
                                    MIN_VALUE=min_rate, MAX_VALUE=max_rate)
            film = self.matrix.indexes_films_map[index]
            prediction[film] = rate

        if not repeat:
            # NOTE(review): the films come from the *first* fake user, not
            # from ``user_index`` -- confirm this is intended.
            fake_user_index = list(self.matrix.indexes_with_fake_user_ids.keys())[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            films = user.get_preferences().keys()
            prediction = [(x, prediction[x]) for x in prediction
                          if x not in films]

        if top:
            # ``prediction`` is a list after the filter above and a dict
            # otherwise; calling ``.items()`` unconditionally raised
            # AttributeError for the default ``repeat=False``.
            if isinstance(prediction, dict):
                pairs = prediction.items()
            else:
                pairs = prediction
            ranked = sorted(pairs, key=operator.itemgetter(1))
            prediction = list(reversed(ranked[-top:]))
        return prediction

    def predict_for_all_fake_users(self, min_rate=1, max_rate=10, top=None,
                                   K=None, min_values=0):
        """Run ``predict_for_user`` for every fake user.

        :param K: to change the number of properties (recomputes the SVD).
        :param min_values: used for the recomputation (was ignored before).
        :return: list with one ``predict_for_user`` result per fake user.
        """
        if K:
            self.__compute_matrix(K, min_values)
        predictions = []
        for user_index in self.matrix.indexes_with_fake_user_ids.keys():
            prediction = self.predict_for_user(user_index, min_rate, max_rate, top)
            predictions.append(prediction)
        return predictions

    def predicted_rating_submatrix(self, user_indexes):
        """Return predicted ratings (clamped to 1..10), one row per user index.

        The SVD is always recomputed with ``K=100`` first.
        """
        self.__compute_matrix(100)
        film_count = self.matrix.rating_matrix.shape[1]
        # Seed with one placeholder row so np.append has a 2-D target; the
        # placeholder is sliced off before returning.
        predicted = np.empty((1, film_count), int)
        for index in user_indexes:
            row = []
            for film_index in range(film_count):
                row.append(
                    self.svd.predict(index, film_index, MIN_VALUE=1, MAX_VALUE=10))
            predicted = np.append(predicted, [row], axis=0)
        return predicted[1:]

    def predicted_rating_submatrix_for_fake(self):
        """Return ``predicted_rating_submatrix`` for all fake users."""
        return self.predicted_rating_submatrix(
            self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self, K, min_values=0, pre_normalize=None,
                         mean_center=True, post_normalize=True):
        """Reload ``self.datafile_path`` (space separated) and recompute the SVD."""
        self.svd.load_data(self.datafile_path, sep=' ',
                           format={'col': 1, 'row': 0, 'value': 2, 'ids': int})
        self.svd.compute(K, min_values, pre_normalize, mean_center,
                         post_normalize, savefile=None)

    def filter_films_data(self, min_user_votes):
        """Write copies of the data files without rarely rated films.

        Films with fewer than ``min_user_votes`` rating rows are dropped and
        the remaining films are re-indexed from 0. Three files are written
        next to ``self.datafile_path`` with the suffix
        ``'_' + str(min_user_votes)``: the rating file, its ``_film_map`` and
        a plain copy of its ``_user_map``.
        """
        counter = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            r = csv.reader(my_file)
            for row in r:
                user_index, film_index, rate = row[0].split(' ')
                counter[int(film_index)] += 1

        # A set keeps the membership tests below O(1) per row.
        film_indexes = set(k for k, v in counter.items()
                           if v < min_user_votes)

        suffix = '_' + str(min_user_votes)
        copyfile(self.datafile_path + '_user_map',
                 self.datafile_path + suffix + '_user_map')

        new_indexes = {}
        with open(self.datafile_path + '_film_map', 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path + suffix + '_film_map', 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in r:
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    # Keys stay strings because the rating file is read as text.
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path + suffix, 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in r:
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
import recsys.algorithm
# Turn on the library's verbose output.
recsys.algorithm.VERBOSE = True
from recsys.algorithm.factorize import SVD
svd = SVD()
# File columns: 0 -> matrix column ids, 1 -> matrix row ids, 2 -> value.
svd.load_data(filename='ml-1m/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
#This algorithm is called singular value decomposition and is used to compute the model from the ratings.csv file #This needs to be run only once. The computed model is created as a zip folder. # U(Sigma)V^T is the mathematical formula used for computing SVD. using the pyrecsys library to implement the SVD algorithm #Refer to docs for more details on SVD. import recsys.algorithm from recsys.algorithm.factorize import SVD #To obtain make the script verbose. recsys.algorithm.VERBOSE = True #computing the SVD model svd = SVD() #loading the ratings file. Format is used to create the matrix for SVD svd.load_data(filename='ratings_complete.csv', sep=',' , format={'col':0, 'row':1, 'value':2, 'ids':int}) #Now, lets compute the SVD. Formula is M = U(Sigma)V^T k = 100 svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='movielens_complete') print("Model Computed and Created")
# -*- coding: utf-8 -*- from recsys.algorithm.factorize import SVD svd = SVD() # 1. Load Movielens dataset: svd.load_data(filename='/home/andy/xx/recommend/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int}) # 2. Compute Singular Value Decomposition (SVD), M=U Sigma V^t: k = 100 svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='/tmp/movielens') # 3. Get similarity between two movies: ITEMID1 = 1 # Toy Story (1995) ITEMID2 = 2355 # A bug's life (1998) print svd.similarity(ITEMID1, ITEMID2) # 0.67706936677315799 """ # 4. Get movies similar to Toy Story:
def SVDloadData():
    """Return a fresh SVD with ``ratings.dat`` loaded ('::'-separated).

    Turns ``recsys.algorithm.VERBOSE`` on as a side effect; ``compute`` is
    not called.
    """
    loader = SVD()
    recsys.algorithm.VERBOSE = True
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    loader.load_data(filename='ratings.dat', sep='::', format=layout)
    return loader
class RecommendationSystem():
    """Movie recommender combining a saved SVD model with CSV metadata files.

    Ratings are loaded with file column 0 as matrix columns and file column
    1 as matrix rows, so user ids address columns and movie ids address rows.
    """

    #def __init__(self, spark_context, rating_file='ratings_small.csv', movie_file='movies.csv', detail_file='modified.csv', model='movielens_small'):
    def __init__(self, rating_file='ratings_small.csv', movie_file='movies.csv',
                 detail_file='modified.csv', model='movielens_small'):
        """Load the saved SVD ``model`` and the ratings, and open an IMDb client."""
        self.start = True
        self.rating_file = rating_file
        self.movie_file = movie_file
        self.detail_file = detail_file
        self.svd = SVD(filename=model)
        self.svd.load_data(filename=rating_file, sep=',',
                           format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

    @staticmethod
    def _first_column_contains(path, wanted):
        """Return True if any line of ``path`` has ``wanted`` as its first int field."""
        with open(path, 'r') as rows:
            for row in rows:
                if int(row.split(',')[0]) == wanted:
                    return True
        return False

    def get_all_recomm(self, userid, movieid):
        """Return three brief-info lists for a user/movie pair.

        In order: recommendations including known movies, recommendations
        restricted to unknown movies, and movies similar to ``movieid``.
        A list is empty when the user or movie is not found.
        """
        recom1 = self.svd_recomm(userid, only_unknown=False)
        recom2 = self.svd_recomm(userid, only_unknown=True)
        recom3 = self.svd_similar(movieid)
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)
        return [brief_info1, brief_info2, brief_info3]

    def svd_recomm(self, userid, only_unknown):
        """Return up to 10 recommended movie ids for ``userid``.

        :param userid: user id as it appears in column 0 of the rating file.
        :param only_unknown: when true, restrict to movies the user has not rated.
        :return: list of movie ids, or ``None`` when the user is not found.
        """
        if not self._first_column_contains(self.rating_file, userid):
            return None
        # output format: (movieid, similarity value)
        # Users are matrix columns, so the lookup must always use
        # is_row=False. The only_unknown branch used is_row=True, which
        # treated the user id as a movie id.
        similar_list = self.svd.recommend(userid, n=10,
                                          only_unknowns=bool(only_unknown),
                                          is_row=False)
        return self.get_id_list(similar_list)

    def svd_similar(self, movieid):
        """Return ids of movies similar to ``movieid``, or ``None`` if unknown."""
        if not self._first_column_contains(self.movie_file, movieid):
            return None
        similar_list = self.svd.similar(movieid)
        return self.get_id_list(similar_list)

    def get_id_list(self, l):
        """Return the first element of every entry in ``l``."""
        return [s[0] for s in l]

    def get_detail(self, imdb_id):
        """Fetch a movie from IMDb; if it has a cover, save it under ``Images/``."""
        #print type(imdb_id)
        m = self.ia.get_movie(str(imdb_id))
        cover = m.get('cover url')
        if cover:
            path = "Images/" + str(imdb_id) + ".jpg"
            urllib.urlretrieve(cover, path)
        return m

    def get_brief_list(self, movieList):
        """Return ``get_brief`` for every id in ``movieList``.

        ``None`` (the "not found" result of the svd_* methods) yields an
        empty list; it used to raise TypeError.
        """
        if movieList is None:
            return []
        return [self.get_brief(m) for m in movieList]

    def get_brief(self, movieid):
        """Collect title, genre, rating, IMDb id, director and cast for a movie.

        Fields that cannot be found keep their defaults ('unknown', 0 or 1).
        The rating is taken from the first matching row of the rating file.
        """
        info = {}
        info['title'] = 'unknown'
        info['genre'] = 'unknown'
        info['rating'] = 0
        info['imdb_id'] = 1
        info['director'] = 'unknown'
        info['cast'] = 'unknown'

        with open(self.movie_file, 'r') as movies:
            for m in movies:
                row_item = m.split(',')
                if int(row_item[0]) == movieid:
                    info['title'] = str(row_item[1].strip())
                    info['genre'] = str(row_item[2].strip()).split('|')
                    break

        with open(self.rating_file, 'r') as ratings:
            for r in ratings:
                row_item = r.split(',')
                if int(row_item[1]) == movieid:
                    info['rating'] = float(row_item[2].strip())
                    break

        with open(self.detail_file, 'r') as details:
            #details = codecs.open(self.detail_file, 'r', 'utf-8')
            for d in details:
                row_item = d.split(',')
                if int(row_item[0]) == movieid:
                    #print 'found!'
                    info['imdb_id'] = int(row_item[1].strip())
                    info['director'] = str(row_item[3].strip())
                    info['cast'] = str(row_item[4].strip()).split('|')
                    break
        return info
from boto.s3.connection import S3Connection import urllib2 db = DBConn() conn = S3Connection('AKIAI6F6HFFENFWSPN4Q', 'aP0OOVDj96AFUEr9vbHalvvNZz7rNNXyyH0Wof7i') bucket = conn.get_bucket('elasticbeanstalk-us-west-2-501394068089') ld_occurrences_key = bucket.get_key('files/data/ld_occurrences.dat') ld_occurrences_path = ld_occurrences_key.generate_url(3600, query_auth=True, force_http=True) ld_occurrences_content = urllib2.urlopen(ld_occurrences_path).read() svd = SVD() svd.load_data(filename=ld_occurrences_content, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': str}) all_items = svd.recommend(USER_ID, n=10, only_unknowns=False, is_row=False) for index, relevance in all_items: print index, items[index].get_data()['name'], relevance # genres = db.get_genres(USER_ID) # if len(genres['genres']) > 0: # pred_items = myFunctions.get_items_user_genre(items, all_items, genres)[:50] # else: # pred_items = all_items[:50] # # for index, relevance in pred_items: # print index, items[index].get_data()['name'], items[index].get_data()['genres'], relevance #
# Read every book's key and title and split them into two parallel arrays.
cur.execute('SELECT key_id, title from Books6')
results = np.array(cur.fetchall())
bookkeys = np.array(results[:, 0], int)
booktitles = np.array(results[:, 1], str)

# (Re)create the similarity table named by `simtable` with 3 columns:
# item_id1, item_id2, sim_cosine. Any existing table of that name is dropped.
simtable = 'svdsimilarityscores6'
cur.execute('drop table if exists %s' % simtable)
cur.execute('CREATE TABLE %s (item_id1 INT NOT NULL, item_id2 INT NOT NULL, sim_cosine FLOAT NOT NULL)' % simtable)

# load rating data
svd = SVD()
svd.load_data(filename='./data/MERGED6.csv', sep=',', format={'row':0, 'col':1, 'value':2, 'ids': int})
# About format parameter:
# 'row': 0 -> Rows in matrix come from first column; itemkey_id
# 'col': 1 -> Cols in matrix come from second column; usrkey_id
# 'value': 2 -> Values (Mij) in matrix come from third column
# 'ids': int -> Ids (row and col ids) are integers (not strings)
# if row is item (not user), then it's item based, and the similarity scores will be between items.
k = 100
# NOTE(review): this call is cut off in the source -- the remaining
# arguments and the closing parenthesis are missing.
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True,
#!/usr/bin/env python # coding=utf-8 from recsys.algorithm.factorize import SVD svd = SVD() svd.load_data(filename='../invited_info_train_question_sort.txt', sep='\t', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': str }) k = 200 svd.compute(k=k, savefile='../tmp/weight') svd2 = SVD(filename='../tmp/weight') # Loading already computed SVD model output_path = "./output.txt" output_file = open(output_path, 'w') validate_file = file("../validate_nolabel.txt") line = validate_file.readline() line = validate_file.readline().strip("\r\n") while line: question_id = line.split(',')[0] user_id = line.split(',')[1] try: predict = svd2.predict(user_id, question_id, 0.0, 1.0) except: predict = 0
import csv

import recsys.algorithm
recsys.algorithm.VERBOSE = True
from recsys.algorithm.factorize import SVD

# Factorize the training matrix (first CSV column -> matrix columns,
# second -> rows, third -> values).
svd = SVD()
svd.load_data(
    filename='train.csv',
    sep=',',
    format={'col': 0, 'row': 1, 'value': 2},
)
k = 100
svd.compute(k=k, pre_normalize=None, mean_center=True, post_normalize=True)

# Bounds handed to predict() for every prediction.
MIN_RATING = 0.0
MAX_RATING = 5000.0

test_file = 'test.csv'
soln_file = 'recsys.csv'

# Stream the test rows and emit one "Id,plays" prediction per row.
with open(test_file, 'r') as test_fh, open(soln_file, 'w') as soln_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    soln_csv = csv.writer(
        soln_fh,
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    next(test_csv, None)  # skip the header row of the test file
    soln_csv.writerow(['Id', 'plays'])
    for row in test_csv:
        row_id, user, artist = row[0], row[1], row[2]
        soln_csv.writerow(
            [row_id, svd.predict(artist, user, MIN_RATING, MAX_RATING)])
class RecommendationSystem():
    """Movie recommender combining a python-recsys SVD model with a Spark ALS model."""

    # To run on your own machine, you need to initialize with your datapath to the frontend folder
    def __init__(self,
                 sc,
                 datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/',
                 rating_file='ratings_small.csv',
                 complete_rating_file='ratings.csv',
                 movie_file='movies.csv',
                 detail_file='modified.csv',
                 model='movielens_small'):
        """Load the SVD model, the saved ALS model and the Spark tables.

        sc -- the Spark context used for the RDDs, SQLContext and ALS model.
        datapath -- folder prefix prepended (by plain concatenation) to every
            file, model and table name.
        """
        self.sc = sc
        self.start = True
        self.rating_file = datapath + rating_file
        self.complete_rating_file = datapath + complete_rating_file
        self.movie_file = datapath + movie_file
        self.detail_file = datapath + detail_file
        self.integration_folder = datapath
        self.svd = SVD(filename=datapath + model)
        self.svd.load_data(filename=self.rating_file, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

        # als stuff
        self.sqlContext = SQLContext(self.sc)
        self.movie_data = self.sc.textFile(self.movie_file)
        self.ratings_data = self.sc.textFile(self.complete_rating_file).map(
            lambda line: line.split(",")).map(
                lambda x: (int(x[0]), int(x[1]), float(x[2])))
        self.als_model_path = datapath + 'Model_Collaborative_Filtering'
        self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
        self.movie_df = self.sqlContext.read.load(datapath + 'tables/movies')
        self.detail_df = self.sqlContext.read.load(datapath + 'tables/detail')
        self.rating_df = self.sqlContext.read.load(datapath + 'tables/ratings')

    @staticmethod
    def _first_column_contains(path, wanted):
        """Return True if some line of ``path`` has integer first field ``wanted``.

        The ``with`` block closes the file even when a line fails to parse.
        """
        with open(path, 'r') as handle:
            for line in handle:
                if int(line.split(',')[0]) == wanted:
                    return True
        return False

    # call this function to get all recommendations
    def get_all_recomm(self, userid, moviename):
        """Return [svd-for-user, svd-similar-movies, als-for-user] info lists.

        Each list is also printed to the terminal.  A list is empty when the
        underlying algorithm reports that the user/movie is unknown.
        """
        movieid = self.get_movie_id(moviename)

        # all recommendation algorithms return a list of movie ids
        recom1 = self.svd_recomm(userid, only_unknown=True)
        recom2 = self.svd_similar(movieid)
        recom3 = self.als_new(userid)

        # get info about the movie based on movie ids
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)

        # print to terminal (single-argument form prints identically on Python 2 and 3)
        for brief_infos in (brief_info1, brief_info2, brief_info3):
            for entry in brief_infos:
                print(entry)

        return [brief_info1, brief_info2, brief_info3]

    # get movie id based on movie name input
    def get_movie_id(self, moviename):
        """Return the id of the first movie whose name starts with ``moviename``.

        Falls back to movie id 1 when no name matches.
        """
        r = self.movie_df.where(
            self.movie_df['name'].startswith(moviename)).first()
        # return movie id 1 if not found
        if r is None:
            return 1
        return r['movieId']

    # svd recommendation algorithm based on the user's rating history,
    # set only_unknown to True for unseen movies
    def svd_recomm(self, userid, only_unknown):
        """Return the ids recommended by the SVD model for ``userid``.

        Returns None when ``userid`` has no line in the rating file.
        """
        if not self._first_column_contains(self.rating_file, userid):
            return None
        # output format: (movieid, similarity value)
        if only_unknown:
            # NOTE(review): users are loaded as matrix columns ('col': 0), yet
            # this branch passes is_row=True while the other passes
            # is_row=False -- confirm against the python-recsys recommend() docs.
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=True, is_row=True)
        else:
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=False, is_row=False)
        return self.get_id_list(similar_list)

    # svd recommendation algorithm based on similar movie
    def svd_similar(self, movieid):
        """Return ids of movies the SVD model rates as similar to ``movieid``.

        Returns None when ``movieid`` has no line in the movie file.
        """
        if not self._first_column_contains(self.movie_file, movieid):
            return None
        similar_list = self.svd.similar(movieid)
        return self.get_id_list(similar_list)

    # this ALS recommendation algorithm did not get to present to front end
    # future work is needed to improve this algorithm
    def als_recomm(self, userid):
        """Return up to 20 (movie id, predicted rating, rating count) tuples.

        Only movies with at least 20 ratings are kept, ordered by descending
        predicted rating.
        """
        user_movie_ratings = [
            16, 24, 32, 47, 50, 110, 150, 161, 165, 204, 223, 256, 260, 261, 277
        ]
        # NOTE(review): self.movie_data comes straight from sc.textFile(), so
        # each x is a raw text line and x[0] its first character, not a parsed
        # movie id -- confirm and parse the line before relying on this filter.
        unrated_movies = self.movie_data.filter(
            lambda x: x[0] not in user_movie_ratings).map(
                lambda x: (userid, x[0]))
        recommended_movies_rdd = self.als_model.predictAll(unrated_movies)

        # Predictions for all the candidate movies; the best 20 with enough
        # ratings are selected below.
        user_recommended_ratings_rdd = recommended_movies_rdd.map(
            lambda x: (x.product, x.rating))
        movie_ID_with_ratings_RDD = self.ratings_data.map(
            lambda x: (x[1], x[2])).groupByKey()
        movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(
            get_counts_and_averages)
        movie_rating_counts_rdd = movie_ID_with_avg_ratings_RDD.map(
            lambda x: (x[0], x[1][0]))
        user_recommended_movies_ratings_count_rdd = (
            user_recommended_ratings_rdd.join(movie_rating_counts_rdd)
        ).map(lambda l: (l[0], l[1][0], l[1][1]))
        recommended_movies_list = user_recommended_movies_ratings_count_rdd.filter(
            lambda l: l[2] >= 20).takeOrdered(20, key=lambda x: -x[1])
        return recommended_movies_list

    # an ALS recommendation algorithm based on user rating history
    def als_new(self, userid):
        """Return the ids of the 10 products the ALS model recommends for ``userid``."""
        recommended_movies = self.als_model.recommendProducts(userid, 10)
        return [movie[1] for movie in recommended_movies]

    # return a list of movie id
    def get_id_list(self, l):
        """Return the first element of every tuple in ``l``."""
        return [s[0] for s in l]

    # this function connects to imdb database to get info (including cover image)
    # did not make it to front end due to performance and latency issue
    # need future work for improvement
    def get_detail(self, movieid, imdb_id):
        """Fetch the IMDb record for ``imdb_id``; save its cover as Images/<movieid>.jpg."""
        m = self.ia.get_movie(str(imdb_id))
        cover = m.get('cover url')
        if cover:
            path = self.integration_folder + "Images/" + str(movieid) + ".jpg"
            urllib.urlretrieve(cover, path)
        return m

    # get a list of movie info given a list of movie ids
    def get_brief_list(self, movieList):
        """Return infos for the first 5 ids of ``movieList`` that have a known title.

        ``movieList`` may be None (the "not found" result of ``svd_recomm`` /
        ``svd_similar``); that yields an empty list instead of a TypeError.
        """
        info_list = []
        if movieList is None:
            return info_list
        for m in movieList:
            info = self.get_brief(m)
            if info['title'] != 'unknown':
                info_list.append(info)
            if len(info_list) == 5:
                break
        return info_list

    # get movie info (title, direction, genres, rating, cast) from our rdd database
    def get_brief(self, movieid):
        """Return a dict with movieid, title, genres, rating, director and cast.

        Missing table rows leave the 'unknown' placeholders in place; a movie
        without ratings gets the default rating 4.6.
        """
        info = {}
        info['movieid'] = movieid
        info['title'] = 'unknown'
        info['genres'] = 'unknown'
        info['rating'] = 0
        info['director'] = 'unknown'
        info['cast'] = 'unknown'

        m = self.movie_df.where(self.movie_df['movieId'] == movieid).first()
        if m is not None:
            info['title'] = m['name']
            info['genres'] = m['genres']
            # NOTE(review): this keeps the first three elements of `genres`;
            # if the column is a plain string that means three characters --
            # confirm the column type.
            if len(info['genres']) > 3:
                info['genres'] = info['genres'][0:3]

        d = self.detail_df.where(self.detail_df['movieId'] == movieid).first()
        if d is not None:
            info['director'] = d['director']
            info['cast'] = d['cast']

        r = self.rating_df.where(self.rating_df['movieId'] == movieid)
        # count once: every count() call runs a separate Spark job
        rating_count = r.count()
        if rating_count == 0:
            # default rating to be 4.6
            info['rating'] = 4.6
        else:
            rating_sum = r.map(lambda row: row['rating']).reduce(
                lambda x, y: x + y)
            info['rating'] = rating_sum / rating_count

        return info
class RecommendationSystem():
    """SVD movie recommender backed by comma-separated rating/movie/detail files.

    The lookups below index the files as follows:
      * rating file: user id in column 0, movie id in column 1, rating in column 2
      * movie file:  movie id in column 0, title in column 1, '|'-joined genres in column 2
      * detail file: movie id in column 0, imdb id in column 1, director in
        column 3, '|'-joined cast in column 4
    """

    def __init__(self, rating_file='ratings_small.csv', movie_file='movies.csv', detail_file='modified.csv', model='movielens_small'):
        """Open the SVD model ``model`` and load the rating matrix.

        rating_file, movie_file, detail_file -- paths of the CSV inputs.
        model -- value handed to ``SVD(filename=...)``.
        """
        self.start = True
        self.rating_file = rating_file
        self.movie_file = movie_file
        self.detail_file = detail_file
        self.svd = SVD(filename=model)
        self.svd.load_data(filename=rating_file, sep=',', format={
            'col': 0,
            'row': 1,
            'value': 2,
            'ids': int
        })
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

    @staticmethod
    def _find_row(path, column, wanted):
        """Return the first CSV row of ``path`` whose integer ``column`` equals ``wanted``.

        Returns None when no row matches.  The csv module is used so that
        quoted fields containing commas (e.g. movie titles) stay in one piece,
        and the ``with`` block closes the file even if a row fails to parse.
        """
        import csv
        with open(path, 'r') as handle:
            for row in csv.reader(handle):
                if not row:
                    # tolerate blank lines instead of failing on int('')
                    continue
                if int(row[column]) == wanted:
                    return row
        return None

    def get_all_recomm(self, userid, movieid):
        """Return three lists of brief movie infos.

        In order: SVD recommendations for ``userid`` including known items,
        SVD recommendations restricted to unknown items, and movies similar to
        ``movieid``.  A list is empty when the user/movie is not in the files.
        """
        candidates = (
            self.svd_recomm(userid, only_unknown=False),
            self.svd_recomm(userid, only_unknown=True),
            self.svd_similar(movieid),
        )
        return [self.get_brief_list(ids) for ids in candidates]

    def svd_recomm(self, userid, only_unknown):
        """Return the ids recommended by the SVD model for ``userid``.

        Returns None when ``userid`` never appears in column 0 of the rating file.
        """
        if self._find_row(self.rating_file, 0, userid) is None:
            return None
        # output format: (movieid, similarity value)
        if only_unknown:
            # NOTE(review): users are loaded as matrix columns ('col': 0), yet
            # this branch passes is_row=True while the other passes
            # is_row=False -- confirm against the python-recsys recommend() docs.
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=True, is_row=True)
        else:
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=False, is_row=False)
        return self.get_id_list(similar_list)

    def svd_similar(self, movieid):
        """Return ids of the movies the SVD model rates as similar to ``movieid``.

        Returns None when ``movieid`` never appears in column 0 of the movie file.
        """
        if self._find_row(self.movie_file, 0, movieid) is None:
            return None
        similar_list = self.svd.similar(movieid)
        return self.get_id_list(similar_list)

    def get_id_list(self, l):
        """Return the first element of every tuple in ``l``."""
        return [s[0] for s in l]

    def get_detail(self, imdb_id):
        """Fetch the IMDb record for ``imdb_id`` and download its cover, if any.

        The cover is stored as ``Images/<imdb_id>.jpg``.  Returns the record.
        """
        m = self.ia.get_movie(str(imdb_id))
        cover = m.get('cover url')
        if cover:
            path = "Images/" + str(imdb_id) + ".jpg"
            urllib.urlretrieve(cover, path)
        return m

    def get_brief_list(self, movieList):
        """Return ``get_brief`` for every id in ``movieList``.

        ``movieList`` may be None (the "not found" result of ``svd_recomm`` /
        ``svd_similar``); that yields an empty list instead of a TypeError.
        """
        if movieList is None:
            return []
        return [self.get_brief(m) for m in movieList]

    def get_brief(self, movieid):
        """Return a dict describing ``movieid`` assembled from the three files.

        Keys: title, genre, rating, imdb_id, director, cast.  Fields whose file
        has no row for the movie keep their placeholder value.  ``rating`` is
        the first rating row found for the movie, not an average.
        """
        info = {
            'title': 'unknown',
            'genre': 'unknown',
            'rating': 0,
            'imdb_id': 1,
            'director': 'unknown',
            'cast': 'unknown',
        }

        movie_row = self._find_row(self.movie_file, 0, movieid)
        if movie_row is not None:
            info['title'] = str(movie_row[1].strip())
            info['genre'] = str(movie_row[2].strip()).split('|')

        rating_row = self._find_row(self.rating_file, 1, movieid)
        if rating_row is not None:
            info['rating'] = float(rating_row[2].strip())

        detail_row = self._find_row(self.detail_file, 0, movieid)
        if detail_row is not None:
            info['imdb_id'] = int(detail_row[1].strip())
            info['director'] = str(detail_row[3].strip())
            info['cast'] = str(detail_row[4].strip()).split('|')

        return info
def Compute():
    """Factorize the MovieLens 1M ratings file and save the model to ./mvsvd."""
    # '::'-separated file: first field -> matrix columns, second -> rows,
    # third -> values; ids are parsed as integers.
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    options = dict(
        k=100,
        min_values=10,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
        savefile='./mvsvd',
    )
    model = SVD()
    model.load_data(filename='./ml-1m/ratings.dat', sep='::', format=layout)
    model.compute(**options)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: Li Zhijun
from recsys.algorithm.factorize import SVD

# Build the model once at import time; the helpers below query it.
svd = SVD()
# The layout uses the keys col/row/value (not userId/movieId/rating).
svd.load_data(
    filename='ml-latest-small/ratings1.csv',
    sep=',',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
)
k = 100
svd.compute(
    k=k,
    min_values=10,
    pre_normalize=None,
    mean_center=True,
    post_normalize=True,
    savefile='/tmp/movielens',
)


def get_items_similarity(item_id1, item_id2):
    """Return the model's similarity score between the two items."""
    score = svd.similarity(item_id1, item_id2)
    return score


def get_similar_items(item_id, n=10):
    """Return the ``n`` items the model considers most similar to ``item_id``."""
    neighbours = svd.similar(item_id, n)
    return neighbours
def svd(filepath):
    """Train an SVD model per cross-validation fold and write fold-1 predictions.

    The train/test file names are derived from ``filepath`` using the
    TRAIN_PREFIX / TEST_PREFIX / EXT constants, and predictions go to
    ``<src_folder>output/<base>_pred_svd<EXT>``.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost -- confirm it (especially where ``exit()`` sits)
    against the original file.
    """
    src_folder = parseOutputFolderPath(filepath)
    base_file_name = parseFileName(filepath)
    avg_rmse = 0.0
    avg_mae = 0.0
    out_file_base = base_file_name + "_pred_svd"
    out_file = open(src_folder + "output/" + out_file_base + EXT, "w")
    # for each fold
    for fold_index in xrange(1, NUM_FOLDS + 1):
        print "*** \t FOLD {0} \t ***".format(fold_index)
        # Sparse (user, movie) -> score matrix holding this fold's test ratings.
        M_test = lil_matrix((_N, _M))
        # NOTE(review): rmse/mae are initialised but never updated, and nothing
        # is ever added to avg_rmse/avg_mae.
        rmse = 0.0
        mae = 0.0
        train_path = src_folder + base_file_name + TRAIN_PREFIX + str(
            fold_index) + EXT
        test_path = src_folder + base_file_name + TEST_PREFIX + str(
            fold_index) + EXT
        print train_path
        print test_path
        # Inside this function `svd` is a local variable that hides the function name.
        svd = SVD()
        svd.load_data(filename=train_path, sep=',', format={
            'col': 0,
            'row': 1,
            'value': 2,
            'ids': float
        })
        svd.compute(k=_K, min_values=1, pre_normalize=None, mean_center=True, post_normalize=True)
        with open(test_path, "r") as infile:
            reader = csv.reader(infile, delimiter=",")
            for line in reader:
                userid = int(line[0], 10)
                movieid = int(line[1], 10)
                score = float(line[2])
                M_test[userid, movieid] = score
        # Example of the evaluation API that is not wired in yet:
        # GROUND_TRUTH = [3.0, 1.0, 5.0, 2.0, 3.0]
        # TEST = [2.3, 0.9, 4.9, 0.9, 1.5]
        # mae = MAE()
        # mae.load_ground_truth(GROUND_TRUTH)
        # mae.load_test(TEST)
        # mae.compute() #returns 0.7
        # write predictions only for first test (fold)
        if (fold_index == 1):
            rows, cols = M_test.nonzero()
            # row is the user id and col the movie id (see M_test above);
            # predict() is called with (movie, user).
            for row, col in zip(rows, cols):
                try:
                    r_xi = svd.predict(col, row, MIN_RATING, MAX_RATING)
                except:
                    # NOTE(review): after a failed prediction r_xi is either
                    # unbound (NameError below) or still holds the previous
                    # pair's value, which is then written for this pair.
                    print row, col
                out_file.write(
                    str(row) + '\t' + str(col) + '\t' + str(r_xi) + '\n')
        print "..done"
        print ""
        # NOTE(review): exit() ends the whole process here, so out_file.close()
        # and the evaluation summary below are never reached.
        exit()
    out_file.close()
    # average rmse and mae on validation folds
    eval_out_path = src_folder + "output/" + out_file_base + "_eval" + EXT
    with open(eval_out_path, "w") as file:
        file.write("RMSE" + "\t" + "MAE" + "\n")
        avg_rmse /= float(NUM_FOLDS)
        avg_mae /= float(NUM_FOLDS)
        file.write(str(avg_rmse) + "\t" + str(avg_mae))
class RecommendationSystem():
    """Movie recommender combining a python-recsys SVD model with a Spark ALS model."""

    # To run on your own machine, you need to initialize with your datapath to the frontend folder
    def __init__(self, sc, datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/', rating_file='ratings_small.csv', complete_rating_file='ratings.csv', movie_file='movies.csv', detail_file='modified.csv', model='movielens_small'):
        """Load the SVD model, the saved ALS model and the Spark tables.

        sc -- the Spark context used for the RDDs, SQLContext and ALS model.
        datapath -- folder prefix prepended (by plain concatenation) to every
            file, model and table name.
        """
        self.sc = sc
        self.start = True
        self.rating_file = datapath+rating_file
        self.complete_rating_file = datapath+complete_rating_file
        self.movie_file = datapath+movie_file
        self.detail_file = datapath+detail_file
        self.integration_folder = datapath
        self.svd = SVD(filename=datapath+model)
        self.svd.load_data(filename=self.rating_file, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

        # als stuff
        self.sqlContext = SQLContext(self.sc)
        self.movie_data = self.sc.textFile(self.movie_file)
        self.ratings_data = self.sc.textFile(self.complete_rating_file).map(lambda line: line.split(",")).map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
        self.als_model_path = datapath + 'Model_Collaborative_Filtering'
        self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
        self.movie_df = self.sqlContext.read.load(datapath+'tables/movies')
        self.detail_df = self.sqlContext.read.load(datapath+'tables/detail')
        self.rating_df = self.sqlContext.read.load(datapath+'tables/ratings')

    @staticmethod
    def _first_column_contains(path, wanted):
        """Return True if some line of ``path`` has integer first field ``wanted``.

        The ``with`` block closes the file even when a line fails to parse.
        """
        with open(path, 'r') as handle:
            for line in handle:
                if int(line.split(',')[0]) == wanted:
                    return True
        return False

    # call this function to get all recommendations
    def get_all_recomm(self, userid, moviename):
        """Return [svd-for-user, svd-similar-movies, als-for-user] info lists.

        Each list is also printed to the terminal.  A list is empty when the
        underlying algorithm reports that the user/movie is unknown.
        """
        movieid = self.get_movie_id(moviename)

        # all recommendation algorithms return a list of movie ids
        recom1 = self.svd_recomm(userid, only_unknown=True)
        recom2 = self.svd_similar(movieid)
        recom3 = self.als_new(userid)

        # get info about the movie based on movie ids
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)

        # print to terminal (single-argument form prints identically on Python 2 and 3)
        for brief_infos in (brief_info1, brief_info2, brief_info3):
            for entry in brief_infos:
                print(entry)

        return [brief_info1, brief_info2, brief_info3]

    # get movie id based on movie name input
    def get_movie_id(self, moviename):
        """Return the id of the first movie whose name starts with ``moviename``.

        Falls back to movie id 1 when no name matches.
        """
        r = self.movie_df.where(self.movie_df['name'].startswith(moviename)).first()
        # return movie id 1 if not found
        if r is None:
            return 1
        return r['movieId']

    # svd recommendation algorithm based on the user's rating history,
    # set only_unknown to True for unseen movies
    def svd_recomm(self, userid, only_unknown):
        """Return the ids recommended by the SVD model for ``userid``.

        Returns None when ``userid`` has no line in the rating file.
        """
        if not self._first_column_contains(self.rating_file, userid):
            return None
        # output format: (movieid, similarity value)
        if only_unknown:
            # NOTE(review): users are loaded as matrix columns ('col': 0), yet
            # this branch passes is_row=True while the other passes
            # is_row=False -- confirm against the python-recsys recommend() docs.
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=True, is_row=True)
        else:
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=False, is_row=False)
        return self.get_id_list(similar_list)

    # svd recommendation algorithm based on similar movie
    def svd_similar(self, movieid):
        """Return ids of movies the SVD model rates as similar to ``movieid``.

        Returns None when ``movieid`` has no line in the movie file.
        """
        if not self._first_column_contains(self.movie_file, movieid):
            return None
        similar_list = self.svd.similar(movieid)
        return self.get_id_list(similar_list)

    # this ALS recommendation algorithm did not get to present to front end
    # future work is needed to improve this algorithm
    def als_recomm(self, userid):
        """Return up to 20 (movie id, predicted rating, rating count) tuples.

        Only movies with at least 20 ratings are kept, ordered by descending
        predicted rating.
        """
        user_movie_ratings = [16, 24, 32, 47, 50, 110, 150, 161, 165, 204, 223, 256, 260, 261, 277]
        # NOTE(review): self.movie_data comes straight from sc.textFile(), so
        # each x is a raw text line and x[0] its first character, not a parsed
        # movie id -- confirm and parse the line before relying on this filter.
        unrated_movies = self.movie_data.filter(lambda x: x[0] not in user_movie_ratings).map(lambda x: (userid, x[0]))
        recommended_movies_rdd = self.als_model.predictAll(unrated_movies)

        # Predictions for all the candidate movies; the best 20 with enough
        # ratings are selected below.
        user_recommended_ratings_rdd = recommended_movies_rdd.map(lambda x: (x.product, x.rating))
        movie_ID_with_ratings_RDD = self.ratings_data.map(lambda x: (x[1], x[2])).groupByKey()
        movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(get_counts_and_averages)
        movie_rating_counts_rdd = movie_ID_with_avg_ratings_RDD.map(lambda x: (x[0], x[1][0]))
        user_recommended_movies_ratings_count_rdd = (user_recommended_ratings_rdd.join(movie_rating_counts_rdd)).map(lambda l: (l[0], l[1][0], l[1][1]))
        recommended_movies_list = user_recommended_movies_ratings_count_rdd.filter(lambda l: l[2] >= 20).takeOrdered(20, key=lambda x: -x[1])
        return recommended_movies_list

    # an ALS recommendation algorithm based on user rating history
    def als_new(self, userid):
        """Return the ids of the 10 products the ALS model recommends for ``userid``."""
        recommended_movies = self.als_model.recommendProducts(userid, 10)
        return [movie[1] for movie in recommended_movies]

    # return a list of movie id
    def get_id_list(self, l):
        """Return the first element of every tuple in ``l``."""
        return [s[0] for s in l]

    # this function connects to imdb database to get info (including cover image)
    # did not make it to front end due to performance and latency issue
    # need future work for improvement
    def get_detail(self, movieid, imdb_id):
        """Fetch the IMDb record for ``imdb_id``; save its cover as Images/<movieid>.jpg."""
        m = self.ia.get_movie(str(imdb_id))
        cover = m.get('cover url')
        if cover:
            path = self.integration_folder + "Images/" + str(movieid) + ".jpg"
            urllib.urlretrieve(cover, path)
        return m

    # get a list of movie info given a list of movie ids
    def get_brief_list(self, movieList):
        """Return infos for the first 5 ids of ``movieList`` that have a known title.

        ``movieList`` may be None (the "not found" result of ``svd_recomm`` /
        ``svd_similar``); that yields an empty list instead of a TypeError.
        """
        info_list = []
        if movieList is None:
            return info_list
        for m in movieList:
            info = self.get_brief(m)
            if info['title'] != 'unknown':
                info_list.append(info)
            if len(info_list) == 5:
                break
        return info_list

    # get movie info (title, direction, genres, rating, cast) from our rdd database
    def get_brief(self, movieid):
        """Return a dict with movieid, title, genres, rating, director and cast.

        Missing table rows leave the 'unknown' placeholders in place; a movie
        without ratings gets the default rating 4.6.
        """
        info = {}
        info['movieid'] = movieid
        info['title'] = 'unknown'
        info['genres'] = 'unknown'
        info['rating'] = 0
        info['director'] = 'unknown'
        info['cast'] = 'unknown'

        m = self.movie_df.where(self.movie_df['movieId'] == movieid).first()
        if m is not None:
            info['title'] = m['name']
            info['genres'] = m['genres']
            # NOTE(review): this keeps the first three elements of `genres`;
            # if the column is a plain string that means three characters --
            # confirm the column type.
            if len(info['genres']) > 3:
                info['genres'] = info['genres'][0:3]

        d = self.detail_df.where(self.detail_df['movieId'] == movieid).first()
        if d is not None:
            info['director'] = d['director']
            info['cast'] = d['cast']

        r = self.rating_df.where(self.rating_df['movieId'] == movieid)
        # count once: every count() call runs a separate Spark job
        rating_count = r.count()
        if rating_count == 0:
            # default rating to be 4.6
            info['rating'] = 4.6
        else:
            rating_sum = r.map(lambda row:row['rating']).reduce(lambda x, y: x+y)
            info['rating'] = rating_sum / rating_count

        return info
import recsys.algorithm
from recsys.datamodel.data import Data
from recsys.algorithm.factorize import SVD
recsys.algorithm.VERBOSE = True

#Load a dataset
# NOTE(review): this first model is replaced by a fresh SVD() further down
# before compute() is called, so this load looks redundant -- confirm.
svd = SVD()
svd.load_data(filename='./data/ratings.dat', sep='::', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': int })

# Split the dataset into train and test parts
filename = './data/ratings.dat'
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
data.load(filename, sep='::', format=format)
train_80, test_20 = data.split_train_test(percent=80) # 80% train, 20% test
svd = SVD()
svd.set_data(train_80)

# Set the parameters used to build the matrix
k = 100
# NOTE(review): this call is cut off in the source -- the remaining
# arguments and the closing parenthesis are missing.
svd.compute(k=k, min_values=10,
import recsys.algorithm
import pandas as pd
from tqdm import tqdm
recsys.algorithm.VERBOSE = True
from recsys.algorithm.factorize import SVD

# Factorize the header-less user/item/count file: first CSV column -> matrix
# rows, second -> matrix columns, third -> values.
svd = SVD()
svd.load_data(filename='../input/user_item_cnt_noheader.csv', sep=',', format={
    'col': 1,
    'row': 0,
    'value': 2,
    'ids': int
})
k = 100
svd.compute(k=k, min_values=1, pre_normalize=None, mean_center=True, post_normalize=True, savefile='./tmp')

# Distinct user ids, taken from the variant of the file that has a header.
users = pd.read_csv('../input/user_item_cnt.csv', usecols=['user_id'])['user_id'].unique()

# Iterate over `users`; the loop used to reference the undefined name `user`
# and failed with a NameError before producing any recommendation.
for user_id in tqdm(users):
    # NOTE(review): the first CSV column is loaded as matrix rows; if that
    # column holds the user ids, is_row=False looks inverted -- confirm the
    # column order of the input file.
    ret = svd.recommend(user_id, 100, is_row=False)
    # NOTE(review): debugging breakpoint left in place on purpose; `ret` is
    # not stored anywhere yet, so removing it needs a decision on where the
    # recommendations should go.
    import pdb
    pdb.set_trace()