Esempio n. 1
0
 def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
     """Run SVD over the tastings and save the computed model to a file.

     The tastings are first written out in Movielens format (``::``
     separated), then loaded into a fresh ``SVD`` and decomposed.

     :param tastings: passed to ``save_tastings_to_movielens_format_file``.
     :param k: forwarded to ``SVD.compute`` as ``k``.
     :param min_values: forwarded to ``SVD.compute`` as ``min_values``.
     :param verbose: when true, sets ``recsys.algorithm.VERBOSE``.
     :return: the computed ``SVD`` object.
     """
     self.save_tastings_to_movielens_format_file(tastings)

     # handy while logging/testing
     if verbose:
         recsys.algorithm.VERBOSE = True

     svd = SVD()
     ratings_path = self.file_location(self.tastings_movielens_format)
     svd.load_data(filename=ratings_path, sep='::', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

     model_path = self.file_location(self.tastings_recsys_svd)
     svd.compute(k=k, min_values=min_values, pre_normalize=None, mean_center=True, post_normalize=True, savefile=model_path)
     return svd
def SVDloadData():
    """Load the ml-1m ratings file into a new SVD object and return it.

    Turns on ``recsys.algorithm.VERBOSE`` and prints the loaded matrix
    before returning.
    """
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    ratings_path = '/home/commons/RecSys/MOVIEDATA/MOVIEDATA/ml-1m/ratings.dat'
    svd.load_data(
        filename=ratings_path,
        sep='::',
        format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
    )
    print(svd.get_matrix())
    return svd
def recommend(dimension=100):
    """Show the games recommended for column id 1, best score first.

    Loads tab-separated ratings from ``rating.dat``, computes an SVD with
    ``dimension`` latent factors, looks up every recommended app id as a
    ``Game`` and, for each successfully resolved game, displays its image
    and prints its name, ordered by ``Game.rec`` descending.

    :param dimension: value passed to ``SVD.compute`` as ``k``.
    """
    svd = SVD()
    svd.load_data(filename='rating.dat',
                  sep='\t',
                  format={'col': 2, 'row': 1, 'value': 0, 'ids': int})
    svd.compute(k=dimension, min_values=1, pre_normalize=None,
                mean_center=True, post_normalize=True)

    # Keep the games in a list instead of a dict keyed by score, so two
    # games with the same ``rec`` value cannot overwrite one another.
    games = []
    for item in svd.recommend(1, is_row=False):
        game = Game(item[0])
        if game.success == 1:
            games.append((game.rec, game.name, game.img))
    games.sort(key=lambda entry: entry[0], reverse=True)

    print("Games Recommended:")
    for _rec, name, img in games:
        # image
        urllib.urlretrieve(img, "local-filename.jpg")
        image = plt.imread("local-filename.jpg")
        plt.imshow(image)
        plt.show()

        # name
        print(name)
Esempio n. 4
0
def setup():
    """Initialise the module-level ``users``, ``items`` and ``svd`` globals.

    ``items`` is read from ``movies.dat`` and the ratings in ``ratings.dat``
    are loaded into a new ``SVD``; both files live under
    ``MOVIELENS_DATA_PATH``. ``users`` starts as an empty list.
    """
    global users, items, svd

    print('Reading items...')
    movies_path = os.path.join(MOVIELENS_DATA_PATH, 'movies.dat')
    items = _read_items(movies_path)
    users = []

    svd = SVD()
    ratings_path = os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat')
    svd.load_data(
        filename=ratings_path,
        sep='::',
        format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
    )
Esempio n. 5
0
def getSVD():
    """Return the MovieLens SVD model, computing and saving it when absent.

    If a saved model already exists it is loaded; otherwise the ratings in
    ``./data/movielens/ratings.dat`` are factorised (k=100, min_values=10)
    and the result is saved under ``./model/movielens``.

    :return: an ``SVD`` instance.
    """
    model_path = './model/movielens'
    # The existence check now looks at the same location the model is
    # loaded from and saved to, rather than a machine-specific absolute
    # path. NOTE(review): assumes recsys stores the model as
    # ``<savefile>.zip``, as the original ``movielens.zip`` check implied.
    if os.path.exists(model_path + '.zip'):
        return SVD(model_path)

    svd = SVD()
    svd.load_data(filename='./data/movielens/ratings.dat', sep='::', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    k = 100
    svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile=model_path)
    return svd
Esempio n. 6
0
def calculate_SVD_features():
    """Compute and return an SVD over ``feature_matrix.csv``.

    Uses k=100, min_values=0, no mean centering and post-normalisation;
    ``recsys.algorithm.VERBOSE`` is switched on.
    """
    print("Thanks for input, calculating...")
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    features_path = 'feature_matrix.csv'
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(filename=features_path, sep=',', format=column_layout)
    svd.compute(
        k=100,
        min_values=0,
        pre_normalize=None,
        mean_center=False,
        post_normalize=True,
    )
    return svd
Esempio n. 7
0
def calculate_SVD_users():
    """Compute an SVD over ``user_data_working.csv`` and return it.

    Uses k=100, min_values=2 and mean centering. After the computation the
    working file is overwritten with a copy of ``user_data_original.csv``.
    """
    print("Thanks for input, calculating...")
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    working_path = 'user_data_working.csv'
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(filename=working_path, sep=',', format=column_layout)
    svd.compute(
        k=100,
        min_values=2,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
    )
    # restore the working copy from the original data
    shutil.copy('user_data_original.csv', 'user_data_working.csv')
    return svd
Esempio n. 8
0
def get_model(model_name,datasource_name,start,end,model_params):
    """Register ``model_name`` and make sure ``models`` holds an SVD for it.

    The model's metadata is recorded in ``model_data`` on first sight. If no
    file exists at ``model_dir + model_name`` a new SVD is initialised from
    the datasource CSV; otherwise the saved model is loaded, unless it is
    already present in ``models``.
    """
    if model_name not in model_data:
        model_data[model_name] = (datasource_name, start, end, model_params)

    model_path = model_dir + model_name
    if not os.path.exists(model_path):
        # initialize model with new data
        fresh = SVD()
        fresh.load_data(
            filename=data_dir + datasource_name + '.csv',
            sep=',',
            format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
        )
        models[model_name] = fresh
    elif model_name not in models:
        models[model_name] = SVD(filename=model_path)
Esempio n. 9
0
 def impute_to_file(self, tastings, k=100, min_values=2, verbose=True):
     """Decompose the tastings with SVD, saving the model, and return it.

     :param tastings: written out via
         ``save_tastings_to_movielens_format_file`` before loading.
     :param k: forwarded to ``SVD.compute`` as ``k``.
     :param min_values: forwarded to ``SVD.compute`` as ``min_values``.
     :param verbose: when true, sets ``recsys.algorithm.VERBOSE``.
     :return: the computed ``SVD`` object.
     """
     # dump the tastings as a Movielens-format data file
     self.save_tastings_to_movielens_format_file(tastings)
     # verbose output is useful for logging/testing
     if verbose:
         recsys.algorithm.VERBOSE = True
     svd = SVD()
     # load the source data, run the SVD and save it
     source_file = self.file_location(self.tastings_movielens_format)
     svd.load_data(
         filename=source_file,
         sep='::',
         format={
             'col': 0,
             'row': 1,
             'value': 2,
             'ids': int,
         },
     )
     outfile = self.file_location(self.tastings_recsys_svd)
     svd.compute(
         k=k,
         min_values=min_values,
         pre_normalize=None,
         mean_center=True,
         post_normalize=True,
         savefile=outfile,
     )
     return svd
def SVDloadData():
    """Return a new SVD loaded with the ml-1m ratings file.

    ``recsys.algorithm.VERBOSE`` is enabled and the loaded matrix is printed.
    """
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    ratings_path = '/home/commons/RecSys/MOVIEDATA/MOVIEDATA/ml-1m/ratings.dat'
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(filename=ratings_path, sep='::', format=column_layout)
    print(svd.get_matrix())
    return svd
Esempio n. 11
0
def setup():
    """Fill the ``users``, ``items`` and ``svd`` module globals.

    Reads the items from ``movies.dat``, resets ``users`` to an empty list
    and loads ``ratings.dat`` into a new ``SVD`` (both files are looked up
    under ``MOVIELENS_DATA_PATH``).
    """
    global users, items, svd

    print('Reading items...')
    items = _read_items(os.path.join(MOVIELENS_DATA_PATH, 'movies.dat'))
    users = []

    svd = SVD()
    ratings_path = os.path.join(MOVIELENS_DATA_PATH, 'ratings.dat')
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(filename=ratings_path, sep='::', format=column_layout)
Esempio n. 12
0
def Compute():
    """Factorise ``./ml-1m/ratings.dat`` and save the model to ``./mvsvd``.

    Uses k=100, min_values=10, mean centering and post-normalisation.
    """
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd = SVD()
    svd.load_data(filename='./ml-1m/ratings.dat', sep='::', format=column_layout)
    svd.compute(k=100, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='./mvsvd')
Esempio n. 13
0
def quickstart():
    """Walk through the basic recsys SVD workflow on the ml-1m ratings.

    Loads the ratings from ``DATA_DIR``, computes a k=100 SVD, stops in the
    debugger, then prints the items similar to item 1, the predicted and
    actual rating for item 1 / user 1, and the recommendations for item 1.
    """
    svd = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    ratings_path = DATA_DIR + 'ml-1m-ratings.dat'
    svd.load_data(filename=ratings_path, sep='::', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # compute svd
    rank = 100
    svd.compute(k=rank, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True)

    pdb.set_trace()

    # movie id's
    ITEMID1 = 1  # toy story
    ITEMID2 = 1221  # godfather II

    # movies similar to toy story
    similar_items = svd.similar(ITEMID1)
    print(similar_items)

    # predicted rating for user 1 and item 1, mapped onto [min, max]
    MIN_RATING, MAX_RATING = 0.0, 5.0
    USERID = 1
    ITEMID = 1
    pred = svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
    actual = svd.get_matrix().value(ITEMID, USERID)
    print('predicted rating = {0}'.format(pred))
    print('actual rating = {0}'.format(actual))

    print('which users should see Toy Story?:')
    recommended = svd.recommend(ITEMID)
    print(recommended)
Esempio n. 14
0
def get_model(model_name, datasource_name, start, end, model_params):
    """Ensure ``models[model_name]`` exists, creating or loading it.

    Metadata for an unseen model name is stored in ``model_data``. When a
    saved model exists at ``model_dir + model_name`` it is loaded (once);
    otherwise a new SVD is initialised from ``<datasource_name>.csv`` in
    ``data_dir``.
    """
    model_data.setdefault(model_name,
                          (datasource_name, start, end, model_params))

    saved_model = model_dir + model_name
    if os.path.exists(saved_model):
        if model_name not in models:
            models[model_name] = SVD(filename=saved_model)
        return

    # initialize model with new data
    svd = SVD()
    csv_path = data_dir + datasource_name + '.csv'
    svd.load_data(filename=csv_path, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    models[model_name] = svd
Esempio n. 15
0
def calculate_SVD_features():
    """Return an SVD computed from ``feature_matrix.csv``.

    k=100, min_values=0, no mean centering, post-normalised; verbose
    output from recsys is enabled.
    """
    print("Thanks for input, calculating...")
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = 'feature_matrix.csv'
    svd.load_data(filename=dat_file, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    svd.compute(k=100, min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    return svd
Esempio n. 16
0
def compute(aws_region, s3_bucket, filename, sep, col_index, row_index, value_index, ids_type):
    download_from_s3(aws_region, s3_bucket, filename)
    svd = SVD()

    print 'Loading data to SVD module'
    svd.load_data(filename='./data/' + filename,
                  sep=sep,
                  format={'col':int(col_index), 'row':int(row_index), 'value':int(value_index), 'ids': ids_type})

    k = derive_latent_dimensions(svd, energy_level=0.6)

    print 'Stating to compute SVD at ', strftime("%Y-%m-%d %H:%M:%S", gmtime())
    svd.compute(k=k,
                min_values=10,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True,
                savefile='./models/recommender')
    print "SVD model saved at ", strftime("%Y-%m-%d %H:%M:%S", gmtime())
    sys.exit() # to make sure that process finishes at the end
Esempio n. 17
0
def calculate_SVD_users():
    """Return an SVD computed from ``user_data_working.csv``.

    k=100, min_values=2, mean centred and post-normalised. Once computed,
    ``user_data_original.csv`` is copied over the working file.
    """
    print("Thanks for input, calculating...")
    svd = SVD()
    recsys.algorithm.VERBOSE = True
    dat_file = 'user_data_working.csv'
    svd.load_data(filename=dat_file, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    svd.compute(k=100, min_values=2, pre_normalize=None, mean_center=True, post_normalize=True)
    shutil.copy('user_data_original.csv', 'user_data_working.csv')
    return svd
Esempio n. 18
0
def loadSVD():
    """Factorise ``favRate.dat`` and print the offerings similar to 897346.

    The loaded data is saved to ``svd.dat``, an SVD with k=20 is computed
    (rows pre-normalised, no mean centering), the sparse matrix is printed
    and the ten most similar entries are printed together with their titles
    from ``swoffering.yaml``.
    """
    svd = SVD()
    svd.load_data(filename='favRate.dat', sep='::', format={'col': 0, 'row': 1, 'value': 2})

    svd.save_data("svd.dat", False)

    K = 20
    svd.compute(k=K, min_values=1, pre_normalize="rows", mean_center=False, post_normalize=True, savefile='.')

    sparse_matrix = svd.get_matrix()
    # NOTE(review): the similarity matrix is not used below; the call is
    # kept in case it has side effects on the model.
    svd.get_matrix_similarity()

    print(sparse_matrix)

    # 1173893,1396943
    sim = svd.similar(897346, 10)

    # Use a context manager so the titles file is always closed (the handle
    # was previously leaked).
    # NOTE(review): yaml.load can construct arbitrary objects; prefer
    # yaml.safe_load if this file is not fully trusted.
    with open('swoffering.yaml', 'r') as title_stream:
        title_list = yaml.load(title_stream)

    for offid, similar in sim:
        print('%s %s %s' % (offid, title_list[str(offid)], similar))
Esempio n. 19
0
def quickstart():
    """Run the recsys quickstart on ``ml-1m/ratings.dat``.

    Loads the ratings, computes a k=100 SVD, stops in the debugger, then
    queries similar items, prints the predicted and actual rating for
    item 1 / user 1 and asks for recommendations for item 1.
    """
    svd = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    ratings_path = 'ml-1m/ratings.dat'
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(filename=ratings_path, sep='::', format=column_layout)

    # compute svd
    rank = 100
    svd.compute(
        k=rank,
        min_values=10,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
    )

    pdb.set_trace()

    # movie id's
    ITEMID1 = 1      # toy story
    ITEMID2 = 1221   # godfather II

    # movies similar to toy story (result is not used)
    svd.similar(ITEMID1)

    # predicted rating for a given user & movie
    MIN_RATING, MAX_RATING = 0.0, 5.0
    USERID = 1
    ITEMID = 1
    pred = svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
    actual = svd.get_matrix().value(ITEMID, USERID)
    print('predicted rating = {0}'.format(pred))
    print('actual rating = {0}'.format(actual))

    # which users should see Toy Story? (result is not used)
    svd.recommend(ITEMID)
Esempio n. 20
0
def test_load_pickle():
    """Loading a pickled ratings matrix must give the SVD a ``Data`` object."""
    pickle_path = os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.pickle')
    svd = SVD()
    svd.load_data(pickle_path, pickle=True)
    loaded = svd.get_data()
    assert_true(isinstance(loaded, Data))
class Recommender:
    """Film recommender that factorises a stored rating matrix with SVD.

    Ratings are read from ``datafile_path`` as space-separated rows whose
    first three fields are loaded as matrix row, matrix column and value;
    predictions are then served from the resulting ``SVD`` model.
    """

    def __init__(self, datafile_path=None):
        """Restore the matrix from ``datafile_path`` and compute the model.

        The initial factorisation uses ``K=100`` and ``min_values=0``.
        """
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self,
                      filename,
                      film_names_with_rate_list,
                      K,
                      min_values,
                      MAX_COUNT_USER_FILMS=None,
                      MAX_COUNT_FILM_USERS=None):
        """Build the rating matrix from film titles, save it and recompute.

        :param filename: file the rating matrix is saved to; becomes the
            new ``datafile_path``.
        :param film_names_with_rate_list: passed to
            ``MatrixCreator.create_matrix_by_film_titles``.
        :param K: forwarded to ``SVD.compute``.
        :param min_values: forwarded to ``SVD.compute``.
        :param MAX_COUNT_USER_FILMS: passed to ``rm.MatrixCreator``.
        :param MAX_COUNT_FILM_USERS: passed to ``rm.MatrixCreator``.
        """
        creator = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS)
        self.matrix = creator.create_matrix_by_film_titles(
            film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        """Restore the rating matrix from ``filename`` and recompute the SVD.

        :param filename: data file; becomes the new ``datafile_path``.
        :param K: forwarded to ``SVD.compute``.
        :param min_values: forwarded to ``SVD.compute``.
        """
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self,
                                      min_rate=1,
                                      max_rate=10,
                                      top=None,
                                      K=None,
                                      min_values=0):
        """Predict a rating for every (user, film) pair.

        :param min_rate: lower bound passed to ``SVD.predict``.
        :param max_rate: upper bound passed to ``SVD.predict``.
        :param top: accepted for interface compatibility; not used here.
        :param K: when truthy, the SVD is recomputed with this ``K`` first.
        :param min_values: accepted for interface compatibility; not used.
        :return: numpy array indexed ``[user_index][film_index]``, also
            stored in ``self.predict_matrix``.
        """
        if K:
            self.__compute_matrix(K)

        users_map = self.matrix.users_indexes_map
        films_map = self.matrix.films_indexes_map
        self.predict_matrix = np.zeros((len(users_map), len(films_map)))
        for user_index in users_map.values():
            for film_index in films_map.values():
                self.predict_matrix[user_index][film_index] = self.svd.predict(
                    user_index,
                    film_index,
                    MIN_VALUE=min_rate,
                    MAX_VALUE=max_rate)
        return self.predict_matrix

    def predict_for_user(self,
                         user_index,
                         min_rate=1,
                         max_rate=10,
                         top=None,
                         repeat=False,
                         K=None,
                         min_values=None):
        """Predict the ratings of all films for one user.

        :param user_index: matrix index of the user.
        :param min_rate: lower bound passed to ``SVD.predict``.
        :param max_rate: upper bound passed to ``SVD.predict``.
        :param top: when truthy, return only the ``top`` best-rated films,
            best first.
        :param repeat: when false, films already present in the preferences
            of the first fake user are left out.
        :param K: to change the number of properties (recomputes the SVD).
        :param min_values: accepted for interface compatibility; not used.
        :return: ``{Film: rate, ...}`` when ``repeat`` is true and ``top``
            is not set, otherwise ``[(Film, rate), ...]``.
        """
        if K:
            self.__compute_matrix(K)

        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in range(np_matrix.shape[1]):
            rate = self.svd.predict(user_index,
                                    index,
                                    MIN_VALUE=min_rate,
                                    MAX_VALUE=max_rate)
            film = self.matrix.indexes_films_map[index]
            prediction[film] = rate

        if not repeat:
            fake_ids = self.matrix.indexes_with_fake_user_ids
            fake_user_index = list(fake_ids.keys())[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            films = user.get_preferences().keys()

            prediction = [(x, prediction[x]) for x in prediction
                          if x not in films]

        if top:
            # ``prediction`` is a list of pairs after the ``repeat`` filter
            # and a dict otherwise; calling ``.items()`` unconditionally
            # would raise AttributeError on the list.
            if isinstance(prediction, dict):
                pairs = prediction.items()
            else:
                pairs = prediction
            ranked = sorted(pairs, key=operator.itemgetter(1))
            prediction = list(reversed(ranked[-top:]))

        return prediction

    def predict_for_all_fake_users(self,
                                   min_rate=1,
                                   max_rate=10,
                                   top=None,
                                   K=None,
                                   min_values=0):
        """Run ``predict_for_user`` for every fake user.

        :param min_rate: lower bound passed to ``SVD.predict``.
        :param max_rate: upper bound passed to ``SVD.predict``.
        :param top: forwarded to ``predict_for_user``.
        :param K: to change the number of properties (recomputes the SVD).
        :param min_values: accepted for interface compatibility; not used.
        :return: list with one ``predict_for_user`` result per fake user.
        """
        if K:
            self.__compute_matrix(K)

        return [
            self.predict_for_user(user_index, min_rate, max_rate, top)
            for user_index in self.matrix.indexes_with_fake_user_ids
        ]

    def predicted_rating_submatrix(self, user_indexes):
        """Return predicted ratings (range 1..10) for the given users.

        The SVD is recomputed with ``K=100`` first.

        :param user_indexes: iterable of user matrix indexes.
        :return: numpy array with one row per user and one column per film.
        """
        self.__compute_matrix(100)
        film_count = self.matrix.rating_matrix.shape[1]
        # Seed row only fixes the width for np.append; it is sliced off
        # before returning.
        predicted = np.empty((1, film_count), int)
        for index in user_indexes:
            row = [
                self.svd.predict(index,
                                 film_index,
                                 MIN_VALUE=1,
                                 MAX_VALUE=10)
                for film_index in range(film_count)
            ]
            predicted = np.append(predicted, [row], axis=0)
        return predicted[1:]

    def predicted_rating_submatrix_for_fake(self):
        """Return ``predicted_rating_submatrix`` for all fake users."""
        return self.predicted_rating_submatrix(
            self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self,
                         K,
                         min_values=0,
                         pre_normalize=None,
                         mean_center=True,
                         post_normalize=True):
        """Reload ``datafile_path`` into the SVD and recompute it.

        All parameters are forwarded positionally to ``SVD.compute``; the
        model is not saved (``savefile=None``).
        """
        self.svd.load_data(self.datafile_path,
                           sep=' ',
                           format={
                               'col': 1,
                               'row': 0,
                               'value': 2,
                               'ids': int
                           })
        self.svd.compute(K,
                         min_values,
                         pre_normalize,
                         mean_center,
                         post_normalize,
                         savefile=None)

    def filter_films_data(self, min_user_votes):
        """Write copies of the data files without rarely rated films.

        Films with fewer than ``min_user_votes`` rows in the data file are
        dropped and the remaining films are re-indexed from 0. Three files
        are written next to ``datafile_path``, each with the suffix
        ``_<min_user_votes>``: the user map (plain copy), the film map and
        the ratings.

        :param min_user_votes: minimum number of ratings a film needs to
            be kept.
        """
        counter = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            for row in csv.reader(my_file):
                user_index, film_index, rate = row[0].split(' ')
                counter[int(film_index)] += 1

        # A set keeps the per-row membership tests below O(1).
        dropped_films = set(
            k for k, v in counter.items() if v < min_user_votes)

        filtered_path = self.datafile_path + '_' + str(min_user_votes)
        copyfile(self.datafile_path + '_user_map',
                 filtered_path + '_user_map')

        new_indexes = {}
        with open(self.datafile_path + '_film_map', 'rb') as read_file:
            r = csv.reader(read_file)
            with open(filtered_path + '_film_map', 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in r:
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in dropped_films:
                        continue
                    # keyed by the original index as read (a string)
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            r = csv.reader(read_file)
            with open(filtered_path, 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in r:
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in dropped_films:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
# This code can be run in real time but the model has to be pre-computed

import recsys.algorithm
from recsys.algorithm.factorize import SVD
# SVD recommendation only for unknown movies

# Verbose output from recsys
recsys.algorithm.VERBOSE = True

# Load the pre-computed model, then attach the ratings and build the matrix
svd = SVD(filename='movielens_small')
svd.load_data(
    filename='ratings_small.csv',
    sep=',',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
)
svd.create_matrix()
# Interactive lookup loop: on every pass the ratings file and the movie list
# (which maps movies to movie ids) are reopened and a user id is read from
# the prompt.
# NOTE(review): this snippet is cut off here; nothing visible closes the two
# files or ever sets ``loop`` to False.
loop = True

while (loop):
    ratings_file = open('ratings_small.csv', 'r+')
    movie_lens = open('movies.csv', 'r+')
    # flags for the user / movie lookups (the lookups are not in this excerpt)
    user_found = False
    movie_found = False
    # NOTE(review): under Python 2, input() evaluates the typed text as an
    # expression; confirm which interpreter this targets.
    USERID = int(input("Enter user id: "))
    # Check if the user_id exists. Since we are currently using the small database, every field has to be checked.
    # With the complete database it is enough to check that the number lies in the valid range.
Esempio n. 23
0
class NewsRec():
    """News recommender on top of a recsys ``SVD``, with a hit-rate test.

    Training data is tab-separated with the value in field 0, the matrix
    column id in field 1 and the matrix row id in field 2.
    """

    def __init__(self):
        self.svd = SVD()
        self.test_set = []
        # (user, recommended id) pairs produced by test(). Initialised here
        # so print_ret() works (writing only the header) before any test run.
        self.ret = []

    def load_data(self, filename='train_set_for_svd'):
        """Load the tab-separated training file into the SVD."""
        self.svd.load_data(filename,
                           sep='\t',
                           format={
                               'value': 0,
                               'row': 2,
                               'col': 1,
                               'ids': int
                           })

    def load_test(self, filename='test_set_for_svd'):
        """Append the (field 1, field 2) integer pairs of each line to ``test_set``."""
        with open(filename, 'r') as f:
            for line in f:
                strs = line.split('\t')
                self.test_set.append((int(strs[1]), int(strs[2])))

    def recom(self, user_id, recom_num=3, only_unknown=True):
        """Return ``recom_num`` recommendations for column id ``user_id``.

        :return: the list returned by ``SVD.recommend``, or ``-1`` when the
            lookup raises ``IndexError``.
        """
        try:
            return self.svd.recommend(user_id,
                                      recom_num,
                                      only_unknowns=only_unknown,
                                      is_row=False)
        except IndexError:
            return -1

    def compute(self, k=100):
        """Compute the SVD with ``k`` factors (no mean centering)."""
        self.svd.compute(k=k,
                         min_values=None,
                         pre_normalize=None,
                         mean_center=False,
                         post_normalize=True)

    def test(self, recom_num=3):
        """Recommend for every test pair and print hit statistics.

        All recommendations are collected in ``self.ret``. Precision is
        hits / (recom_num * len(test_set)) and recall is
        hits / len(test_set); nothing is printed when there are no hits.
        """
        hit_cnt = 0
        self.ret = []
        for user, item in self.test_set:
            recs = self.recom(user, recom_num)
            # recom() signals a failed lookup with -1 instead of a list
            if not isinstance(recs, list):
                continue
            for rec_index, _rec_rate in recs:
                self.ret.append((user, rec_index))
                if item == rec_index:
                    hit_cnt += 1
        if hit_cnt == 0:
            return
        user_sum = len(self.test_set)
        recom_sum = recom_num * user_sum
        precise = float(hit_cnt) / recom_sum
        recall = float(hit_cnt) / user_sum
        f = 2.0 / ((1.0 / precise) + (1.0 / recall))
        print('hit: %s' % hit_cnt)
        print('precise: %s' % precise)
        print('recall: %s' % recall)
        print('F: %s' % f)

    def print_ret(self, filename):
        """Write the collected (user, item) pairs to ``filename`` as CSV."""
        lines = ["userid,newsid\n"]
        lines.extend('%s,%s\n' % (user, item) for user, item in self.ret)
        with open(filename, 'w') as f:
            f.write("".join(lines))
class NewsRec():
    """SVD-based news recommender with a simple precision/recall check.

    The training file is tab-separated: field 0 holds the value, field 1
    the matrix column id and field 2 the matrix row id.
    """

    def __init__(self):
        self.svd = SVD()
        self.test_set = []
        # Filled by test(); created here so print_ret() does not fail with
        # AttributeError when it is called before test().
        self.ret = []

    def load_data(self, filename='train_set_for_svd'):
        """Load the tab-separated training file into the SVD."""
        self.svd.load_data(filename, sep='\t', format={'value': 0, 'row': 2, 'col': 1, 'ids': int})

    def load_test(self, filename='test_set_for_svd'):
        """Read (field 1, field 2) integer pairs from ``filename`` into ``test_set``."""
        with open(filename, 'r') as f:
            for line in f:
                strs = line.split('\t')
                self.test_set.append((int(strs[1]), int(strs[2])))

    def recom(self, user_id, recom_num=3, only_unknown=True):
        """Return the recommendations for column id ``user_id``.

        :return: the list from ``SVD.recommend``, or ``-1`` if it raises
            ``IndexError``.
        """
        try:
            return self.svd.recommend(user_id, recom_num, only_unknowns=only_unknown, is_row=False)
        except IndexError:
            return -1

    def compute(self, k=100):
        """Compute the SVD with ``k`` factors (no mean centering)."""
        self.svd.compute(k=k, min_values=None, pre_normalize=None, mean_center=False, post_normalize=True)

    def test(self, recom_num=3):
        """Evaluate the recommendations against ``test_set`` and print stats.

        Every recommendation is recorded in ``self.ret``. Precision is
        hits / (recom_num * len(test_set)), recall is hits / len(test_set);
        when there are no hits nothing is printed.
        """
        hit_cnt = 0
        self.ret = []
        for user, item in self.test_set:
            recs = self.recom(user, recom_num)
            # a non-list result (-1) means recom() could not look the id up
            if not isinstance(recs, list):
                continue
            for rec_index, _rec_rate in recs:
                self.ret.append((user, rec_index))
                if item == rec_index:
                    hit_cnt += 1
        if hit_cnt == 0:
            return
        user_sum = len(self.test_set)
        recom_sum = recom_num * user_sum
        precise = float(hit_cnt) / recom_sum
        recall = float(hit_cnt) / user_sum
        f = 2.0 / ((1.0 / precise) + (1.0 / recall))
        print('hit: %s' % hit_cnt)
        print('precise: %s' % precise)
        print('recall: %s' % recall)
        print('F: %s' % f)

    def print_ret(self, filename):
        """Write the recorded (user, item) pairs to ``filename`` as CSV."""
        lines = ["userid,newsid\n"]
        lines.extend('%s,%s\n' % (user, item) for user, item in self.ret)
        with open(filename, 'w') as f:
            f.write("".join(lines))
Esempio n. 25
0
#coding=utf-8
import recsys.algorithm

recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD

# Load the ratings CSV: file columns 0/1/2 feed matrix col/row/value
svd = SVD()
svd.load_data(
    filename='../data/movielens/ratings.csv',
    sep=',',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
)

# Alternative kept for reference: fit on a 70% training split only
#train,test=data.split_train_test(percent=70)
#svd=SVD()
#svd.set_data(train)

# Assume the number of singular values is 100
k = 100
svd.compute(
    k=k,
    min_values=1,
    pre_normalize=None,
    mean_center=False,
    post_normalize=True,
)
#svd.compute(k=k,min_values=10,pre_normalize=None,mean_center=True,post_normalize=True,savefile='/tmp/movielens')

# You can compute the similarity between two movies
Esempio n. 26
0
from evaluation.root_mean_square_error import RootMeanSquareError

__author__ = 'fpena'

from recsys.algorithm.factorize import SVD
svd = SVD()
# MovieLens alternative:
# file_name = '/Users/fpena/UCC/Thesis/datasets/ml-1m/ratings.dat'
# svd.load_data(filename=file_name, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
file_name = '/Users/fpena/tmp/reviews.csv'
file_name_header = '/Users/fpena/tmp/reviews-header.csv'
# Smaller sample:
# file_name = '/Users/fpena/tmp/small-reviews-matrix.csv'
# file_name_header = '/Users/fpena/tmp/small-reviews-header.csv'

# Reviews are '|'-separated and use string ids
svd.load_data(
    filename=file_name,
    sep='|',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': str},
)

k = 100
svd.compute(
    k=k,
    min_values=10,
    pre_normalize=None,
    mean_center=True,
    post_normalize=True,
)
# predicted_rating = svd.predict(int(5), 'A1', 1, 10)
# predicted_rating2 = svd.predict(int(1), 'A1', 1, 10)

# print('Predicted rating', predicted_rating)
# print('Predicted rating', predicted_rating2)
Esempio n. 27
0

# In[2]:

# Turn on verbose output from recsys
recsys.algorithm.VERBOSE = True


# In[3]:
# Load and format the data
svd = SVD()
recsys.algorithm.VERBOSE = True

# movielens ratings file
dat_file = './ml-1m/ratings.dat'
svd.load_data(
    filename=dat_file,
    sep='::',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': int},
)
# About the format parameter:
#   'row': 1   -> rows in the matrix come from column 1 of ratings.dat
#   'col': 0   -> cols in the matrix come from column 0 of ratings.dat
#   'value': 2 -> values (Mij) in the matrix come from column 2 of ratings.dat
#   'ids': int -> ids (row and col ids) are integers (not strings)

# In[4]:

# Compute the SVD
k = 100
svd.compute(
    k=k,
    min_values=10,
    pre_normalize=None,
    mean_center=True,
    post_normalize=True,
)

Esempio n. 28
0
from recsys.algorithm.factorize import SVD

# Training split of the small MovieLens set
# (alternative: "datasets/ml-1m/ratings.dat")
path = "datasets/ml-latest-small/ratings_train_1.csv"

svd = SVD()
# NOTE: ids are parsed as float for this file
svd.load_data(
    filename=path,
    sep=',',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': float},
)

k = 30
svd.compute(
    k=k,
    min_values=10,
    pre_normalize=None,
    mean_center=True,
    post_normalize=True,
    savefile='/tmp/movielens',
)

# ITEMID1 = 1    # Toy Story (1995)
# ITEMID2 = 2355 # A bug's life (1998)

# print svd.similarity(ITEMID1, ITEMID2)

# Rating bounds
MIN_RATING = 1.0
MAX_RATING = 5.0
Esempio n. 29
0
def test_load_pickle():
    """A pickled ratings matrix loads into the SVD as a ``Data`` instance."""
    svd = SVD()
    svd.load_data(
        os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.pickle'),
        pickle=True,
    )
    assert_true(isinstance(svd.get_data(), Data))
Esempio n. 30
0
model = RSVD.train(20, train, dims, probeArray=val,
                   learnRate=0.0005, regularization=0.005)

sqerr=0.0
for movieID,userID,rating in test:
    err = rating - model(movieID,userID)
    sqerr += err * err
sqerr /= test.shape[0]
print "Test RMSE: ", np.sqrt(sqerr)


##########
from recsys.algorithm.factorize import SVD
svd = SVD()
svd.load_data(filename='./data/behavior-ml.csv',
            sep='::',
            format={'col':0, 'row':1, 'value':2, 'ids': int})

k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True,
            savefile='/tmp/movielens')

ITEMID1 = 1    # Toy Story (1995)
ITEMID2 = 2355 # A bug's life (1998)

svd.similarity(ITEMID1, ITEMID2)
# 0.67706936677315799
def rectest():
    """Load the module-level ``filename`` into a new SVD; return a greeting."""
    column_layout = {"col": 0, "row": 1, "value": 2, "ids": int}
    svd = SVD()
    svd.load_data(filename, sep="::", format=column_layout)
    return 'Hello World!'

@app.route("/rec")
def rectest():
    """Handler for ``/rec``: load the module-level ``filename`` into an SVD.

    NOTE(review): nothing is returned here; a Flask view is normally
    expected to return a response.
    """
    column_layout = {"col": 0, "row": 1, "value": 2, "ids": int}
    svd = SVD()
    svd.load_data(filename, sep="::", format=column_layout)




if __name__ == '__main__':
    # Time one full load / factorise / recommend round trip.
    #app.run()
    import time
    start_time = time.time()

    svd = SVD()
    data = svd.load_data(
        filename,
        sep="::",
        format={"col": 0, "row": 1, "value": 2, "ids": int},
    )
    K = 100
    svd.compute(
        k=K,
        min_values=10,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
        savefile=None,
    )
    #r = svd.predict(200, 1, MIN_VALUE=0, MAX_VALUE=5.0)
    r = svd.recommend(1, n=10, only_unknowns=True, is_row=False)
    print(r)

    time_consumed = time.time() - start_time
    print(time_consumed)
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
svd = SVD()
# Inputs tried earlier: './data4', './data3.csv', './data2.csv',
# './data.csv', './data_l2.csv'
filename = './2016.6.29.for_svd.csv'
svd.load_data(
    filename=filename,
    sep=',',
    format={'col': 0, 'row': 1, 'value': 2, 'ids': str},
)
# col -> user, row -> item, value -> label, ids -> timestamp

k = 100
r = svd.compute(
    k=k,
    min_values=2,
    pre_normalize=None,
    mean_center=False,
    post_normalize=True,
    savefile='/tmp/movielens',
)

#ITEMID1 = 109    # Toy Story (1995)
#ITEMID2 = 106 # A bug's life (1998)

#print(svd.similarity(ITEMID1, ITEMID2))
# 0.67706936677315799

Esempio n. 34
0
class Recommender:
    """Predict film ratings from an SVD factorization of a rating file.

    ``self.matrix`` is the rating-matrix wrapper produced by
    ``rm.MatrixCreator``; ``self.svd`` performs the factorization and the
    individual predictions. ``self.datafile_path`` is the space-separated
    ``user_index film_index rate`` file the factorization is computed from.
    """

    def __init__(self, datafile_path=None):
        """Restore the matrix stored at *datafile_path* and factorize it.

        The initial factorization uses K=100 and min_values=0.
        """
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self, filename, film_names_with_rate_list, K, min_values,
                  MAX_COUNT_USER_FILMS=None, MAX_COUNT_FILM_USERS=None):
        """Build the matrix from film titles, save it to *filename*, factorize it.

        :param filename: where the freshly built rating matrix is written
        :param film_names_with_rate_list: handed to
            ``create_matrix_by_film_titles``
        :param K: first argument of ``svd.compute``
        :param min_values: second argument of ``svd.compute``
        :param MAX_COUNT_USER_FILMS: forwarded to ``rm.MatrixCreator``
        :param MAX_COUNT_FILM_USERS: forwarded to ``rm.MatrixCreator``
        """
        creator = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS)
        self.matrix = creator.create_matrix_by_film_titles(film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        """Restore a previously saved matrix from *filename* and factorize it."""
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self, min_rate=1, max_rate=10, top = None, K=None, min_values=0):
        """Fill and return ``self.predict_matrix`` with a prediction per (user, film).

        :param min_rate: passed to ``svd.predict`` as MIN_VALUE
        :param max_rate: passed to ``svd.predict`` as MAX_VALUE
        :param top: accepted for signature compatibility; not used here
        :param K: when truthy, the factorization is recomputed with this K first
        :param min_values: forwarded to the recomputation when *K* is given
        :return: numpy array indexed as ``[user_index][film_index]``
        """
        if K:
            # min_values used to be silently dropped here.
            self.__compute_matrix(K, min_values)

        users = self.matrix.users_indexes_map
        films = self.matrix.films_indexes_map
        self.predict_matrix = np.zeros((len(users), len(films)))
        for user in users.keys():
            user_index = users[user]
            for film in films.keys():
                film_index = films[film]
                self.predict_matrix[user_index][film_index] = self.svd.predict(
                    user_index, film_index, MIN_VALUE=min_rate, MAX_VALUE=max_rate)
        return self.predict_matrix


    def predict_for_user(self, user_index, min_rate=1, max_rate=10, top = None, repeat=False, K=None, min_values=None):
        """Predict a rate for every film column for the user at *user_index*.

        :param user_index: first argument handed to ``svd.predict``
        :param min_rate: passed to ``svd.predict`` as MIN_VALUE
        :param max_rate: passed to ``svd.predict`` as MAX_VALUE
        :param top: when truthy, keep only the *top* highest-rated entries,
            best first
        :param repeat: when false, films found in the preferences of the
            first fake user are removed from the result
        :param K: to change the number of properties (triggers a recompute)
        :param min_values: forwarded to the recompute; None is treated as 0
        :return: ``{Film: rate, ...}`` when *repeat* is true and *top* is
            falsy, otherwise ``[(Film, rate), ...]``
        """
        if K:
            self.__compute_matrix(K, min_values or 0)

        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in range(np_matrix.shape[1]):
            rate = self.svd.predict(user_index, index,
                                    MIN_VALUE=min_rate,
                                    MAX_VALUE=max_rate)
            film = self.matrix.indexes_films_map[index]
            prediction[film] = rate

        if not repeat:
            # NOTE(review): the films to hide always come from the FIRST fake
            # user, not from user_index -- confirm this is intended.
            fake_user_index = list(self.matrix.indexes_with_fake_user_ids.keys())[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            films = user.get_preferences().keys()

            prediction = [(x, prediction[x]) for x in prediction if x not in films]

        if top:
            # After the filter above `prediction` is already a list of pairs;
            # calling .items() on it unconditionally raised AttributeError.
            pairs = prediction.items() if isinstance(prediction, dict) else prediction
            ranked = sorted(pairs, key=operator.itemgetter(1))
            prediction = list(reversed(ranked[-top:]))

        return prediction

    def predict_for_all_fake_users(self, min_rate=1, max_rate=10, top = None, K=None, min_values=0):
        """Run :meth:`predict_for_user` for every fake user index.

        :param K: to change the number of properties (triggers a recompute)
        :param min_values: forwarded to the recompute when *K* is given
        :return: list with one :meth:`predict_for_user` result per fake user
        """
        if K:
            self.__compute_matrix(K, min_values)

        predictions = []

        for user_index in self.matrix.indexes_with_fake_user_ids.keys():
            prediction = self.predict_for_user(user_index, min_rate, max_rate, top)
            predictions.append(prediction)

        return predictions

    def predicted_rating_submatrix(self, user_indexes):
        """Return predicted rates (clamped to 1..10) for the given user indexes.

        The factorization is always recomputed with K=100 first. The result
        has one row per entry of *user_indexes* and one column per film.
        """
        self.__compute_matrix(100)
        film_count = self.matrix.rating_matrix.shape[1]
        # Collect plain rows and convert once instead of re-copying the whole
        # array with np.append for every user.
        rows = [
            [self.svd.predict(index, film_index, MIN_VALUE=1, MAX_VALUE=10)
             for film_index in range(film_count)]
            for index in user_indexes
        ]
        if not rows:
            return np.empty((0, film_count), int)
        return np.array(rows)

    def predicted_rating_submatrix_for_fake(self):
        """Return :meth:`predicted_rating_submatrix` for all fake user indexes."""
        return self.predicted_rating_submatrix(self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self, K,
                         min_values=0,
                         pre_normalize=None,
                         mean_center=True,
                         post_normalize=True):
        """Reload ``self.datafile_path`` into the SVD model and factorize it."""
        self.svd.load_data(self.datafile_path, sep=' ', format={'col': 1, 'row': 0, 'value': 2, 'ids': int})
        self.svd.compute(K, min_values, pre_normalize, mean_center, post_normalize, savefile=None)

    def filter_films_data(self, min_user_votes):
        """Write copies of the data files without rarely voted films.

        Films with fewer than *min_user_votes* lines in the data file are
        dropped and the remaining films are renumbered from 0. Three files
        are produced next to ``self.datafile_path``, each carrying the
        ``_<min_user_votes>`` suffix: the user map (plain copy), the film map
        and the rating data.

        NOTE(review): the files are opened in binary mode for the csv module,
        which is the Python 2 convention -- adjust if this runs on Python 3.
        """
        suffix = '_' + str(min_user_votes)

        votes = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            for row in csv.reader(my_file):
                user_index, film_index, rate = row[0].split(' ')
                votes[int(film_index)] += 1

        # A set keeps the per-row membership tests below O(1).
        dropped = set(k for k, v in votes.items() if v < min_user_votes)

        copyfile(self.datafile_path+'_user_map', self.datafile_path + suffix + '_user_map')

        new_indexes = {}
        with open(self.datafile_path+'_film_map', 'rb') as read_file:
            with open(self.datafile_path + suffix + '_film_map', 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in csv.reader(read_file):
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in dropped:
                        continue
                    # keyed by the original (string) index, as read from file
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            with open(self.datafile_path + suffix, 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in csv.reader(read_file):
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in dropped:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
Esempio n. 35
0
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
svd = SVD()
svd.load_data(filename='ml-1m/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
# Singular value decomposition (SVD) is the algorithm used to compute the model from the ratings file.
# It only needs to be run once; the computed model is saved as a zip file.
# The factorization is M = U(Sigma)V^T; the pyrecsys library provides the SVD implementation.
# Refer to the pyrecsys docs for more details on SVD.

import recsys.algorithm
from recsys.algorithm.factorize import SVD


# Make the library print verbose output.
recsys.algorithm.VERBOSE = True

# Create the SVD model.
svd = SVD()
# Load the ratings file. `format` says which field of each line supplies the matrix column, row and value.
svd.load_data(filename='ratings_complete.csv', sep=',' , format={'col':0, 'row':1,  'value':2, 'ids':int})
# Now compute the SVD: M = U(Sigma)V^T. savefile is the path given for persisting the model.
k = 100
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='movielens_complete')

print("Model Computed and Created")
Esempio n. 37
0
# -*- coding: utf-8 -*-

from recsys.algorithm.factorize import SVD
svd = SVD()

# 1. Load Movielens dataset:
#    each line is split on '::'; field 0 supplies the matrix column id,
#    field 1 the row id and field 2 the value; ids are parsed as int.
svd.load_data(filename='/home/andy/xx/recommend/ratings.dat',
            sep='::',
            format={'col':0, 'row':1, 'value':2, 'ids': int})


# 2. Compute Singular Value Decomposition (SVD), M=U Sigma V^t:
#    savefile is the path handed to compute() for persisting the model.
k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True,
            savefile='/tmp/movielens')

# 3. Get similarity between two movies:
ITEMID1 = 1    # Toy Story (1995)
ITEMID2 = 2355 # A bug's life (1998)

# Python 2 print statement; the value below is the output recorded by the author.
print svd.similarity(ITEMID1, ITEMID2)
# 0.67706936677315799


"""

# 4. Get movies similar to Toy Story:
Esempio n. 38
0
def SVDloadData():
    """Create an SVD model, enable recsys verbose output and load 'ratings.dat'.

    Returns the SVD instance with the '::'-separated data loaded; no
    factorization is computed here.
    """
    model = SVD()
    recsys.algorithm.VERBOSE = True
    ratings_layout = {'col':0, 'row':1, 'value':2, 'ids': int}
    model.load_data(filename='ratings.dat', sep='::', format=ratings_layout)
    return model
class RecommendationSystem():
    """Movie recommender backed by a pyrecsys SVD model and plain CSV files.

    ``rating_file`` lines are read as ``userid,movieid,rating,...``,
    ``movie_file`` lines as ``movieid,title,genres`` and ``detail_file``
    lines as ``movieid,imdb_id,<unused>,director,cast`` (genres and cast are
    '|'-separated). Fields are split on ',' only, so quoted commas are not
    supported.
    """

    def __init__(self, rating_file='ratings_small.csv', movie_file='movies.csv', detail_file='modified.csv', model='movielens_small'):
        """Load the saved SVD model *model* and the ratings in *rating_file*."""
        self.start = True
        self.rating_file = rating_file
        self.movie_file = movie_file
        self.detail_file = detail_file
        self.svd = SVD(filename=model)
        self.svd.load_data(filename=rating_file, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

    def get_all_recomm(self, userid, movieid):
        """Return three lists of brief movie infos.

        In order: recommendations for *userid* including known movies,
        recommendations restricted to unknown movies, and movies similar to
        *movieid*. A list is empty when the user or movie is not found.
        """
        recom1 = self.svd_recomm(userid, only_unknown=False)
        recom2 = self.svd_recomm(userid, only_unknown=True)
        recom3 = self.svd_similar(movieid)

        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)

        return [brief_info1, brief_info2, brief_info3]

    @staticmethod
    def _first_row(path, column, wanted):
        """Return the ','-split fields of the first line whose *column* equals *wanted*.

        Returns None when no line matches. The file is always closed, even
        when a line cannot be parsed (the parse error still propagates).
        """
        with open(path, 'r') as handle:
            for line in handle:
                fields = line.split(',')
                if int(fields[column]) == wanted:
                    return fields
        return None

    def svd_recomm(self, userid, only_unknown):
        """Return up to 10 recommended ids for *userid*, or None if the user has no rating."""
        if self._first_row(self.rating_file, 0, userid) is None:
            return None

        #output format: (movieid, similarity value)
        if only_unknown:
            # NOTE(review): users are loaded as matrix columns ('col': 0), yet
            # this branch passes is_row=True while the other passes
            # is_row=False -- confirm which one is intended.
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=True, is_row=True)
        else:
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=False, is_row=False)

        return self.get_id_list(similar_list)

    def svd_similar(self, movieid):
        """Return ids similar to *movieid*, or None if the movie is not in the movie file."""
        if self._first_row(self.movie_file, 0, movieid) is None:
            return None

        similar_list = self.svd.similar(movieid)
        return self.get_id_list(similar_list)

    def get_id_list(self, l):
        """Return the first element of every entry of *l*."""
        return [s[0] for s in l]

    def get_detail(self, imdb_id):
        """Fetch the IMDb record for *imdb_id*; save its cover under Images/ when present."""
        m = self.ia.get_movie(str(imdb_id))

        cover = m.get('cover url')
        if cover:
            path = "Images/" + str(imdb_id) + ".jpg"
            urllib.urlretrieve(cover, path)

        return m

    def get_brief_list(self, movieList):
        """Return :meth:`get_brief` for every id in *movieList*.

        *movieList* may be None (what svd_recomm / svd_similar return for an
        unknown user or movie); that yields an empty list instead of a crash.
        """
        if movieList is None:
            return []
        return [self.get_brief(m) for m in movieList]

    def get_brief(self, movieid):
        """Collect title, genre, rating, imdb id, director and cast for *movieid*.

        Fields that cannot be found keep their placeholder value. The rating
        is taken from the first rating line found for the movie.
        """
        info = {}
        info['title'] = 'unknown'
        info['genre'] = 'unknown'
        info['rating'] = 0
        info['imdb_id'] = 1
        info['director'] = 'unknown'
        info['cast'] = 'unknown'

        movie_row = self._first_row(self.movie_file, 0, movieid)
        if movie_row is not None:
            info['title'] = str(movie_row[1].strip())
            info['genre'] = str(movie_row[2].strip()).split('|')

        rating_row = self._first_row(self.rating_file, 1, movieid)
        if rating_row is not None:
            info['rating'] = float(rating_row[2].strip())

        detail_row = self._first_row(self.detail_file, 0, movieid)
        if detail_row is not None:
            info['imdb_id'] = int(detail_row[1].strip())
            info['director'] = str(detail_row[3].strip())
            info['cast'] = str(detail_row[4].strip()).split('|')

        return info
Esempio n. 40
0


from boto.s3.connection import S3Connection
import urllib2

db = DBConn()
conn = S3Connection('AKIAI6F6HFFENFWSPN4Q', 'aP0OOVDj96AFUEr9vbHalvvNZz7rNNXyyH0Wof7i')
bucket = conn.get_bucket('elasticbeanstalk-us-west-2-501394068089')

ld_occurrences_key = bucket.get_key('files/data/ld_occurrences.dat')
ld_occurrences_path = ld_occurrences_key.generate_url(3600, query_auth=True, force_http=True)
ld_occurrences_content = urllib2.urlopen(ld_occurrences_path).read()

svd = SVD()
svd.load_data(filename=ld_occurrences_content, sep='::', format={'col':0, 'row':1, 'value':2, 'ids': str})
all_items = svd.recommend(USER_ID, n=10, only_unknowns=False, is_row=False)
for index, relevance in all_items:
    print index, items[index].get_data()['name'], relevance

# genres = db.get_genres(USER_ID)
# if len(genres['genres']) > 0:
#     pred_items = myFunctions.get_items_user_genre(items, all_items, genres)[:50]
# else:
#     pred_items = all_items[:50]
#
# for index, relevance in pred_items:
#     print index, items[index].get_data()['name'], items[index].get_data()['genres'], relevance


#
# Fetch every book's key and title; the query result is split into one
# integer array of keys and one string array of titles.
cur.execute('SELECT key_id, title from Books6')
results = np.array(cur.fetchall())
bookkeys = np.array(results[:, 0], int)
booktitles = np.array(results[:, 1], str)

# (Re)create the table svdsimilarityscores6 with 3 columns: item_id1, item_id2, sim_cosine
# The table name is a constant defined here, not user input.
simtable = 'svdsimilarityscores6'
cur.execute('drop table if exists %s' % simtable)
cur.execute('CREATE TABLE %s (item_id1 INT NOT NULL, item_id2 INT NOT NULL, sim_cosine FLOAT NOT NULL)' % simtable)



# load rating data
svd = SVD()
svd.load_data(filename='./data/MERGED6.csv',
              sep=',',
              format={'row':0, 'col':1, 'value':2, 'ids': int})
# About format parameter:
#   'row': 0 -> Rows in matrix come from first column; itemkey_id
#   'col': 1 -> Cols in matrix come from second column; usrkey_id 
#   'value': 2 -> Values (Mij) in matrix come from third column
#   'ids': int -> Ids (row and col ids) are integers (not strings)
# Because the rows are items (not users), this is item based: the similarity scores are between items.


k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True,
Esempio n. 42
0
#!/usr/bin/env python
# coding=utf-8

from recsys.algorithm.factorize import SVD
# Train: load the tab-separated file (field 0 -> matrix column id,
# field 1 -> row id, field 2 -> value; ids kept as strings).
svd = SVD()
svd.load_data(filename='../invited_info_train_question_sort.txt',
              sep='\t',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': str
              })
# Factorize with k = 200, giving '../tmp/weight' as the save path.
k = 200
svd.compute(k=k, savefile='../tmp/weight')

svd2 = SVD(filename='../tmp/weight')  # Loading already computed SVD model

output_path = "./output.txt"
output_file = open(output_path, 'w')
# file() is the Python 2 builtin equivalent of open().
validate_file = file("../validate_nolabel.txt")
# The first line is read and immediately overwritten, i.e. skipped
# (presumably a header -- confirm); the second line is the first one kept.
line = validate_file.readline()
line = validate_file.readline().strip("\r\n")

while line:
    question_id = line.split(',')[0]
    user_id = line.split(',')[1]
    try:
        predict = svd2.predict(user_id, question_id, 0.0, 1.0)
    except:
        predict = 0
Esempio n. 43
0
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
# Train an SVD model on train.csv (field 0 -> matrix column id, field 1 -> row
# id, field 2 -> value) and write one prediction per row of test.csv.
svd = SVD()
svd.load_data(filename='train.csv', sep=',', format={'col':0, 'row':1, 'value':2})

k = 100
svd.compute(k=k, pre_normalize=None, mean_center=True, post_normalize=True)

# Bounds handed to svd.predict() for every prediction.
MIN_RATING = 0.0
MAX_RATING = 5000.0

import csv
test_file = 'test.csv'
soln_file = 'recsys.csv'

with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    # Skip the header line of the test file.
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            # `row_id` rather than `id`, so the builtin id() is not shadowed
            # at module level.
            row_id, user, artist = row[0], row[1], row[2]
            res = svd.predict(artist, user, MIN_RATING, MAX_RATING)
            soln_csv.writerow([row_id, res])
class RecommendationSystem():
    """Movie recommender combining a pyrecsys SVD model with a Spark ALS model.

    SVD recommendations are computed from ``rating_file``; ALS
    recommendations come from a saved ``MatrixFactorizationModel``; movie
    details are read from the Spark tables under ``datapath + 'tables/'``.
    """

    # To run on your own machine, you need to initialize with your datapath to the frontend folder
    def __init__(
            self,
            sc,
            datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/',
            rating_file='ratings_small.csv',
            complete_rating_file='ratings.csv',
            movie_file='movies.csv',
            detail_file='modified.csv',
            model='movielens_small'):
        """Load the SVD model, the ALS model and the Spark tables below *datapath*.

        :param sc: Spark context used for the RDDs, the SQL context and the
            ALS model
        :param datapath: folder prefix (with trailing separator) for every
            file name given below
        """
        self.sc = sc
        self.start = True
        self.rating_file = datapath + rating_file
        self.complete_rating_file = datapath + complete_rating_file
        self.movie_file = datapath + movie_file
        self.detail_file = datapath + detail_file
        self.integration_folder = datapath
        self.svd = SVD(filename=datapath + model)
        self.svd.load_data(filename=self.rating_file,
                           sep=',',
                           format={
                               'col': 0,
                               'row': 1,
                               'value': 2,
                               'ids': int
                           })
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

        # als stuff
        self.sqlContext = SQLContext(self.sc)
        self.movie_data = self.sc.textFile(self.movie_file)
        self.ratings_data = self.sc.textFile(
            self.complete_rating_file).map(lambda line: line.split(",")).map(
                lambda x: (int(x[0]), int(x[1]), float(x[2])))
        self.als_model_path = datapath + 'Model_Collaborative_Filtering'
        self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
        self.movie_df = self.sqlContext.read.load(datapath + 'tables/movies')
        self.detail_df = self.sqlContext.read.load(datapath + 'tables/detail')
        self.rating_df = self.sqlContext.read.load(datapath + 'tables/ratings')

    def get_all_recomm(self, userid, moviename):
        """Return [svd-by-user, svd-by-similar-movie, als-by-user] brief-info lists.

        Every brief info is also printed to the terminal. A list is empty
        when the user or movie is unknown to the SVD data files.
        """
        movieid = self.get_movie_id(moviename)

        # all recommendation algorithms return a list of movie ids
        recom1 = self.svd_recomm(userid, only_unknown=True)
        recom2 = self.svd_similar(movieid)
        recom3 = self.als_new(userid)

        #get info about the movie based on movie ids
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)

        # print to terminal (single-argument form: same output on Python 2 and 3)
        for info in brief_info1 + brief_info2 + brief_info3:
            print(info)

        return [brief_info1, brief_info2, brief_info3]

    def get_movie_id(self, moviename):
        """Return the id of the first movie whose name starts with *moviename*.

        Falls back to movie id 1 when nothing matches.
        """
        r = self.movie_df.where(
            self.movie_df['name'].startswith(moviename)).first()

        # return movie id 1 if not found
        if r is None:
            return 1

        return r['movieId']

    @staticmethod
    def _csv_has_id(path, column, wanted):
        """Return True if some line of *path* has int(field *column*) == *wanted*.

        The file is always closed, even when a line cannot be parsed (the
        parse error still propagates).
        """
        with open(path, 'r') as handle:
            for line in handle:
                if int(line.split(',')[column]) == wanted:
                    return True
        return False

    def svd_recomm(self, userid, only_unknown):
        """SVD recommendation based on the user's rating history.

        Set *only_unknown* to True to restrict the result to unknown movies.
        Returns a list of up to 10 ids, or None when *userid* has no line in
        the rating file.
        """
        if not self._csv_has_id(self.rating_file, 0, userid):
            return None

        # output format: (movieid, similarity value)
        if only_unknown:
            # NOTE(review): users are loaded as matrix columns ('col': 0), yet
            # this branch passes is_row=True while the other passes
            # is_row=False -- confirm which one is intended.
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=True,
                                              is_row=True)
        else:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=False,
                                              is_row=False)

        return self.get_id_list(similar_list)

    def svd_similar(self, movieid):
        """SVD recommendation based on a similar movie.

        Returns a list of ids, or None when *movieid* is not in the movie file.
        """
        if not self._csv_has_id(self.movie_file, 0, movieid):
            return None

        similar_list = self.svd.similar(movieid)
        return self.get_id_list(similar_list)

    # this ALS recommendation algorithm did not get to present to front end
    # future work is needed to improve this algorithm
    def als_recomm(self, userid):
        """Return up to 20 (movie, predicted rating, rating count) tuples for *userid*.

        Only movies with at least 20 ratings are kept, best prediction first.

        NOTE(review): ``self.movie_data`` is built with ``sc.textFile`` and is
        never split, so ``x[0]`` below is presumably the first character of a
        line rather than a movie id; ``get_counts_and_averages`` is not
        defined in this class. Both need checking before this is used.
        """
        user_movie_ratings = [
            16, 24, 32, 47, 50, 110, 150, 161, 165, 204, 223, 256, 260, 261,
            277
        ]
        unrated_movies = self.movie_data.filter(
            lambda x: x[0] not in user_movie_ratings).map(
                lambda x: (userid, x[0]))
        recommended_movies_rdd = self.als_model.predictAll(unrated_movies)
        # Now we get a list of predictions for all the movies which user has not seen. We take only the top 10 predictions
        user_recommended_ratings_rdd = recommended_movies_rdd.map(
            lambda x: (x.product, x.rating))

        movie_ID_with_ratings_RDD = self.ratings_data.map(
            lambda x: (x[1], x[2])).groupByKey()
        movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(
            get_counts_and_averages)
        movie_rating_counts_rdd = movie_ID_with_avg_ratings_RDD.map(
            lambda x: (x[0], x[1][0]))

        user_recommended_movies_ratings_count_rdd = (
            user_recommended_ratings_rdd.join(movie_rating_counts_rdd)
        ).map(lambda l: (l[0], l[1][0], l[1][1]))
        recommended_movies_list = user_recommended_movies_ratings_count_rdd.filter(
            lambda l: l[2] >= 20).takeOrdered(20, key=lambda x: -x[1])

        return recommended_movies_list

    def als_new(self, userid):
        """ALS recommendation based on user rating history: 10 product ids."""
        recommended_movies = self.als_model.recommendProducts(userid, 10)
        return [movie[1] for movie in recommended_movies]

    def get_id_list(self, l):
        """Return a list of movie ids: the first element of every entry of *l*."""
        return [s[0] for s in l]

    # this function connects to imdb database to get info (including cover image)
    # did not make it to front end due to performance and latency issue
    # need future work for improvement
    def get_detail(self, movieid, imdb_id):
        """Fetch the IMDb record for *imdb_id*; save its cover as Images/<movieid>.jpg."""
        m = self.ia.get_movie(str(imdb_id))

        cover = m.get('cover url')
        if cover:
            path = self.integration_folder + "Images/" + str(movieid) + ".jpg"
            urllib.urlretrieve(cover, path)

        return m

    def get_brief_list(self, movieList):
        """Return brief infos for at most the first 5 known movies of *movieList*.

        Movies whose title stays 'unknown' are skipped. *movieList* may be
        None (what svd_recomm / svd_similar return for an unknown user or
        movie); that yields an empty list instead of a crash.
        """
        info_list = []
        if movieList is None:
            return info_list

        for m in movieList:
            info = self.get_brief(m)
            if info['title'] != 'unknown':
                info_list.append(info)
            if len(info_list) == 5:
                break

        return info_list

    def get_brief(self, movieid):
        """Get movie info (title, director, genres, rating, cast) from the Spark tables.

        Fields that cannot be found keep their placeholder value; the rating
        defaults to 4.6 when the movie has no ratings, otherwise it is the
        average of its ratings.
        """
        info = {}
        info['movieid'] = movieid
        info['title'] = 'unknown'
        info['genres'] = 'unknown'
        info['rating'] = 0
        info['director'] = 'unknown'
        info['cast'] = 'unknown'

        m = self.movie_df.where(self.movie_df['movieId'] == movieid).first()
        if m is not None:
            info['title'] = m['name']
            info['genres'] = m['genres']
            if len(info['genres']) > 3:
                info['genres'] = info['genres'][0:3]

        d = self.detail_df.where(self.detail_df['movieId'] == movieid).first()
        if d is not None:
            info['director'] = d['director']
            info['cast'] = d['cast']

        r = self.rating_df.where(self.rating_df['movieId'] == movieid)
        # count() launches a Spark job, so run it once and reuse the result.
        rating_count = r.count()

        # default rating to be 4.6
        if rating_count == 0:
            info['rating'] = 4.6
        else:
            total = r.map(lambda row: row['rating']).reduce(
                lambda x, y: x + y)
            info['rating'] = total / rating_count

        return info
class RecommendationSystem():
    """Movie recommender backed by a pyrecsys SVD model and plain CSV files.

    ``rating_file`` lines are read as ``userid,movieid,rating,...``,
    ``movie_file`` lines as ``movieid,title,genres`` and ``detail_file``
    lines as ``movieid,imdb_id,<unused>,director,cast`` (genres and cast are
    '|'-separated). Fields are split on ',' only, so quoted commas are not
    supported.
    """

    def __init__(self,
                 rating_file='ratings_small.csv',
                 movie_file='movies.csv',
                 detail_file='modified.csv',
                 model='movielens_small'):
        """Load the saved SVD model *model* and the ratings in *rating_file*."""
        self.start = True
        self.rating_file = rating_file
        self.movie_file = movie_file
        self.detail_file = detail_file
        self.svd = SVD(filename=model)
        self.svd.load_data(filename=rating_file,
                           sep=',',
                           format={
                               'col': 0,
                               'row': 1,
                               'value': 2,
                               'ids': int
                           })
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

    def get_all_recomm(self, userid, movieid):
        """Return three lists of brief movie infos.

        In order: recommendations for *userid* including known movies,
        recommendations restricted to unknown movies, and movies similar to
        *movieid*. A list is empty when the user or movie is not found.
        """
        recom1 = self.svd_recomm(userid, only_unknown=False)
        recom2 = self.svd_recomm(userid, only_unknown=True)
        recom3 = self.svd_similar(movieid)

        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)

        return [brief_info1, brief_info2, brief_info3]

    @staticmethod
    def _first_row(path, column, wanted):
        """Return the ','-split fields of the first line whose *column* equals *wanted*.

        Returns None when no line matches. The file is always closed, even
        when a line cannot be parsed (the parse error still propagates).
        """
        with open(path, 'r') as handle:
            for line in handle:
                fields = line.split(',')
                if int(fields[column]) == wanted:
                    return fields
        return None

    def svd_recomm(self, userid, only_unknown):
        """Return up to 10 recommended ids for *userid*, or None if the user has no rating."""
        if self._first_row(self.rating_file, 0, userid) is None:
            return None

        #output format: (movieid, similarity value)
        if only_unknown:
            # NOTE(review): users are loaded as matrix columns ('col': 0), yet
            # this branch passes is_row=True while the other passes
            # is_row=False -- confirm which one is intended.
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=True,
                                              is_row=True)
        else:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=False,
                                              is_row=False)

        return self.get_id_list(similar_list)

    def svd_similar(self, movieid):
        """Return ids similar to *movieid*, or None if the movie is not in the movie file."""
        if self._first_row(self.movie_file, 0, movieid) is None:
            return None

        similar_list = self.svd.similar(movieid)
        return self.get_id_list(similar_list)

    def get_id_list(self, l):
        """Return the first element of every entry of *l*."""
        return [s[0] for s in l]

    def get_detail(self, imdb_id):
        """Fetch the IMDb record for *imdb_id*; save its cover under Images/ when present."""
        m = self.ia.get_movie(str(imdb_id))

        cover = m.get('cover url')
        if cover:
            path = "Images/" + str(imdb_id) + ".jpg"
            urllib.urlretrieve(cover, path)

        return m

    def get_brief_list(self, movieList):
        """Return :meth:`get_brief` for every id in *movieList*.

        *movieList* may be None (what svd_recomm / svd_similar return for an
        unknown user or movie); that yields an empty list instead of a crash.
        """
        if movieList is None:
            return []
        return [self.get_brief(m) for m in movieList]

    def get_brief(self, movieid):
        """Collect title, genre, rating, imdb id, director and cast for *movieid*.

        Fields that cannot be found keep their placeholder value. The rating
        is taken from the first rating line found for the movie.
        """
        info = {}
        info['title'] = 'unknown'
        info['genre'] = 'unknown'
        info['rating'] = 0
        info['imdb_id'] = 1
        info['director'] = 'unknown'
        info['cast'] = 'unknown'

        movie_row = self._first_row(self.movie_file, 0, movieid)
        if movie_row is not None:
            info['title'] = str(movie_row[1].strip())
            info['genre'] = str(movie_row[2].strip()).split('|')

        rating_row = self._first_row(self.rating_file, 1, movieid)
        if rating_row is not None:
            info['rating'] = float(rating_row[2].strip())

        detail_row = self._first_row(self.detail_file, 0, movieid)
        if detail_row is not None:
            info['imdb_id'] = int(detail_row[1].strip())
            info['director'] = str(detail_row[3].strip())
            info['cast'] = str(detail_row[4].strip()).split('|')

        return info
Esempio n. 46
0
def Compute():
    """Load './ml-1m/ratings.dat' into a new SVD model and factorize it.

    The factorization uses k=100 and is given './mvsvd' as its save path.
    Nothing is returned.
    """
    model = SVD()
    ratings_layout = {'col':0, 'row':1, 'value':2, 'ids': int}
    model.load_data(filename='./ml-1m/ratings.dat', sep='::', format=ratings_layout)
    factorization_options = dict(
        k=100,
        min_values=10,
        pre_normalize=None,
        mean_center=True,
        post_normalize=True,
        savefile='./mvsvd',
    )
    model.compute(**factorization_options)
Esempio n. 47
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: Li Zhijun


from recsys.algorithm.factorize import SVD


# Module-level model shared by the helper functions below: it is loaded and
# factorized once, when this module is imported.
svd = SVD()
svd.load_data(filename='ml-latest-small/ratings1.csv',
              sep=',',
              # format={'userId':0,'movieId':1,'rating':2,'ids':int})
              # The keys must be 'col'/'row'/'value'/'ids'; the line above is
              # a discarded attempt that used the CSV header names instead.
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

# Factorize with k = 100; '/tmp/movielens' is the path given for saving the model.
k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True,
            savefile='/tmp/movielens')


def get_items_similarity(item_id1, item_id2):
    """Return the similarity the module-level SVD model reports for two item ids."""
    score = svd.similarity(item_id1, item_id2)
    return score


def get_similar_items(item_id,n=10):
    """Return what the module-level SVD model's ``similar`` yields for *item_id*.

    *n* (default 10) is passed through as the second positional argument.
    """
    neighbours = svd.similar(item_id, n)
    return neighbours

Esempio n. 48
0
def svd(filepath):
    """Train an SVD model on a fold and write its test-set predictions.

    For each fold the training file is factorized, the matching test file is
    read into a sparse user x movie matrix, and (for fold 1 only) one
    prediction per non-zero test entry is written as ``user<TAB>movie<TAB>score``
    to ``<src_folder>output/<base>_pred_svd<EXT>``.

    NOTE(review): the loop still ends with ``exit()``, so the process stops
    after the first fold and the evaluation file below is never written
    (``avg_rmse`` / ``avg_mae`` are also never accumulated). This looks like a
    debugging shortcut; it is kept because removing it would change what
    callers observe.

    :param filepath: path handed to ``parseOutputFolderPath`` and
        ``parseFileName`` to derive the data folder and the base file name.
    """
    src_folder = parseOutputFolderPath(filepath)
    base_file_name = parseFileName(filepath)

    avg_rmse = 0.0
    avg_mae = 0.0

    out_file_base = base_file_name + "_pred_svd"
    out_path = src_folder + "output/" + out_file_base + EXT

    # The context manager guarantees the predictions are flushed and the
    # handle is closed even when exit() raises SystemExit inside the loop.
    with open(out_path, "w") as out_file:

        # for each fold
        for fold_index in xrange(1, NUM_FOLDS + 1):

            print("*** \t FOLD {0} \t ***".format(fold_index))

            M_test = lil_matrix((_N, _M))

            train_path = src_folder + base_file_name + TRAIN_PREFIX + str(
                fold_index) + EXT
            test_path = src_folder + base_file_name + TEST_PREFIX + str(
                fold_index) + EXT

            print(train_path)
            print(test_path)

            # Named ``model`` so the local does not shadow this function.
            model = SVD()
            model.load_data(filename=train_path,
                            sep=',',
                            format={
                                'col': 0,
                                'row': 1,
                                'value': 2,
                                'ids': float
                            })

            model.compute(k=_K,
                          min_values=1,
                          pre_normalize=None,
                          mean_center=True,
                          post_normalize=True)

            # Test ratings: rows are user ids, columns are movie ids.
            with open(test_path, "r") as infile:
                reader = csv.reader(infile, delimiter=",")
                for line in reader:
                    userid = int(line[0], 10)
                    movieid = int(line[1], 10)
                    score = float(line[2])
                    M_test[userid, movieid] = score

            # TODO: RMSE/MAE evaluation against the test fold is not
            # implemented; avg_rmse and avg_mae stay at 0.0.

            # write predictions only for first test (fold)
            if fold_index == 1:
                rows, cols = M_test.nonzero()
                for row, col in zip(rows, cols):
                    try:
                        # The model was loaded with movies as rows and users
                        # as columns, hence (col, row) here.
                        r_xi = model.predict(col, row, MIN_RATING, MAX_RATING)
                    except Exception:
                        # Report the pair that could not be predicted and skip
                        # it; previously the write below ran anyway with an
                        # undefined or stale r_xi.
                        print("%s %s" % (row, col))
                        continue
                    out_file.write(
                        str(row) + '\t' + str(col) + '\t' + str(r_xi) + '\n')

            print("..done")
            print("")

            exit()

    # average rmse and mae on validation folds
    eval_out_path = src_folder + "output/" + out_file_base + "_eval" + EXT

    with open(eval_out_path, "w") as eval_file:
        eval_file.write("RMSE" + "\t" + "MAE" + "\n")
        avg_rmse /= float(NUM_FOLDS)
        avg_mae /= float(NUM_FOLDS)
        eval_file.write(str(avg_rmse) + "\t" + str(avg_mae))
class RecommendationSystem():
    """Movie recommender combining a python-recsys SVD model with a Spark ALS model.

    The constructor loads a saved SVD model and the ratings matrix, an IMDb
    client, a saved ALS ``MatrixFactorizationModel`` and three Spark tables
    (movies, detail, ratings) from ``datapath``.
    """

    # To run on your own machine, you need to initialize with your datapath to the frontend folder
    def __init__(self, sc, datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/', rating_file='ratings_small.csv', complete_rating_file='ratings.csv', movie_file='movies.csv', detail_file='modified.csv', model='movielens_small'):
        """Load every model and data source used by the recommenders.

        :param sc: passed to ``SQLContext`` and ``MatrixFactorizationModel.load``
            (presumably a SparkContext - confirm with the caller).
        :param datapath: folder prefix prepended to every file/table name.
        :param rating_file: CSV used by the SVD model and the user lookup.
        :param complete_rating_file: CSV parsed into ``self.ratings_data``.
        :param movie_file: CSV used by the movie lookup.
        :param detail_file: CSV whose path is stored in ``self.detail_file``.
        :param model: file name of the saved SVD model.
        """
        self.sc = sc
        self.start = True
        self.rating_file = datapath+rating_file
        self.complete_rating_file = datapath+complete_rating_file
        self.movie_file = datapath+movie_file
        self.detail_file = datapath+detail_file
        self.integration_folder = datapath
        self.svd = SVD(filename=datapath+model)
        # CSV field 0 becomes the matrix column, field 1 the row, field 2 the value.
        self.svd.load_data(filename=self.rating_file, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

        # als stuff
        self.sqlContext = SQLContext(self.sc)
        self.movie_data = self.sc.textFile(self.movie_file)
        self.ratings_data = self.sc.textFile(self.complete_rating_file).map(lambda line: line.split(",")).map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
        self.als_model_path = datapath + 'Model_Collaborative_Filtering'
        self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
        self.movie_df = self.sqlContext.read.load(datapath+'tables/movies')
        self.detail_df = self.sqlContext.read.load(datapath+'tables/detail')
        self.rating_df = self.sqlContext.read.load(datapath+'tables/ratings')


    # call this function to get all recommendations
    def get_all_recomm(self, userid, moviename):
        """Run the three recommenders and return their brief-info lists.

        :returns: ``[svd_by_user, svd_by_movie, als_by_user]``; each entry is
            the list produced by ``get_brief_list`` (empty when the matching
            recommender found nothing).
        """
        movieid = self.get_movie_id(moviename)

        # all recommendation algorithms return a list of movie ids
        recom1 = self.svd_recomm(userid, only_unknown=True)
        recom2 = self.svd_similar(movieid)
        recom3 = self.als_new(userid)

        #get info about the movie based on movie ids
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)

        # print to terminal
        for l1 in brief_info1:
            print(l1)
        for l2 in brief_info2:
            print(l2)
        for l3 in brief_info3:
            print(l3)

        return [brief_info1, brief_info2, brief_info3]

    # get movie id based on movie name input
    def get_movie_id(self, moviename):
        """Return the ``movieId`` of the first movie whose name starts with ``moviename``."""
        r = self.movie_df.where(self.movie_df['name'].startswith(moviename)).first()

        # return movie id 1 if not found
        if r is None:
            return 1

        return r['movieId']

    # svd recommendation algorithm based on the user's rating history, set only_unknown to True for unseen movies
    def svd_recomm(self, userid, only_unknown):
        """Return ids recommended by the SVD model for ``userid``.

        :returns: list of the first element of every recommendation tuple, or
            ``None`` when ``userid`` never appears in field 0 of the rating file.
        """
        # The context manager closes the file even if a malformed row makes
        # int() raise; any() stops at the first match like the old break did.
        with open(self.rating_file, 'r') as ratings:
            user_found = any(
                int(rating_row.split(',')[0]) == userid
                for rating_row in ratings)

        if not user_found:
            return None

        # output format: (movieid, similarity value)
        # NOTE(review): the two branches pass different is_row values. The
        # matrix is loaded with CSV field 0 as 'col' and userid is matched
        # against field 0 above, so is_row=True looks suspicious - confirm
        # against the recsys documentation before relying on this branch.
        if only_unknown:
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=True, is_row=True)
        else:
            similar_list = self.svd.recommend(userid, n=10, only_unknowns=False, is_row=False)

        movieid_list = self.get_id_list(similar_list)
        return movieid_list

    # svd recommendation algorithm based on similar movie
    def svd_similar(self, movieid):
        """Return ids the SVD model considers similar to ``movieid``.

        :returns: list of ids, or ``None`` when ``movieid`` never appears in
            field 0 of the movie file.
        """
        with open(self.movie_file, 'r') as movies:
            movie_found = any(
                int(movie_row.split(',')[0]) == movieid
                for movie_row in movies)

        if not movie_found:
            return None

        similar_list = self.svd.similar(movieid)
        movieid_list = self.get_id_list(similar_list)
        return movieid_list

    # this ALS recommendation algorithm did not get to present to front end
    # future work is needed to improve this algorithm
    def als_recomm(self, userid):
        """Return up to 20 ``(movie, predicted rating, rating count)`` tuples.

        Only movies with at least 20 ratings are kept, ordered by descending
        predicted rating. Relies on ``get_counts_and_averages``, which is not
        defined in this class.
        """
        user_movie_ratings = [16, 24, 32, 47, 50, 110, 150, 161, 165, 204, 223, 256, 260, 261, 277]
        unrated_movies = self.movie_data.filter(lambda x: x[0] not in user_movie_ratings).map(lambda x: (userid, x[0]))
        recommended_movies_rdd = self.als_model.predictAll(unrated_movies)
        # Now we get a list of predictions for all the movies which user has not seen. We take only the top 10 predictions
        user_recommended_ratings_rdd = recommended_movies_rdd.map(lambda x: (x.product, x.rating))

        movie_ID_with_ratings_RDD = self.ratings_data.map(lambda x: (x[1], x[2])).groupByKey()
        movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(get_counts_and_averages)
        movie_rating_counts_rdd = movie_ID_with_avg_ratings_RDD.map(lambda x: (x[0], x[1][0]))

        user_recommended_movies_ratings_count_rdd = (user_recommended_ratings_rdd.join(movie_rating_counts_rdd)).map(lambda l: (l[0], l[1][0], l[1][1]))
        recommended_movies_list = user_recommended_movies_ratings_count_rdd.filter(lambda l: l[2] >= 20).takeOrdered(20, key=lambda x: -x[1])

        return recommended_movies_list

    # an ALS recommendation algorithm based on user rating history
    def als_new(self, userid):
        """Return element 1 of each of the 10 ALS recommendations for ``userid``."""
        recommended_movies = self.als_model.recommendProducts(userid, 10)
        return [movie[1] for movie in recommended_movies]

    # return a list of movie id
    def get_id_list(self, l):
        """Return the first element of every item in ``l`` as a list."""
        return [s[0] for s in l]

    # this function connects to imdb database to get info (including cover image)
    # did not make it to front end due to performance and latency issue
    # need future work for improvement
    def get_detail(self, movieid, imdb_id):
        """Fetch a movie from IMDb and download its cover image when it has one.

        The cover is saved as ``<integration_folder>Images/<movieid>.jpg``.

        :returns: the object returned by ``self.ia.get_movie``.
        """
        m = self.ia.get_movie(str(imdb_id))

        cover = m.get('cover url')
        if cover:
            path = self.integration_folder + "Images/" + str(movieid) + ".jpg"
            urllib.urlretrieve(cover, path)

        return m

    # get a list of movie info given a list of movie ids
    def get_brief_list(self, movieList):
        """Return brief info for at most five known movies from ``movieList``.

        ``None`` (what the SVD recommenders return when the user or movie is
        not found) is treated as an empty list instead of raising TypeError.
        """
        info_list = []
        if movieList is None:
            return info_list

        for m in movieList:
            info = self.get_brief(m)
            if info['title'] != 'unknown':
                info_list.append(info)
            if len(info_list) == 5:
                break

        return info_list

    # get movie info (title, direction, genres, rating, cast) from our rdd database
    def get_brief(self, movieid):
        """Build the info dict for one movie from the Spark tables.

        :returns: dict with keys ``movieid``, ``title``, ``genres``,
            ``rating``, ``director`` and ``cast``; text fields stay
            ``'unknown'`` when the movie is missing from a table.
        """
        info = {}
        info['movieid'] = movieid
        info['title'] = 'unknown'
        info['genres'] = 'unknown'
        info['rating'] = 0
        #info['imdbid'] = 1
        info['director'] = 'unknown'
        info['cast'] = 'unknown'

        m = self.movie_df.where(self.movie_df['movieId'] == movieid).first()
        if m is not None:
            info['title'] = m['name']
            info['genres'] = m['genres']
            # keep at most the first three entries
            if len(info['genres']) > 3:
                info['genres'] = info['genres'][0:3]

        d = self.detail_df.where(self.detail_df['movieId'] == movieid).first()
        if d is not None:
            info['director'] = d['director']
            info['cast'] = d['cast']

        r = self.rating_df.where(self.rating_df['movieId'] == movieid)

        # Count once: every count() call triggers a separate Spark job.
        rating_count = r.count()

        # default rating to be 4.6
        if rating_count == 0:
            info['rating'] = 4.6
        else:
            total = r.map(lambda row: row['rating']).reduce(lambda x, y: x + y)
            info['rating'] = total / rating_count

        return info
Esempio n. 50
0
import recsys.algorithm
from recsys.datamodel.data import Data
from recsys.algorithm.factorize import SVD

recsys.algorithm.VERBOSE = True

# Load the dataset once. The previous version also built an SVD directly from
# the same file and then discarded it before any use, reading the ratings twice.
filename = './data/ratings.dat'
# Field 0 -> matrix column, field 1 -> row, field 2 -> value; ids parsed as ints.
# NOTE: ``format`` shadows the builtin; the name is kept in case later code in
# this script refers to it.
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
data = Data()
data.load(filename, sep='::', format=format)

# Split the dataset
train_80, test_20 = data.split_train_test(percent=80)  # 80% train, 20% test

# The model is trained on the 80% split only.
svd = SVD()
svd.set_data(train_80)

#Set the parameters used to build the matrix
k = 100
svd.compute(k=k,
            min_values=10,
Esempio n. 51
0
import recsys.algorithm
import pandas as pd
from tqdm import tqdm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
# Number of latent factors kept by the decomposition.
k = 100

# Headerless CSV: field 0 -> matrix row, field 1 -> column, field 2 -> value;
# ids are parsed as ints.
svd = SVD()
svd.load_data(
    filename='../input/user_item_cnt_noheader.csv',
    sep=',',
    format={'col': 1, 'row': 0, 'value': 2, 'ids': int},
)

# Factorize and persist the model to ./tmp.
svd.compute(
    k=k,
    min_values=1,
    pre_normalize=None,
    mean_center=True,
    post_normalize=True,
    savefile='./tmp',
)

import pdb

# Distinct values of the 'user_id' column of the headered CSV.
users = pd.read_csv('../input/user_item_cnt.csv',
                    usecols=['user_id'])['user_id'].unique()

# Iterate over ``users``; the loop previously referenced the undefined name
# ``user`` and failed with NameError before producing any recommendation.
# NOTE(review): the matrix above is loaded with CSV field 0 as 'row'; if that
# field is the user id, is_row=False treats user_id as a column - confirm.
for user_id in tqdm(users):
    ret = svd.recommend(user_id, 100, is_row=False)
    # Debugging breakpoint kept from the original: inspect ``ret`` per user.
    pdb.set_trace()