Exemple #1
0
    def fit(self, reviews_pth, movies_pth, latent_features=12, learning_rate=0.0001, iters=100):
        '''
        Train the recommender: factorize the user-item rating matrix with a basic,
        unregularized FunkSVD and build the ranked-movies table.

        INPUT:
        reviews_pth - path to csv with at least the four columns: 'user_id', 'movie_id', 'rating', 'timestamp'
        movies_pth - path to csv with each movie and movie information in each row
        latent_features - (int) the number of latent features used
        learning_rate - (float) the step size of each gradient update
        iters - (int) the number of passes over the rating matrix

        OUTPUT:
        None - stores the following as attributes:
        reviews - dataframe read from reviews_pth
        movies - dataframe read from movies_pth
        user_item_df - dataframe indexed by user_id with one column per movie_id
        user_item_mat - (np array) user_item_df as an array; cells without a rating are nan
        latent_features, learning_rate, iters - the hyperparameters given above
        n_users - the number of rows of user_item_mat (int)
        n_movies - the number of columns of user_item_mat (int)
        num_ratings - the number of non-nan cells of user_item_mat (int)
        user_ids_series - (np array) the user ids, in row order
        movie_ids_series - (np array) the movie ids, in column order
        user_mat - (np array) n_users x latent_features factor matrix
        movie_mat - (np array) latent_features x n_movies factor matrix
        ranked_movies - the result of rf.create_ranked_df(movies, reviews)
        '''
        # Load both datasets
        self.reviews = pd.read_csv(reviews_pth)
        self.movies = pd.read_csv(movies_pth)

        # Pivot the reviews into a user x movie table; when a user rated the
        # same movie more than once, the highest rating is kept
        rating_cols = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
        best_rating = rating_cols.groupby(['user_id', 'movie_id'])['rating'].max()
        self.user_item_df = best_rating.unstack()
        self.user_item_mat = np.array(self.user_item_df)

        # Remember the hyperparameters
        self.latent_features = latent_features
        self.learning_rate = learning_rate
        self.iters = iters

        # Dimensions and id lookups
        self.n_users, self.n_movies = self.user_item_mat.shape
        self.num_ratings = np.count_nonzero(~np.isnan(self.user_item_mat))
        self.user_ids_series = np.array(self.user_item_df.index)
        self.movie_ids_series = np.array(self.user_item_df.columns)

        # Random starting point for both factor matrices
        user_factors = np.random.rand(self.n_users, self.latent_features)
        movie_factors = np.random.rand(self.latent_features, self.n_movies)

        # Header of the progress table
        print("Optimizaiton Statistics")
        print("Iterations | Mean Squared Error ")

        for epoch in range(1, self.iters + 1):

            # Sum of squared errors collected during this pass
            squared_error_sum = 0

            for u, user_ratings in enumerate(self.user_item_mat):
                for m, rating in enumerate(user_ratings):

                    # nan (missing) and non-positive cells are skipped
                    if not rating > 0:
                        continue

                    # Error = actual rating minus the predicted rating
                    residual = rating - np.dot(user_factors[u, :], movie_factors[:, m])
                    squared_error_sum += residual**2

                    # Gradient step on all latent features at once. The movie
                    # update deliberately reads the user factors that were just
                    # updated, exactly like a feature-by-feature loop would.
                    user_factors[u, :] += self.learning_rate * (2*residual*movie_factors[:, m])
                    movie_factors[:, m] += self.learning_rate * (2*residual*user_factors[u, :])

            # Report the mean squared error of this pass
            print("%d \t\t %f" % (epoch, squared_error_sum / self.num_ratings))

        # SVD based fit: keep the learned factors
        self.user_mat = user_factors
        self.movie_mat = movie_factors

        # Knowledge based fit
        self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
Exemple #2
0
    def fit(self,
            reviews_loc,
            movies_loc,
            latent_features=15,
            n_iter=100,
            learning_rate=0.001):
        '''
        Fit the recommender to the dataset and keep the results on the instance
        so they can be used later to make predictions.

        The user-item matrix is factorized with FunkSVD (stochastic gradient
        descent over the observed ratings, no regularization).

        :param reviews_loc: path to the reviews dataset (str); must contain the
            columns 'user_id', 'movie_id', 'rating', 'timestamp'
        :param movies_loc: path to the movies dataset (str)
        :param latent_features: number of latent features to keep (int)
        :param n_iter: number of iterations (int)
        :param learning_rate: the learning rate (float)

        :returns None

        Attributes stored:
            movies, reviews - the two dataframes (without an 'Unnamed: 0' column)
            train_df - the four review columns used for training
            user_item_df - user_id x movie_id dataframe of ratings
            user_item_matrix - (np array) user_item_df as an array, nan = no rating
            latent_features, learning_rate, iter - the hyperparameters
            n_users, n_movies - the dimensions of user_item_matrix
            n_ratings - the number of non-nan cells of user_item_matrix
            movie_ids, user_ids - (np arrays) ids in column / row order
            user_mat - (np array) n_users x latent_features factor matrix
            movie_mat - (np array) latent_features x n_movies factor matrix
            ranked_movies - the result of rf.create_ranked_df(movies, reviews)
        '''

        # Read in the datasets. 'Unnamed: 0' is presumably a leftover index
        # column from an earlier DataFrame.to_csv call; drop it when present
        # instead of raising KeyError on files that do not have it.
        self.movies = pd.read_csv(movies_loc).drop(
            columns=['Unnamed: 0'], errors='ignore')
        self.reviews = pd.read_csv(reviews_loc).drop(
            columns=['Unnamed: 0'], errors='ignore')

        # Create user-by-item matrix; duplicate (user, movie) reviews collapse
        # to their highest rating
        self.train_df = self.reviews[[
            'user_id', 'movie_id', 'rating', 'timestamp'
        ]]
        self.user_item_df = self.train_df.groupby(
            ['user_id', 'movie_id'])['rating'].max().unstack()

        self.user_item_matrix = np.array(self.user_item_df)
        self.latent_features = latent_features
        self.learning_rate = learning_rate
        self.iter = n_iter

        # Set up useful values to be used through the rest of the function
        self.n_users = self.user_item_matrix.shape[0]
        self.n_movies = self.user_item_matrix.shape[1]
        self.n_ratings = np.count_nonzero(~np.isnan(self.user_item_matrix))
        self.movie_ids = np.array(self.user_item_df.columns)
        self.user_ids = np.array(self.user_item_df.index)

        # initialize the user and movie matrices with random values
        user_mat = np.random.rand(self.n_users, self.latent_features)
        movie_mat = np.random.rand(self.latent_features, self.n_movies)

        # header for running results
        print("Optimization Statistics")
        print("Iterations | Mean Squared Error ")

        # for each iteration
        for i in range(n_iter):
            # sum of squared errors of this iteration
            sse_accum = 0

            # For each user-movie pair
            for user in range(self.n_users):
                for movie in range(self.n_movies):
                    # if the rating exists (nan > 0 is False, so missing
                    # ratings are skipped)
                    if self.user_item_matrix[user, movie] > 0:
                        # compute the error as the actual minus the dot product
                        # of the user and movie latent features
                        prediction = np.dot(user_mat[user], movie_mat[:, movie])
                        diff = self.user_item_matrix[user, movie] - prediction

                        # Keep track of the sum of squared errors for the matrix
                        sse_accum += diff**2

                        # update the values in each matrix in the direction of
                        # the gradient; the movie update uses the user factors
                        # that were just updated
                        user_mat[user] += (
                            learning_rate * 2 * diff * movie_mat[:, movie])
                        movie_mat[:, movie] += (
                            learning_rate * 2 * diff * user_mat[user])

            # print results for iteration
            print("%d \t\t %f" % (i + 1, sse_accum / self.n_ratings))

        # FunkSVD solution
        # Storing the user mat and movie mat
        self.user_mat = user_mat
        self.movie_mat = movie_mat

        # Knowledge base solution
        self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
Exemple #3
0
    def fit(self,
            trainpath,
            moviepath,
            latent_features=15,
            learning_rate=0.005,
            iters=100):
        '''
        Fit the recommender to the dataset and keep the results on the instance
        so they are available when predictions are needed.

        INPUT:
        trainpath - path to the training reviews csv; the columns 'user_id',
                    'movie_id', 'rating' and 'timestamp' are read from it
        moviepath - path to the movies csv
        latent_features - (int) the number of latent features used (default 15)
        learning_rate - (float) the learning rate (default 0.005)
        iters - (int) the number of iterations (default 100)

        OUTPUT: None
        attributes:
        train_df - reviews dataframe
        movies - movies dataframe
        train_data_df - reviews unstacked into a user_id x movie_id dataframe
        ratings_mat - (numpy array) train_data_df as an array, nan where no rating
        n_users - number of rows of ratings_mat
        n_movies - number of columns of ratings_mat
        num_ratings - number of non-nan cells of ratings_mat
        user_mat - (numpy array) a user by latent feature matrix
        movie_mat - (numpy array) a latent feature by movie matrix
        ranked_movies - result of rf.create_ranked_df; documented by the
                        original author as movies sorted by highest avg rating,
                        more reviews, then time, with more than 4 ratings
        '''
        self.train_df = pd.read_csv(trainpath)
        self.movies = pd.read_csv(moviepath)

        # Build the user-by-item table; repeated reviews of the same movie by
        # the same user are reduced to the highest rating
        ratings_long = self.train_df[['user_id', 'movie_id', 'rating', 'timestamp']]
        grouped_ratings = ratings_long.groupby(['user_id', 'movie_id'])['rating']
        self.train_data_df = grouped_ratings.max().unstack()
        self.ratings_mat = np.array(self.train_data_df)

        # Dimensions and number of observed ratings
        self.n_users, self.n_movies = self.ratings_mat.shape
        self.num_ratings = np.count_nonzero(~np.isnan(self.ratings_mat))

        # Random initial factor matrices
        user_mat = np.random.rand(self.n_users, latent_features)
        movie_mat = np.random.rand(latent_features, self.n_movies)

        # Progress table header
        print("Optimizaiton Statistics")
        print("Iterations | Mean Squared Error ")

        for epoch in range(iters):

            # squared error collected over this pass
            total_sq_err = 0

            # np.ndenumerate walks the matrix row by row, column by column
            for (row, col), observed in np.ndenumerate(self.ratings_mat):

                # only cells holding a positive rating take part (nan fails the test)
                if observed > 0:

                    # actual rating minus predicted rating
                    err = observed - np.dot(user_mat[row, :], movie_mat[:, col])
                    total_sq_err += err**2

                    # one gradient step over every latent feature; the movie
                    # factors are updated from the freshly updated user factors
                    user_mat[row, :] += learning_rate * (2 * err * movie_mat[:, col])
                    movie_mat[:, col] += learning_rate * (2 * err * user_mat[row, :])

            # print results
            print("%d \t\t %f" % (epoch + 1, total_sq_err / self.num_ratings))

        self.user_mat = user_mat
        self.movie_mat = movie_mat
        self.ranked_movies = rf.create_ranked_df(self.movies, self.train_df)
    def fit(self,
            movies_path,
            reviews_path,
            latent_features=15,
            learning_rate=0.001,
            iters=50):
        """
        Fit the recommender with both the FunkSVD and the knowledge-based approach.

        Args:
            movies_path: Path of the CSV file with the movies data
            reviews_path: Path of the CSV file with the reviews (ratings) data; the columns 'user_id',
            'movie_id', 'rating' and 'timestamp' are read from it
            latent_features: Number of latent features used by FunkSVD
            learning_rate: Learning rate of the FunkSVD gradient steps
            iters: Number of FunkSVD passes over the rating matrix

        Returns:
            None - stores the following attributes
            movies - DataFrame read from movies_path
            reviews - DataFrame read from reviews_path
            train_data_df - DataFrame of ratings with user_id as index and movie_id as columns
            train_data_np - (np array) train_data_df as an array, nan where no rating exists
            n_users - number of rows of train_data_np (int)
            n_movies - number of columns of train_data_np (int)
            num_ratings - number of non-nan cells of train_data_np (int)
            latent_features, learning_rate, iters - the hyperparameters given above
            user_mat - Matrix with number of users (rows) and latent features (columns)
            movie_mat - Matrix with latent features (rows) and number of movies (columns)
            ranked_movies - result of rf.create_ranked_df; documented by the original author as movies
            sorted by highest avg rating, more reviews, then time, with more than 4 ratings
        """

        # Load the raw data
        self.movies = pd.read_csv(movies_path)
        self.reviews = pd.read_csv(reviews_path)

        # User-by-item table; several ratings of one movie by one user collapse to the maximum
        review_cols = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
        self.train_data_df = (
            review_cols
            .groupby(['user_id', 'movie_id'])['rating']
            .max()
            .unstack()
        )
        self.train_data_np = np.array(self.train_data_df)

        # Matrix dimensions and number of observed ratings
        self.n_users = self.train_data_np.shape[0]
        self.n_movies = self.train_data_np.shape[1]
        self.num_ratings = np.count_nonzero(~np.isnan(self.train_data_np))

        # Keep the hyperparameters
        self.latent_features = latent_features
        self.learning_rate = learning_rate
        self.iters = iters

        # Random starting factors
        user_mat = np.random.rand(self.n_users, self.latent_features)
        movie_mat = np.random.rand(self.latent_features, self.n_movies)

        # Progress table header
        print("Optimization Statistics")
        print("Iterations | Mean Squared Error ")

        for step in range(iters):

            # squared error summed over this pass
            sq_err_total = 0

            for u in range(self.n_users):
                observed_row = self.train_data_np[u]

                for m in range(self.n_movies):
                    actual = observed_row[m]

                    # missing (nan) and non-positive ratings are not trained on
                    if not actual > 0:
                        continue

                    # prediction error for this user-movie pair
                    predicted = np.dot(user_mat[u, :], movie_mat[:, m])
                    diff = actual - predicted
                    sq_err_total += diff**2

                    # gradient step for the user, then for the movie using the
                    # already-updated user factors
                    user_gradient = 2 * diff * movie_mat[:, m]
                    user_mat[u, :] += learning_rate * user_gradient
                    movie_gradient = 2 * diff * user_mat[u, :]
                    movie_mat[:, m] += learning_rate * movie_gradient

            # print results
            print("%d \t\t %f" % (step + 1, sq_err_total / self.num_ratings))

        # SVD approach:
        self.user_mat = user_mat
        self.movie_mat = movie_mat

        # Knowledge-based approach:
        self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
Exemple #5
0
    def fit(self, reviews_pth, movies_pth, latent_features=5, iters=100, learning_rate=0.001):
        '''
        Fit the recommender to the dataset and keep the results on the instance
        so they can be used when predictions are needed.

        Two fits are performed: matrix factorization of the user-item matrix
        with FunkSVD (no regularization), and the knowledge based ranking
        produced by rf.create_ranked_df.

        INPUT:
        reviews_pth - path to csv with at least the four columns: 'user_id', 'movie_id', 'rating', 'timestamp'
        movies_pth - path to the movies csv
        latent_features - (int) the number of latent features used
        iters - (int) the number of iterations
        learning_rate - (float) the learning rate

        OUTPUT:
        None - stores the following attributes:
        reviews - dataframe read from reviews_pth
        movies - dataframe read from movies_pth
        user_item_df - dataframe of ratings with user_id as index and movie_id as columns
        user_item_mat - (np array) user_item_df as an array, with nans where no rating exists
        latent_features, learning_rate, iters - the hyperparameters given above
        n_users - the number of users (int), rows of user_item_mat
        n_movies - the number of movies (int), columns of user_item_mat
        num_ratings - the number of non-nan cells of user_item_mat (int)
        user_ids_series - (np array) user ids in row order
        movie_ids_series - (np array) movie ids in column order
        user_mat - (np array) n_users x latent_features factor matrix
        movie_mat - (np array) latent_features x n_movies factor matrix
        ranked_movies - the result of rf.create_ranked_df(movies, reviews)
        '''
        # Store inputs as attributes
        self.reviews = pd.read_csv(reviews_pth)
        self.movies = pd.read_csv(movies_pth)

        # Create user-item matrix; duplicate (user, movie) reviews keep the highest rating
        usr_itm = self.reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
        self.user_item_df = usr_itm.groupby(['user_id', 'movie_id'])['rating'].max().unstack()
        self.user_item_mat = np.array(self.user_item_df)

        # Store more inputs
        self.latent_features = latent_features
        self.learning_rate = learning_rate
        self.iters = iters

        # Set up useful values to be used throughout the rest of the function.
        # Rows are users and columns are movies, so n_movies is shape[1]
        # (the original read shape[0] for both).
        self.n_users = self.user_item_mat.shape[0]
        self.n_movies = self.user_item_mat.shape[1]
        self.num_ratings = np.count_nonzero(~np.isnan(self.user_item_mat))
        self.user_ids_series = np.array(self.user_item_df.index)
        self.movie_ids_series = np.array(self.user_item_df.columns)

        # Initialize the user and movie matrices with random values
        user_mat = np.random.rand(self.n_users, self.latent_features)
        movie_mat = np.random.rand(self.latent_features, self.n_movies)

        # Keeping track of the iteration and MSE
        print('Optimization Statistics')
        print('Iterations | Mean Squared Error')

        # For each iteration
        for iteration in range(self.iters):

            # Restart the sum of squared errors for this iteration
            sse_accum = 0

            # For each user-movie pair
            for i in range(self.n_users):
                for j in range(self.n_movies):

                    # If the rating exists (nan > 0 is False, so missing ratings are skipped)
                    if self.user_item_mat[i, j] > 0:

                        # compute the error as the actual minus the dot product of the user and movie latent features
                        diff = self.user_item_mat[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])

                        # Keep track of the sum of squared errors for the matrix
                        sse_accum += diff**2

                        # update the values in each matrix in the direction of the gradient
                        for k in range(self.latent_features):
                            user_mat[i, k] += self.learning_rate * (2*diff*movie_mat[k, j])
                            movie_mat[k, j] += self.learning_rate * (2*diff*user_mat[i, k])

            # Print results once per iteration
            print("%d \t\t %f" % (iteration+1, sse_accum / self.num_ratings))

        # SVD based fit
        # Keep user_mat and movie_mat for safe keeping
        self.user_mat = user_mat
        self.movie_mat = movie_mat

        # Knowledge based fit
        self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
    def fit(self,
            reviews_path,
            movies_path,
            latent_features=15,
            learning_rate=0.0001,
            iterations=250):
        """
        Fit the recommender: factorize the user-item matrix with a basic form of
        FunkSVD (no regularization) and build the knowledge based ranking.

        INPUT:
            reviews_path - (string) path to the reviews csv; the columns 'user_id',
                           'movie_id', 'rating' and 'timestamp' are read from it
            movies_path - (string) path to the movies csv
            latent_features - (int) the number of latent features used
            learning_rate - (float) the learning rate
            iterations - (int) the number of iterations

        OUTPUT:
            None - the following attributes are stored:
            reviews, movies - the two dataframes
            latent_features, learning_rate, iterations - the hyperparameters
            user_item_df - dataframe of ratings, user_id as index and movie_id as columns
            user_item_matrix - (np array) user_item_df as an array, nan where no rating exists
            amount_users, amount_movies - the dimensions of user_item_matrix
            amount_ratings - the number of non-nan cells of user_item_matrix
            user_ids_series, movie_ids_series - (np arrays) ids in row / column order
            user_matrix - (np array) amount_users x latent_features factors
            movie_matrix - (np array) latent_features x amount_movies factors
            ranked_movies - the result of rf.create_ranked_df(movies, reviews)
        """

        self.reviews = pd.read_csv(reviews_path)
        self.movies = pd.read_csv(movies_path)
        self.latent_features = latent_features
        self.learning_rate = learning_rate
        self.iterations = iterations

        # user x movie table for collaborative filtering; repeated reviews of a
        # movie by the same user are reduced to the highest rating
        review_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
        pair_ratings = self.reviews[review_columns].groupby(
            ['user_id', 'movie_id'])['rating']
        self.user_item_df = pair_ratings.max().unstack()
        self.user_item_matrix = np.array(self.user_item_df)

        self.amount_users, self.amount_movies = self.user_item_matrix.shape
        self.amount_ratings = np.count_nonzero(
            ~np.isnan(self.user_item_matrix))
        self.user_ids_series = np.array(self.user_item_df.index)
        self.movie_ids_series = np.array(self.user_item_df.columns)

        # random starting factors for FunkSVD
        user_matrix = np.random.rand(self.amount_users, self.latent_features)
        movie_matrix = np.random.rand(self.latent_features, self.amount_movies)

        def _sgd_step(row, col, rating):
            """Apply one gradient step for a single rating and return its squared error."""
            error = rating - np.dot(user_matrix[row, :], movie_matrix[:, col])
            # the movie factors are updated from the already-updated user factors
            user_matrix[row, :] += self.learning_rate * (
                2 * error * movie_matrix[:, col])
            movie_matrix[:, col] += self.learning_rate * (
                2 * error * user_matrix[row, :])
            return error**2

        # progress table header
        print("Optimizaiton Statistics")
        print("Iterations | Mean Squared Error ")

        for iteration in range(self.iterations):

            # squared error summed over this pass
            epoch_sse = 0

            for row in range(self.amount_users):
                for col in range(self.amount_movies):
                    rating = self.user_item_matrix[row, col]

                    # nan fails this test, so only existing ratings are used
                    if rating > 0:
                        epoch_sse += _sgd_step(row, col, rating)

            print("%d \t\t %f" %
                  (iteration + 1, epoch_sse / self.amount_ratings))

        # SVD based fit
        self.user_matrix = user_matrix
        self.movie_matrix = movie_matrix

        # Knowledge based fit
        self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)
Exemple #7
0
    def fit(self, movies_pth, reviews_pth, max_reviews=60000):
        '''
        This function performs matrix factorization using a basic form of FunkSVD with no regularization,
        then builds the knowledge based ranking.

        The hyperparameters are NOT arguments of this method: `latent_features`, `learning_rate` and
        `iters` are read from attributes of the same name, which must already be set on the instance
        (presumably by the constructor - confirm against the class).

        Params:
        --------
            movies_pth : path to csv with each movie and movie information in each row
            reviews_pth : path to csv with at least the three columns: 'user_id', 'movie_id', 'rating'
            max_reviews : (int or None) only the first `max_reviews` rows of the reviews csv are kept;
                          None keeps every row. Defaults to 60000, the previously hard-coded limit.

        Returns:
        --------

            None
            Stores the following as attributes:
            reviews : dataframe with the (possibly truncated) reviews
            movies : dataframe read from movies_pth
            user_item_df : dataframe of ratings with user_id as index and movie_id as columns
            user_item_mat : (np array) a user by item numpy array with ratings and nans for values
            n_users : the number of rows of user_item_mat (int)
            n_movies : the number of columns of user_item_mat (int)
            num_ratings : the number of non-nan cells of user_item_mat (int)
            user_mat : (np array) n_users x latent_features factor matrix
            movie_mat : (np array) latent_features x n_movies factor matrix
            ranked_movies : the result of rf.create_ranked_df(movies, reviews)

        Raises:
        --------
            AttributeError : if latent_features, learning_rate or iters is not set on the instance
        '''
        # Read the hyperparameters once, up front, so a missing attribute is
        # reported before any file is loaded
        latent_features = self.latent_features
        learning_rate = self.learning_rate
        iters = self.iters

        # Store inputs as attributes
        reviews = pd.read_csv(reviews_pth)
        if max_reviews is not None:
            reviews = reviews[:max_reviews]
        self.reviews = reviews
        self.movies = pd.read_csv(movies_pth)

        # Create user-item matrix; duplicate (user, movie) reviews keep the highest rating
        usr_itm = self.reviews[['user_id', 'movie_id', 'rating']]
        self.user_item_df = usr_itm.groupby(['user_id','movie_id'])['rating'].max().unstack()
        self.user_item_mat = self.user_item_df.values
        # Release the intermediate frame before the training loop
        del usr_itm
        gc.collect()

        # Set up useful values to be used through the rest of the function
        self.n_users = self.user_item_mat.shape[0]
        self.n_movies = self.user_item_mat.shape[1]
        self.num_ratings = np.count_nonzero(~np.isnan(self.user_item_mat))

        # TODO: get index of user ids
        # self.user_ids_series = np.array(self.user_item_df.index)
        # self.movie_ids_series = np.array(self.user_item_df.columns)

        # initialize the user and movie matrices with random values
        user_mat = np.random.rand(self.n_users, latent_features)
        movie_mat = np.random.rand(latent_features, self.n_movies)

        # keep track of iteration and MSE
        print("Optimization Statistics")
        print("Iterations | Mean Squared Error ")

        start_time = time.perf_counter()
        # for each iteration
        for iteration in range(iters):

            # sum of squared errors of this iteration
            sse_accum = 0

            # For each user-movie pair
            for i in range(self.n_users):
                for j in range(self.n_movies):

                    # if the rating exists (nan > 0 is False, so missing ratings are skipped)
                    if self.user_item_mat[i, j] > 0:

                        # compute the error as the actual minus the dot product of the user and movie latent features
                        diff = self.user_item_mat[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])

                        # Keep track of the sum of squared errors for the matrix
                        sse_accum += diff**2

                        # update the values in each matrix in the direction of the gradient
                        for k in range(latent_features):
                            user_mat[i, k] += learning_rate * (2 * diff * movie_mat[k, j])
                            movie_mat[k, j] += learning_rate * (2 * diff * user_mat[i, k])

            # print results
            print("%d \t\t %f" % (iteration+1, sse_accum / self.num_ratings))

        print('Update time:', time.perf_counter() - start_time)

        # SVD based fit
        # Keep user_mat and movie_mat for safe keeping
        self.user_mat = user_mat
        self.movie_mat = movie_mat

        # Knowledge based fit
        self.ranked_movies = rf.create_ranked_df(self.movies, self.reviews)